From 04d417a7ca8ef694658b26fb697a035717414731 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 13 Oct 2016 11:12:30 -0700 Subject: [PATCH 001/162] [SPARK-17830][SQL] Annotate remaining SQL APIs with InterfaceStability ## What changes were proposed in this pull request? This patch annotates all the remaining APIs in SQL (excluding streaming) with InterfaceStability. ## How was this patch tested? N/A - just annotation change. Author: Reynold Xin Closes #15457 from rxin/SPARK-17830-2. --- .../java/org/apache/spark/sql/SaveMode.java | 3 +++ .../org/apache/spark/sql/api/java/UDF1.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF10.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF11.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF12.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF13.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF14.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF15.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF16.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF17.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF18.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF19.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF2.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF20.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF21.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF22.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF3.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF4.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF5.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF6.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF7.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF8.java | 8 +++--- .../org/apache/spark/sql/api/java/UDF9.java | 8 +++--- .../spark/sql/expressions/javalang/typed.java | 2 ++ .../apache/spark/sql/catalog/Catalog.scala | 9 ++++++- .../apache/spark/sql/catalog/interface.scala | 5 ++++ .../spark/sql/expressions/Aggregator.scala | 3 ++- .../sql/expressions/UserDefinedFunction.scala | 3 ++- .../apache/spark/sql/expressions/Window.scala | 4 ++- .../spark/sql/expressions/WindowSpec.scala | 7 ++--- .../sql/expressions/scalalang/typed.scala | 3 ++- .../apache/spark/sql/expressions/udaf.scala | 8 +++++- .../apache/spark/sql/jdbc/JdbcDialects.scala | 5 +++- .../apache/spark/sql/sources/filters.scala | 18 +++++++++++++ .../apache/spark/sql/sources/interfaces.scala | 26 +++++++++++++++++-- 35 files changed, 150 insertions(+), 122 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/SaveMode.java b/sql/core/src/main/java/org/apache/spark/sql/SaveMode.java index 9665c3c46f901..1c3c9794fb6bb 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/SaveMode.java +++ b/sql/core/src/main/java/org/apache/spark/sql/SaveMode.java @@ -16,11 +16,14 @@ */ package org.apache.spark.sql; +import org.apache.spark.annotation.InterfaceStability; + /** * SaveMode is used to specify the expected behavior of saving a DataFrame to a data source. 
* * @since 1.3.0 */ +@InterfaceStability.Stable public enum SaveMode { /** * Append mode means that when saving a DataFrame to a data source, if data/table already exists, diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF1.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF1.java index ef959e35e1027..1460daf27dc20 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF1.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF1.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 1 arguments. */ +@InterfaceStability.Stable public interface UDF1 extends Serializable { - public R call(T1 t1) throws Exception; + R call(T1 t1) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF10.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF10.java index 96ab3a96c3d5e..7c4f1e4897084 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF10.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF10.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 10 arguments. */ +@InterfaceStability.Stable public interface UDF10 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF11.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF11.java index 58ae8edd6d817..26a05106aebd6 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF11.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF11.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 11 arguments. 
*/ +@InterfaceStability.Stable public interface UDF11 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF12.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF12.java index d9da0f6eddd94..8ef7a99042025 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF12.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF12.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 12 arguments. */ +@InterfaceStability.Stable public interface UDF12 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF13.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF13.java index 095fc1a8076b5..5c3b2ec1222e2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF13.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF13.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 13 arguments. */ +@InterfaceStability.Stable public interface UDF13 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF14.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF14.java index eb27eaa180086..97e744d843466 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF14.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF14.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 14 arguments. 
*/ +@InterfaceStability.Stable public interface UDF14 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF15.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF15.java index 1fbcff56332b6..7ddbf914fc11a 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF15.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF15.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 15 arguments. */ +@InterfaceStability.Stable public interface UDF15 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF16.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF16.java index 1133561787a69..0ae5dc7195ad6 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF16.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF16.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 16 arguments. */ +@InterfaceStability.Stable public interface UDF16 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF17.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF17.java index dfae7922c9b63..03543a556c614 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF17.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF17.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 17 arguments. 
*/ +@InterfaceStability.Stable public interface UDF17 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF18.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF18.java index e9d1c6d52d4ea..46740d3443916 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF18.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF18.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 18 arguments. */ +@InterfaceStability.Stable public interface UDF18 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF19.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF19.java index 46b9d2d3c9457..33fefd8ecaf1d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF19.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF19.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 19 arguments. */ +@InterfaceStability.Stable public interface UDF19 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18, T19 t19) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18, T19 t19) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF2.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF2.java index cd3fde8da419e..9822f19217d76 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF2.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF2.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 2 arguments. 
*/ +@InterfaceStability.Stable public interface UDF2 extends Serializable { - public R call(T1 t1, T2 t2) throws Exception; + R call(T1 t1, T2 t2) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF20.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF20.java index 113d3d26be4a7..8c5e90182da1c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF20.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF20.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 20 arguments. */ +@InterfaceStability.Stable public interface UDF20 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18, T19 t19, T20 t20) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18, T19 t19, T20 t20) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF21.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF21.java index 74118f2cf8da7..e3b09f5167cff 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF21.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF21.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 21 arguments. */ +@InterfaceStability.Stable public interface UDF21 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18, T19 t19, T20 t20, T21 t21) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18, T19 t19, T20 t20, T21 t21) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF22.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF22.java index 0e7cc40be45ec..dc6cfa9097bab 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF22.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF22.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 22 arguments. 
*/ +@InterfaceStability.Stable public interface UDF22 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18, T19 t19, T20 t20, T21 t21, T22 t22) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18, T19 t19, T20 t20, T21 t21, T22 t22) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF3.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF3.java index 6a880f16be47a..7c264b69ba195 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF3.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF3.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 3 arguments. */ +@InterfaceStability.Stable public interface UDF3 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3) throws Exception; + R call(T1 t1, T2 t2, T3 t3) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF4.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF4.java index fcad2febb18e6..58df38fc3c911 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF4.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF4.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 4 arguments. */ +@InterfaceStability.Stable public interface UDF4 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF5.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF5.java index ce0cef43a2144..4146f96e2eed5 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF5.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF5.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 5 arguments. 
*/ +@InterfaceStability.Stable public interface UDF5 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF6.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF6.java index f56b806684e61..25d39654c1095 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF6.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF6.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 6 arguments. */ +@InterfaceStability.Stable public interface UDF6 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF7.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF7.java index 25bd6d3241bd4..ce63b6a91adbb 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF7.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF7.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 7 arguments. */ +@InterfaceStability.Stable public interface UDF7 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF8.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF8.java index a3b7ac5f94ce7..0e00209ef6b9f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF8.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF8.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 8 arguments. 
*/ +@InterfaceStability.Stable public interface UDF8 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF9.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF9.java index 205e72a1522fc..077981bb3e3ee 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF9.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF9.java @@ -19,14 +19,12 @@ import java.io.Serializable; -// ************************************************** -// THIS FILE IS AUTOGENERATED BY CODE IN -// org.apache.spark.sql.api.java.FunctionRegistration -// ************************************************** +import org.apache.spark.annotation.InterfaceStability; /** * A Spark SQL UDF that has 9 arguments. */ +@InterfaceStability.Stable public interface UDF9 extends Serializable { - public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9) throws Exception; + R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9) throws Exception; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/expressions/javalang/typed.java b/sql/core/src/main/java/org/apache/spark/sql/expressions/javalang/typed.java index 247e94b86c349..ec9c107b1c119 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/expressions/javalang/typed.java +++ b/sql/core/src/main/java/org/apache/spark/sql/expressions/javalang/typed.java @@ -18,6 +18,7 @@ package org.apache.spark.sql.expressions.javalang; import org.apache.spark.annotation.Experimental; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.TypedColumn; import org.apache.spark.sql.execution.aggregate.TypedAverage; @@ -34,6 +35,7 @@ * @since 2.0.0 */ @Experimental +@InterfaceStability.Evolving public class typed { // Note: make sure to keep in sync with typed.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala index 18cba8ce28b4d..889b8a02784d6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalog -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset} import org.apache.spark.sql.types.StructType @@ -27,6 +27,7 @@ import org.apache.spark.sql.types.StructType * * @since 2.0.0 */ +@InterfaceStability.Stable abstract class Catalog { /** @@ -193,6 +194,7 @@ abstract class Catalog { * @since 2.0.0 */ @Experimental + @InterfaceStability.Evolving def createExternalTable(tableName: String, path: String): DataFrame /** @@ -203,6 +205,7 @@ abstract class Catalog { * @since 2.0.0 */ @Experimental + @InterfaceStability.Evolving def createExternalTable(tableName: String, path: String, source: String): DataFrame /** @@ -213,6 +216,7 @@ abstract class Catalog { * @since 2.0.0 */ @Experimental + @InterfaceStability.Evolving def createExternalTable( tableName: String, source: String, @@ -227,6 +231,7 @@ abstract class Catalog { * @since 2.0.0 */ @Experimental + @InterfaceStability.Evolving def createExternalTable( tableName: String, source: String, @@ -240,6 +245,7 @@ 
abstract class Catalog { * @since 2.0.0 */ @Experimental + @InterfaceStability.Evolving def createExternalTable( tableName: String, source: String, @@ -255,6 +261,7 @@ abstract class Catalog { * @since 2.0.0 */ @Experimental + @InterfaceStability.Evolving def createExternalTable( tableName: String, source: String, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala index 33032f07f7bea..c0c5ebc2ba2d6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalog import javax.annotation.Nullable +import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.DefinedByConstructorParams @@ -33,6 +34,7 @@ import org.apache.spark.sql.catalyst.DefinedByConstructorParams * @param locationUri path (in the form of a uri) to data files. * @since 2.0.0 */ +@InterfaceStability.Stable class Database( val name: String, @Nullable val description: String, @@ -59,6 +61,7 @@ class Database( * @param isTemporary whether the table is a temporary table. * @since 2.0.0 */ +@InterfaceStability.Stable class Table( val name: String, @Nullable val database: String, @@ -90,6 +93,7 @@ class Table( * @param isBucket whether the column is a bucket column. * @since 2.0.0 */ +@InterfaceStability.Stable class Column( val name: String, @Nullable val description: String, @@ -122,6 +126,7 @@ class Column( * @param isTemporary whether the function is a temporary function or not. * @since 2.0.0 */ +@InterfaceStability.Stable class Function( val name: String, @Nullable val database: String, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala index 51179a528c503..eea98414003ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.expressions -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.{Dataset, Encoder, TypedColumn} import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} @@ -51,6 +51,7 @@ import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression * @since 1.6.0 */ @Experimental +@InterfaceStability.Evolving abstract class Aggregator[-IN, BUF, OUT] extends Serializable { /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index 49fdec57558e8..2e0e937e4aff7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.expressions -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.catalyst.expressions.ScalaUDF import org.apache.spark.sql.Column import org.apache.spark.sql.functions @@ -40,6 +40,7 @@ import org.apache.spark.sql.types.DataType * @since 1.3.0 */ @Experimental 
+@InterfaceStability.Evolving case class UserDefinedFunction protected[sql] ( f: AnyRef, dataType: DataType, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala index 3c1f6e897ea62..07ef60183f6fb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.expressions -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.expressions._ @@ -37,6 +37,7 @@ import org.apache.spark.sql.catalyst.expressions._ * @since 1.4.0 */ @Experimental +@InterfaceStability.Evolving object Window { /** @@ -177,4 +178,5 @@ object Window { * @since 1.4.0 */ @Experimental +@InterfaceStability.Evolving class Window private() // So we can see Window in JavaDoc. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala index 8ebed399bf2d0..18778c8d1c294 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala @@ -17,8 +17,8 @@ package org.apache.spark.sql.expressions -import org.apache.spark.annotation.Experimental -import org.apache.spark.sql.{catalyst, Column} +import org.apache.spark.annotation.{Experimental, InterfaceStability} +import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.expressions._ /** @@ -30,10 +30,11 @@ import org.apache.spark.sql.catalyst.expressions._ * @since 1.4.0 */ @Experimental +@InterfaceStability.Evolving class WindowSpec private[sql]( partitionSpec: Seq[Expression], orderSpec: Seq[SortOrder], - frame: catalyst.expressions.WindowFrame) { + frame: WindowFrame) { /** * Defines the partitioning columns in a [[WindowSpec]]. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala index 60d7b7d0894d0..aa71cb9e3bc85 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.expressions.scalalang -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql._ import org.apache.spark.sql.execution.aggregate._ @@ -30,6 +30,7 @@ import org.apache.spark.sql.execution.aggregate._ * @since 2.0.0 */ @Experimental +@InterfaceStability.Evolving // scalastyle:off object typed { // scalastyle:on diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala index 5417a0e481158..ef7c09c72b82d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.expressions -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} import org.apache.spark.sql.execution.aggregate.ScalaUDAF @@ -26,8 +26,11 @@ import org.apache.spark.sql.types._ /** * :: Experimental :: * The base class for implementing user-defined aggregate functions (UDAF). + * + * @since 1.5.0 */ @Experimental +@InterfaceStability.Evolving abstract class UserDefinedAggregateFunction extends Serializable { /** @@ -136,8 +139,11 @@ abstract class UserDefinedAggregateFunction extends Serializable { * A [[Row]] representing a mutable aggregation buffer. * * This is not meant to be extended outside of Spark. + * + * @since 1.5.0 */ @Experimental +@InterfaceStability.Evolving abstract class MutableAggregationBuffer extends Row { /** Update the ith value of this buffer. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index 8dd4b8f662713..dec316be7aea1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.jdbc import java.sql.Connection -import org.apache.spark.annotation.{DeveloperApi, Since} +import org.apache.spark.annotation.{DeveloperApi, InterfaceStability, Since} import org.apache.spark.sql.types._ /** @@ -31,6 +31,7 @@ import org.apache.spark.sql.types._ * send a null value to the database. */ @DeveloperApi +@InterfaceStability.Evolving case class JdbcType(databaseTypeDefinition : String, jdbcNullType : Int) /** @@ -53,6 +54,7 @@ case class JdbcType(databaseTypeDefinition : String, jdbcNullType : Int) * for the given Catalyst type. */ @DeveloperApi +@InterfaceStability.Evolving abstract class JdbcDialect extends Serializable { /** * Check if this dialect instance can handle a certain jdbc url. @@ -142,6 +144,7 @@ abstract class JdbcDialect extends Serializable { * sure to register your dialects first. 
*/ @DeveloperApi +@InterfaceStability.Evolving object JdbcDialects { /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala index 13c0766219a8e..e0494dfd9343b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.sources +import org.apache.spark.annotation.InterfaceStability + //////////////////////////////////////////////////////////////////////////////////////////////////// // This file defines all the filters that we can push down to the data sources. //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -26,6 +28,7 @@ package org.apache.spark.sql.sources * * @since 1.3.0 */ +@InterfaceStability.Stable abstract class Filter { /** * List of columns that are referenced by this filter. @@ -45,6 +48,7 @@ abstract class Filter { * * @since 1.3.0 */ +@InterfaceStability.Stable case class EqualTo(attribute: String, value: Any) extends Filter { override def references: Array[String] = Array(attribute) ++ findReferences(value) } @@ -56,6 +60,7 @@ case class EqualTo(attribute: String, value: Any) extends Filter { * * @since 1.5.0 */ +@InterfaceStability.Stable case class EqualNullSafe(attribute: String, value: Any) extends Filter { override def references: Array[String] = Array(attribute) ++ findReferences(value) } @@ -66,6 +71,7 @@ case class EqualNullSafe(attribute: String, value: Any) extends Filter { * * @since 1.3.0 */ +@InterfaceStability.Stable case class GreaterThan(attribute: String, value: Any) extends Filter { override def references: Array[String] = Array(attribute) ++ findReferences(value) } @@ -76,6 +82,7 @@ case class GreaterThan(attribute: String, value: Any) extends Filter { * * @since 1.3.0 */ +@InterfaceStability.Stable case class GreaterThanOrEqual(attribute: String, value: Any) extends Filter { override def references: Array[String] = Array(attribute) ++ findReferences(value) } @@ -86,6 +93,7 @@ case class GreaterThanOrEqual(attribute: String, value: Any) extends Filter { * * @since 1.3.0 */ +@InterfaceStability.Stable case class LessThan(attribute: String, value: Any) extends Filter { override def references: Array[String] = Array(attribute) ++ findReferences(value) } @@ -96,6 +104,7 @@ case class LessThan(attribute: String, value: Any) extends Filter { * * @since 1.3.0 */ +@InterfaceStability.Stable case class LessThanOrEqual(attribute: String, value: Any) extends Filter { override def references: Array[String] = Array(attribute) ++ findReferences(value) } @@ -105,6 +114,7 @@ case class LessThanOrEqual(attribute: String, value: Any) extends Filter { * * @since 1.3.0 */ +@InterfaceStability.Stable case class In(attribute: String, values: Array[Any]) extends Filter { override def hashCode(): Int = { var h = attribute.hashCode @@ -131,6 +141,7 @@ case class In(attribute: String, values: Array[Any]) extends Filter { * * @since 1.3.0 */ +@InterfaceStability.Stable case class IsNull(attribute: String) extends Filter { override def references: Array[String] = Array(attribute) } @@ -140,6 +151,7 @@ case class IsNull(attribute: String) extends Filter { * * @since 1.3.0 */ +@InterfaceStability.Stable case class IsNotNull(attribute: String) extends Filter { override def references: Array[String] = Array(attribute) } @@ -149,6 +161,7 @@ case class IsNotNull(attribute: String) extends Filter { * * 
@since 1.3.0 */ +@InterfaceStability.Stable case class And(left: Filter, right: Filter) extends Filter { override def references: Array[String] = left.references ++ right.references } @@ -158,6 +171,7 @@ case class And(left: Filter, right: Filter) extends Filter { * * @since 1.3.0 */ +@InterfaceStability.Stable case class Or(left: Filter, right: Filter) extends Filter { override def references: Array[String] = left.references ++ right.references } @@ -167,6 +181,7 @@ case class Or(left: Filter, right: Filter) extends Filter { * * @since 1.3.0 */ +@InterfaceStability.Stable case class Not(child: Filter) extends Filter { override def references: Array[String] = child.references } @@ -177,6 +192,7 @@ case class Not(child: Filter) extends Filter { * * @since 1.3.1 */ +@InterfaceStability.Stable case class StringStartsWith(attribute: String, value: String) extends Filter { override def references: Array[String] = Array(attribute) } @@ -187,6 +203,7 @@ case class StringStartsWith(attribute: String, value: String) extends Filter { * * @since 1.3.1 */ +@InterfaceStability.Stable case class StringEndsWith(attribute: String, value: String) extends Filter { override def references: Array[String] = Array(attribute) } @@ -197,6 +214,7 @@ case class StringEndsWith(attribute: String, value: String) extends Filter { * * @since 1.3.1 */ +@InterfaceStability.Stable case class StringContains(attribute: String, value: String) extends Filter { override def references: Array[String] = Array(attribute) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index 6484c782b5d15..3172d5ded9504 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.sources -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, InterfaceStability} import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow @@ -37,6 +37,7 @@ import org.apache.spark.sql.types.StructType * @since 1.5.0 */ @DeveloperApi +@InterfaceStability.Evolving trait DataSourceRegister { /** @@ -68,6 +69,7 @@ trait DataSourceRegister { * @since 1.3.0 */ @DeveloperApi +@InterfaceStability.Evolving trait RelationProvider { /** * Returns a new base relation with the given parameters. @@ -99,6 +101,7 @@ trait RelationProvider { * @since 1.3.0 */ @DeveloperApi +@InterfaceStability.Evolving trait SchemaRelationProvider { /** * Returns a new base relation with the given parameters and user defined schema. @@ -114,17 +117,26 @@ trait SchemaRelationProvider { /** * ::Experimental:: * Implemented by objects that can produce a streaming [[Source]] for a specific format or system. + * + * @since 2.0.0 */ @Experimental +@InterfaceStability.Unstable trait StreamSourceProvider { - /** Returns the name and schema of the source that can be used to continually read data. */ + /** + * Returns the name and schema of the source that can be used to continually read data. 
+ * @since 2.0.0 + */ def sourceSchema( sqlContext: SQLContext, schema: Option[StructType], providerName: String, parameters: Map[String, String]): (String, StructType) + /** + * @since 2.0.0 + */ def createSource( sqlContext: SQLContext, metadataPath: String, @@ -136,8 +148,11 @@ trait StreamSourceProvider { /** * ::Experimental:: * Implemented by objects that can produce a streaming [[Sink]] for a specific format or system. + * + * @since 2.0.0 */ @Experimental +@InterfaceStability.Unstable trait StreamSinkProvider { def createSink( sqlContext: SQLContext, @@ -150,6 +165,7 @@ trait StreamSinkProvider { * @since 1.3.0 */ @DeveloperApi +@InterfaceStability.Evolving trait CreatableRelationProvider { /** * Save the DataFrame to the destination and return a relation with the given parameters based on @@ -186,6 +202,7 @@ trait CreatableRelationProvider { * @since 1.3.0 */ @DeveloperApi +@InterfaceStability.Evolving abstract class BaseRelation { def sqlContext: SQLContext def schema: StructType @@ -237,6 +254,7 @@ abstract class BaseRelation { * @since 1.3.0 */ @DeveloperApi +@InterfaceStability.Evolving trait TableScan { def buildScan(): RDD[Row] } @@ -249,6 +267,7 @@ trait TableScan { * @since 1.3.0 */ @DeveloperApi +@InterfaceStability.Evolving trait PrunedScan { def buildScan(requiredColumns: Array[String]): RDD[Row] } @@ -268,6 +287,7 @@ trait PrunedScan { * @since 1.3.0 */ @DeveloperApi +@InterfaceStability.Evolving trait PrunedFilteredScan { def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] } @@ -291,6 +311,7 @@ trait PrunedFilteredScan { * @since 1.3.0 */ @DeveloperApi +@InterfaceStability.Evolving trait InsertableRelation { def insert(data: DataFrame, overwrite: Boolean): Unit } @@ -306,6 +327,7 @@ trait InsertableRelation { * @since 1.3.0 */ @Experimental +@InterfaceStability.Unstable trait CatalystScan { def buildScan(requiredColumns: Seq[Attribute], filters: Seq[Expression]): RDD[Row] } From 84f149e414475c2e60863898992001c21cfc13b2 Mon Sep 17 00:00:00 2001 From: Pete Robbins Date: Thu, 13 Oct 2016 11:26:30 -0700 Subject: [PATCH 002/162] [SPARK-17827][SQL] maxColLength type should be Int for String and Binary ## What changes were proposed in this pull request? correct the expected type from Length function to be Int ## How was this patch tested? Test runs on little endian and big endian platforms Author: Pete Robbins Closes #15464 from robbinspg/SPARK-17827. --- .../spark/sql/catalyst/plans/logical/Statistics.scala | 4 ++-- .../org/apache/spark/sql/StatisticsColumnSuite.scala | 8 ++++---- .../scala/org/apache/spark/sql/hive/StatisticsSuite.scala | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala index 43455c989c0f4..f3e2147b8f974 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala @@ -98,7 +98,7 @@ case class StringColumnStat(statRow: InternalRow) { // The indices here must be consistent with `ColumnStatStruct.stringColumnStat`. 
val numNulls: Long = statRow.getLong(0) val avgColLen: Double = statRow.getDouble(1) - val maxColLen: Long = statRow.getLong(2) + val maxColLen: Long = statRow.getInt(2) val ndv: Long = statRow.getLong(3) } @@ -106,7 +106,7 @@ case class BinaryColumnStat(statRow: InternalRow) { // The indices here must be consistent with `ColumnStatStruct.binaryColumnStat`. val numNulls: Long = statRow.getLong(0) val avgColLen: Double = statRow.getDouble(1) - val maxColLen: Long = statRow.getLong(2) + val maxColLen: Long = statRow.getInt(2) } case class BooleanColumnStat(statRow: InternalRow) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsColumnSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsColumnSuite.scala index 0ee0547c45591..f1a201abd8da6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsColumnSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsColumnSuite.scala @@ -150,7 +150,7 @@ class StatisticsColumnSuite extends StatisticsTest { val colStat = ColumnStat(InternalRow( values.count(_.isEmpty).toLong, nonNullValues.map(_.length).sum / nonNullValues.length.toDouble, - nonNullValues.map(_.length).max.toLong, + nonNullValues.map(_.length).max.toInt, nonNullValues.distinct.length.toLong)) (f, colStat) } @@ -165,7 +165,7 @@ class StatisticsColumnSuite extends StatisticsTest { val colStat = ColumnStat(InternalRow( values.count(_.isEmpty).toLong, nonNullValues.map(_.length).sum / nonNullValues.length.toDouble, - nonNullValues.map(_.length).max.toLong)) + nonNullValues.map(_.length).max.toInt)) (f, colStat) } checkColStats(df, expectedColStatsSeq) @@ -255,10 +255,10 @@ class StatisticsColumnSuite extends StatisticsTest { doubleSeq.distinct.length.toLong)) case StringType => ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble, - stringSeq.map(_.length).max.toLong, stringSeq.distinct.length.toLong)) + stringSeq.map(_.length).max.toInt, stringSeq.distinct.length.toLong)) case BinaryType => ColumnStat(InternalRow(0L, binarySeq.map(_.length).sum / binarySeq.length.toDouble, - binarySeq.map(_.length).max.toLong)) + binarySeq.map(_.length).max.toInt)) case BooleanType => ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong, booleanSeq.count(_.equals(false)).toLong)) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 99dd080683d40..85228bb00123d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -378,7 +378,7 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong)) case StringType => ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble, - stringSeq.map(_.length).max.toLong, stringSeq.distinct.length.toLong)) + stringSeq.map(_.length).max.toInt, stringSeq.distinct.length.toLong)) case BooleanType => ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong, booleanSeq.count(_.equals(false)).toLong)) From 08eac356095c7faa2b19d52f2fb0cbc47eb7d1d1 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Thu, 13 Oct 2016 13:31:50 -0700 Subject: [PATCH 003/162] [SPARK-17834][SQL] Fetch the earliest offsets manually in KafkaSource instead of counting on KafkaConsumer ## What changes were proposed in this pull request? 
Because `KafkaConsumer.poll(0)` may update the partition offsets, this PR just calls `seekToBeginning` to manually set the earliest offsets for the KafkaSource initial offsets. ## How was this patch tested? Existing tests. Author: Shixiong Zhu Closes #15397 from zsxwing/SPARK-17834. --- .../spark/sql/kafka010/KafkaSource.scala | 55 ++++++++++++------- .../sql/kafka010/KafkaSourceProvider.scala | 19 +++++-- 2 files changed, 48 insertions(+), 26 deletions(-) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index 1be70db87497e..4b0bb0a0f725c 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -82,6 +82,7 @@ private[kafka010] case class KafkaSource( executorKafkaParams: ju.Map[String, Object], sourceOptions: Map[String, String], metadataPath: String, + startFromEarliestOffset: Boolean, failOnDataLoss: Boolean) extends Source with Logging { @@ -109,7 +110,11 @@ private[kafka010] case class KafkaSource( private lazy val initialPartitionOffsets = { val metadataLog = new HDFSMetadataLog[KafkaSourceOffset](sqlContext.sparkSession, metadataPath) metadataLog.get(0).getOrElse { - val offsets = KafkaSourceOffset(fetchPartitionOffsets(seekToEnd = false)) + val offsets = if (startFromEarliestOffset) { + KafkaSourceOffset(fetchEarliestOffsets()) + } else { + KafkaSourceOffset(fetchLatestOffsets()) + } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") offsets @@ -123,7 +128,7 @@ private[kafka010] case class KafkaSource( // Make sure initialPartitionOffsets is initialized initialPartitionOffsets - val offset = KafkaSourceOffset(fetchPartitionOffsets(seekToEnd = true)) + val offset = KafkaSourceOffset(fetchLatestOffsets()) logDebug(s"GetOffset: ${offset.partitionToOffsets.toSeq.map(_.toString).sorted}") Some(offset) } @@ -227,26 +232,34 @@ private[kafka010] case class KafkaSource( override def toString(): String = s"KafkaSource[$consumerStrategy]" /** - * Fetch the offset of a partition, either seek to the latest offsets or use the current offsets - * in the consumer. + * Fetch the earliest offsets of partitions. */ - private def fetchPartitionOffsets( - seekToEnd: Boolean): Map[TopicPartition, Long] = withRetriesWithoutInterrupt { - // Make sure `KafkaConsumer.poll` won't be interrupted (KAFKA-1894) - assert(Thread.currentThread().isInstanceOf[StreamExecutionThread]) + private def fetchEarliestOffsets(): Map[TopicPartition, Long] = withRetriesWithoutInterrupt { // Poll to get the latest assigned partitions consumer.poll(0) val partitions = consumer.assignment() consumer.pause(partitions) - logDebug(s"Partitioned assigned to consumer: $partitions") + logDebug(s"Partitions assigned to consumer: $partitions. Seeking to the beginning") - // Get the current or latest offset of each partition - if (seekToEnd) { - consumer.seekToEnd(partitions) - logDebug("Seeked to the end") - } + consumer.seekToBeginning(partitions) + val partitionOffsets = partitions.asScala.map(p => p -> consumer.position(p)).toMap + logDebug(s"Got earliest offsets for partition : $partitionOffsets") + partitionOffsets + } + + /** + * Fetch the latest offset of partitions. 
+ */ + private def fetchLatestOffsets(): Map[TopicPartition, Long] = withRetriesWithoutInterrupt { + // Poll to get the latest assigned partitions + consumer.poll(0) + val partitions = consumer.assignment() + consumer.pause(partitions) + logDebug(s"Partitions assigned to consumer: $partitions. Seeking to the end.") + + consumer.seekToEnd(partitions) val partitionOffsets = partitions.asScala.map(p => p -> consumer.position(p)).toMap - logDebug(s"Got offsets for partition : $partitionOffsets") + logDebug(s"Got latest offsets for partition : $partitionOffsets") partitionOffsets } @@ -256,22 +269,21 @@ private[kafka010] case class KafkaSource( */ private def fetchNewPartitionEarliestOffsets( newPartitions: Seq[TopicPartition]): Map[TopicPartition, Long] = withRetriesWithoutInterrupt { - // Make sure `KafkaConsumer.poll` won't be interrupted (KAFKA-1894) - assert(Thread.currentThread().isInstanceOf[StreamExecutionThread]) // Poll to get the latest assigned partitions consumer.poll(0) val partitions = consumer.assignment() + consumer.pause(partitions) logDebug(s"\tPartitioned assigned to consumer: $partitions") // Get the earliest offset of each partition consumer.seekToBeginning(partitions) - val partitionToOffsets = newPartitions.filter { p => + val partitionOffsets = newPartitions.filter { p => // When deleting topics happen at the same time, some partitions may not be in `partitions`. // So we need to ignore them partitions.contains(p) }.map(p => p -> consumer.position(p)).toMap - logDebug(s"Got offsets for new partitions: $partitionToOffsets") - partitionToOffsets + logDebug(s"Got earliest offsets for new partitions: $partitionOffsets") + partitionOffsets } /** @@ -284,6 +296,9 @@ private[kafka010] case class KafkaSource( */ private def withRetriesWithoutInterrupt( body: => Map[TopicPartition, Long]): Map[TopicPartition, Long] = { + // Make sure `KafkaConsumer.poll` won't be interrupted (KAFKA-1894) + assert(Thread.currentThread().isInstanceOf[StreamExecutionThread]) + synchronized { var result: Option[Map[TopicPartition, Long]] = None var attempt = 1 diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 1b0a2fe955d03..23b1b60f3bcaa 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -77,10 +77,15 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider // id. Hence, we should generate a unique id for each query. val uniqueGroupId = s"spark-kafka-source-${UUID.randomUUID}-${metadataPath.hashCode}" - val autoOffsetResetValue = caseInsensitiveParams.get(STARTING_OFFSET_OPTION_KEY) match { - case Some(value) => value.trim() // same values as those supported by auto.offset.reset - case None => "latest" - } + val startFromEarliestOffset = + caseInsensitiveParams.get(STARTING_OFFSET_OPTION_KEY).map(_.trim.toLowerCase) match { + case Some("latest") => false + case Some("earliest") => true + case Some(pos) => + // This should not happen since we have already checked the options. 
+ throw new IllegalStateException(s"Invalid $STARTING_OFFSET_OPTION_KEY: $pos") + case None => false + } val kafkaParamsForStrategy = ConfigUpdater("source", specifiedKafkaParams) @@ -90,8 +95,9 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider // So that consumers in Kafka source do not mess with any existing group id .set(ConsumerConfig.GROUP_ID_CONFIG, s"$uniqueGroupId-driver") - // So that consumers can start from earliest or latest - .set(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, autoOffsetResetValue) + // Set to "latest" to avoid exceptions. However, KafkaSource will fetch the initial offsets + // by itself instead of counting on KafkaConsumer. + .set(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest") // So that consumers in the driver does not commit offsets unnecessarily .set(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false") @@ -147,6 +153,7 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider kafkaParamsForExecutors, parameters, metadataPath, + startFromEarliestOffset, failOnDataLoss) } From 7106866c220c73960c6fe2a70e4911516617e21f Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Thu, 13 Oct 2016 13:36:26 -0700 Subject: [PATCH 004/162] [SPARK-17731][SQL][STREAMING] Metrics for structured streaming ## What changes were proposed in this pull request? Metrics are needed for monitoring structured streaming apps. Here is the design doc for implementing the necessary metrics. https://docs.google.com/document/d/1NIdcGuR1B3WIe8t7VxLrt58TJB4DtipWEbj5I_mzJys/edit?usp=sharing Specifically, this PR adds the following public APIs changes. ### New APIs - `StreamingQuery.status` returns a `StreamingQueryStatus` object (renamed from `StreamingQueryInfo`, see later) - `StreamingQueryStatus` has the following important fields - inputRate - Current rate (rows/sec) at which data is being generated by all the sources - processingRate - Current rate (rows/sec) at which the query is processing data from all the sources - ~~outputRate~~ - *Does not work with wholestage codegen* - latency - Current average latency between the data being available in source and the sink writing the corresponding output - sourceStatuses: Array[SourceStatus] - Current statuses of the sources - sinkStatus: SinkStatus - Current status of the sink - triggerStatus - Low-level detailed status of the last completed/currently active trigger - latencies - getOffset, getBatch, full trigger, wal writes - timestamps - trigger start, finish, after getOffset, after getBatch - numRows - input, output, state total/updated rows for aggregations - `SourceStatus` has the following important fields - inputRate - Current rate (rows/sec) at which data is being generated by the source - processingRate - Current rate (rows/sec) at which the query is processing data from the source - triggerStatus - Low-level detailed status of the last completed/currently active trigger - Python API for `StreamingQuery.status()` ### Breaking changes to existing APIs **Existing direct public facing APIs** - Deprecated direct public-facing APIs `StreamingQuery.sourceStatuses` and `StreamingQuery.sinkStatus` in favour of `StreamingQuery.status.sourceStatuses/sinkStatus`. - Branch 2.0 should have it deprecated, master should have it removed. 
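As a rough illustration of the deprecation above, a caller that previously read the per-source and sink statuses directly would now go through the single `status` entry point. This is only a hedged sketch, not code from this patch: the `query` value is a hypothetical already-started `StreamingQuery`, and the fields shown are the ones described in the API summary above.

```scala
// Assume `query` is a running org.apache.spark.sql.streaming.StreamingQuery.

// Previously: query.sourceStatuses and query.sinkStatus
// (deprecated in branch-2.0, removed on master).

// New single entry point:
val status = query.status              // StreamingQueryStatus
val perSource = status.sourceStatuses  // Array[SourceStatus], each with inputRate / processingRate
val sink = status.sinkStatus           // SinkStatus
println(s"aggregate input rate: ${status.inputRate} rows/sec, " +
  s"processing rate: ${status.processingRate} rows/sec")
```
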
**Existing advanced listener APIs** - `StreamingQueryInfo` renamed to `StreamingQueryStatus` for consistency with `SourceStatus`, `SinkStatus` - Earlier StreamingQueryInfo was used only in the advanced listener API, but now it is used in the direct public-facing API (StreamingQuery.status) - Field `queryInfo` in listener events `QueryStarted`, `QueryProgress`, `QueryTerminated` changed to have the name `queryStatus` and return type `StreamingQueryStatus`. - Field `offsetDesc` in `SourceStatus` was `Option[String]`; it was converted to `String`. - For `SourceStatus` and `SinkStatus`, made the constructors private instead of `private[sql]` to make them more Java-safe. Instead added `private[sql] object SourceStatus/SinkStatus.apply()` which are harder to accidentally use in Java. ## How was this patch tested? Old and new unit tests. - Rate calculation and other internal logic of StreamMetrics tested by StreamMetricsSuite. - New info in statuses returned through StreamingQueryListener is tested in StreamingQueryListenerSuite. - New and old info returned through StreamingQuery.status is tested in StreamingQuerySuite. - Source-specific tests for making sure input rows are counted are in source-specific test suites. - Additional tests to test minor additions in LocalTableScanExec, StateStore, etc. Metrics also manually tested using the Ganglia sink. Author: Tathagata Das Closes #15307 from tdas/SPARK-17731. --- .../spark/sql/kafka010/KafkaSourceSuite.scala | 27 ++ project/MimaExcludes.scala | 13 + python/pyspark/sql/streaming.py | 301 +++++++++++++++++ .../spark/sql/catalyst/trees/TreeNode.scala | 7 + .../sql/execution/LocalTableScanExec.scala | 5 +- .../streaming/StatefulAggregate.scala | 31 +- .../execution/streaming/StreamExecution.scala | 307 ++++++++++++++---- .../execution/streaming/StreamMetrics.scala | 242 ++++++++++++++ .../sql/execution/streaming/memory.scala | 7 + .../state/HDFSBackedStateStoreProvider.scala | 2 + .../streaming/state/StateStore.scala | 3 + .../apache/spark/sql/internal/SQLConf.scala | 8 + .../spark/sql/streaming/SinkStatus.scala | 28 +- .../spark/sql/streaming/SourceStatus.scala | 54 ++- .../spark/sql/streaming/StreamingQuery.scala | 13 +- .../sql/streaming/StreamingQueryInfo.scala | 37 --- .../streaming/StreamingQueryListener.scala | 8 +- .../sql/streaming/StreamingQueryStatus.scala | 139 ++++++++ .../execution/metric/SQLMetricsSuite.scala | 17 + .../streaming/StreamMetricsSuite.scala | 213 ++++++++++++ .../streaming/TextSocketStreamSuite.scala | 24 ++ .../streaming/state/StateStoreSuite.scala | 5 + .../sql/streaming/FileStreamSourceSuite.scala | 14 + .../spark/sql/streaming/StreamTest.scala | 72 ++++ .../streaming/StreamingAggregationSuite.scala | 54 +++ .../StreamingQueryListenerSuite.scala | 220 +++++-------- .../sql/streaming/StreamingQuerySuite.scala | 180 +++++++++- 27 files changed, 1758 insertions(+), 273 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetrics.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryInfo.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryStatus.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/StreamMetricsSuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index c640b93b0a2ee..8b5296ea135c7 100644 ---
a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -264,6 +264,33 @@ class KafkaSourceSuite extends KafkaSourceTest { testUnsupportedConfig("kafka.auto.offset.reset", "latest") } + test("input row metrics") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val kafka = spark + .readStream + .format("kafka") + .option("subscribe", topic) + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + + val mapped = kafka.map(kv => kv._2.toInt + 1) + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + AssertOnLastQueryStatus { status => + assert(status.triggerDetails.get("numRows.input.total").toInt > 0) + assert(status.sourceStatuses(0).processingRate > 0.0) + } + ) + } + private def newTopic(): String = s"topic-${topicId.getAndIncrement()}" private def testFromLatestOffsets(topic: String, options: (String, String)*): Unit = { diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index ae72d37a0b61c..1349af4219c16 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -56,6 +56,19 @@ object MimaExcludes { ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.databaseExists"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.tableExists"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.functionExists"), + + // [SPARK-17731][SQL][Streaming] Metrics for structured streaming + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.SourceStatus.this"), + ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.streaming.SourceStatus.offsetDesc"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQuery.status"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.SinkStatus.this"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.streaming.StreamingQueryInfo"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryListener#QueryStarted.this"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryListener#QueryStarted.queryInfo"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryListener#QueryProgress.this"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryListener#QueryProgress.queryInfo"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryListener#QueryTerminated.queryInfo"), + // [SPARK-17338][SQL] add global temp view ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.dropGlobalTempView"), ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.catalog.Catalog.dropTempView"), diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 4e438fd5bee22..ce47bd1640fb1 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ 
-189,6 +189,304 @@ def resetTerminated(self): self._jsqm.resetTerminated() +class StreamingQueryStatus(object): + """A class used to report information about the progress of a StreamingQuery. + + .. note:: Experimental + + .. versionadded:: 2.1 + """ + + def __init__(self, jsqs): + self._jsqs = jsqs + + def __str__(self): + """ + Pretty string of this query status. + + >>> print(sqs) + StreamingQueryStatus: + Query name: query + Query id: 1 + Status timestamp: 123 + Input rate: 15.5 rows/sec + Processing rate 23.5 rows/sec + Latency: 345.0 ms + Trigger details: + isDataPresentInTrigger: true + isTriggerActive: true + latency.getBatch.total: 20 + latency.getOffset.total: 10 + numRows.input.total: 100 + triggerId: 5 + Source statuses [1 source]: + Source 1: MySource1 + Available offset: #0 + Input rate: 15.5 rows/sec + Processing rate: 23.5 rows/sec + Trigger details: + numRows.input.source: 100 + latency.getOffset.source: 10 + latency.getBatch.source: 20 + Sink status: MySink + Committed offsets: [#1, -] + """ + return self._jsqs.toString() + + @property + @ignore_unicode_prefix + @since(2.1) + def name(self): + """ + Name of the query. This name is unique across all active queries. + + >>> sqs.name + u'query' + """ + return self._jsqs.name() + + @property + @since(2.1) + def id(self): + """ + Id of the query. This id is unique across all queries that have been started in + the current process. + + >>> int(sqs.id) + 1 + """ + return self._jsqs.id() + + @property + @since(2.1) + def timestamp(self): + """ + Timestamp (ms) of when this query was generated. + + >>> int(sqs.timestamp) + 123 + """ + return self._jsqs.timestamp() + + @property + @since(2.1) + def inputRate(self): + """ + Current total rate (rows/sec) at which data is being generated by all the sources. + + >>> sqs.inputRate + 15.5 + """ + return self._jsqs.inputRate() + + @property + @since(2.1) + def processingRate(self): + """ + Current rate (rows/sec) at which the query is processing data from all the sources. + + >>> sqs.processingRate + 23.5 + """ + return self._jsqs.processingRate() + + @property + @since(2.1) + def latency(self): + """ + Current average latency between the data being available in source and the sink + writing the corresponding output. + + >>> sqs.latency + 345.0 + """ + if (self._jsqs.latency().nonEmpty()): + return self._jsqs.latency().get() + else: + return None + + @property + @ignore_unicode_prefix + @since(2.1) + def sourceStatuses(self): + """ + Current statuses of the sources as a list. + + >>> len(sqs.sourceStatuses) + 1 + >>> sqs.sourceStatuses[0].description + u'MySource1' + """ + return [SourceStatus(ss) for ss in self._jsqs.sourceStatuses()] + + @property + @ignore_unicode_prefix + @since(2.1) + def sinkStatus(self): + """ + Current status of the sink. + + >>> sqs.sinkStatus.description + u'MySink' + """ + return SinkStatus(self._jsqs.sinkStatus()) + + @property + @ignore_unicode_prefix + @since(2.1) + def triggerDetails(self): + """ + Low-level details of the currently active trigger (e.g. number of rows processed + in trigger, latency of intermediate steps, etc.). + + If no trigger is currently active, then it will have details of the last completed trigger. 
+ + >>> sqs.triggerDetails + {u'triggerId': u'5', u'latency.getBatch.total': u'20', u'numRows.input.total': u'100', + u'isTriggerActive': u'true', u'latency.getOffset.total': u'10', + u'isDataPresentInTrigger': u'true'} + """ + return self._jsqs.triggerDetails() + + +class SourceStatus(object): + """ + Status and metrics of a streaming Source. + + .. note:: Experimental + + .. versionadded:: 2.1 + """ + + def __init__(self, jss): + self._jss = jss + + def __str__(self): + """ + Pretty string of this source status. + + >>> print(sqs.sourceStatuses[0]) + SourceStatus: MySource1 + Available offset: #0 + Input rate: 15.5 rows/sec + Processing rate: 23.5 rows/sec + Trigger details: + numRows.input.source: 100 + latency.getOffset.source: 10 + latency.getBatch.source: 20 + """ + return self._jss.toString() + + @property + @ignore_unicode_prefix + @since(2.1) + def description(self): + """ + Description of the source corresponding to this status. + + >>> sqs.sourceStatuses[0].description + u'MySource1' + """ + return self._jss.description() + + @property + @ignore_unicode_prefix + @since(2.1) + def offsetDesc(self): + """ + Description of the current offset if known. + + >>> sqs.sourceStatuses[0].offsetDesc + u'#0' + """ + return self._jss.offsetDesc() + + @property + @since(2.1) + def inputRate(self): + """ + Current rate (rows/sec) at which data is being generated by the source. + + >>> sqs.sourceStatuses[0].inputRate + 15.5 + """ + return self._jss.inputRate() + + @property + @since(2.1) + def processingRate(self): + """ + Current rate (rows/sec) at which the query is processing data from the source. + + >>> sqs.sourceStatuses[0].processingRate + 23.5 + """ + return self._jss.processingRate() + + @property + @ignore_unicode_prefix + @since(2.1) + def triggerDetails(self): + """ + Low-level details of the currently active trigger (e.g. number of rows processed + in trigger, latency of intermediate steps, etc.). + + If no trigger is currently active, then it will have details of the last completed trigger. + + >>> sqs.sourceStatuses[0].triggerDetails + {u'numRows.input.source': u'100', u'latency.getOffset.source': u'10', + u'latency.getBatch.source': u'20'} + """ + return self._jss.triggerDetails() + + +class SinkStatus(object): + """ + Status and metrics of a streaming Sink. + + .. note:: Experimental + + .. versionadded:: 2.1 + """ + + def __init__(self, jss): + self._jss = jss + + def __str__(self): + """ + Pretty string of this source status. + + >>> print(sqs.sinkStatus) + SinkStatus: MySink + Committed offsets: [#1, -] + """ + return self._jss.toString() + + @property + @ignore_unicode_prefix + @since(2.1) + def description(self): + """ + Description of the source corresponding to this status. + + >>> sqs.sinkStatus.description + u'MySink' + """ + return self._jss.description() + + @property + @ignore_unicode_prefix + @since(2.1) + def offsetDesc(self): + """ + Description of the current offsets up to which data has been written by the sink. + + >>> sqs.sinkStatus.offsetDesc + u'[#1, -]' + """ + return self._jss.offsetDesc() + + class Trigger(object): """Used to indicate how often results should be produced by a :class:`StreamingQuery`. 
@@ -753,11 +1051,14 @@ def _test(): globs['sdf_schema'] = StructType([StructField("data", StringType(), False)]) globs['df'] = \ globs['spark'].readStream.format('text').load('python/test_support/sql/streaming') + globs['sqs'] = StreamingQueryStatus( + spark.sparkContext._jvm.org.apache.spark.sql.streaming.StreamingQueryStatus.testStatus()) (failure_count, test_count) = doctest.testmod( pyspark.sql.streaming, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) globs['spark'].stop() + if failure_count: exit(-1) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 83cb375525832..ea8d8fef7bdf1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -164,6 +164,13 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { ret } + /** + * Returns a Seq containing the leaves in this tree. + */ + def collectLeaves(): Seq[BaseType] = { + this.collect { case p if p.children.isEmpty => p } + } + /** * Finds and returns the first [[TreeNode]] of the tree for which the given partial function * is defined (pre-order), and applies the partial function to it. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala index 6598fa381aa3d..e366b9af35c62 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala @@ -64,10 +64,13 @@ case class LocalTableScanExec( } override def executeCollect(): Array[InternalRow] = { + longMetric("numOutputRows").add(unsafeRows.size) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { - unsafeRows.take(limit) + val taken = unsafeRows.take(limit) + longMetric("numOutputRows").add(taken.size) + taken } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulAggregate.scala index 4d0283fbef1d0..587ea7d02acab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulAggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulAggregate.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution +import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.streaming.state._ import org.apache.spark.sql.execution.SparkPlan @@ -56,7 +57,12 @@ case class StateStoreRestoreExec( child: SparkPlan) extends execution.UnaryExecNode with StatefulOperator { + override lazy val metrics = Map( + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + override protected def doExecute(): RDD[InternalRow] = { + val numOutputRows = longMetric("numOutputRows") + child.execute().mapPartitionsWithStateStore( getStateId.checkpointLocation, operatorId = getStateId.operatorId, @@ -69,6 +75,7 @@ case class StateStoreRestoreExec( iter.flatMap { row => val key = getKey(row) val savedState = store.get(key) + numOutputRows += 1 
row +: savedState.toSeq } } @@ -86,7 +93,13 @@ case class StateStoreSaveExec( child: SparkPlan) extends execution.UnaryExecNode with StatefulOperator { + override lazy val metrics = Map( + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), + "numTotalStateRows" -> SQLMetrics.createMetric(sparkContext, "number of total state rows"), + "numUpdatedStateRows" -> SQLMetrics.createMetric(sparkContext, "number of updated state rows")) + override protected def doExecute(): RDD[InternalRow] = { + metrics // force lazy init at driver assert(returnAllStates.nonEmpty, "Incorrect planning in IncrementalExecution, returnAllStates have not been set") val saveAndReturnFunc = if (returnAllStates.get) saveAndReturnAll _ else saveAndReturnUpdated _ @@ -111,6 +124,10 @@ case class StateStoreSaveExec( private def saveAndReturnUpdated( store: StateStore, iter: Iterator[InternalRow]): Iterator[InternalRow] = { + val numOutputRows = longMetric("numOutputRows") + val numTotalStateRows = longMetric("numTotalStateRows") + val numUpdatedStateRows = longMetric("numUpdatedStateRows") + new Iterator[InternalRow] { private[this] val baseIterator = iter private[this] val getKey = GenerateUnsafeProjection.generate(keyExpressions, child.output) @@ -118,6 +135,7 @@ case class StateStoreSaveExec( override def hasNext: Boolean = { if (!baseIterator.hasNext) { store.commit() + numTotalStateRows += store.numKeys() false } else { true @@ -128,6 +146,8 @@ case class StateStoreSaveExec( val row = baseIterator.next().asInstanceOf[UnsafeRow] val key = getKey(row) store.put(key.copy(), row.copy()) + numOutputRows += 1 + numUpdatedStateRows += 1 row } } @@ -142,12 +162,21 @@ case class StateStoreSaveExec( store: StateStore, iter: Iterator[InternalRow]): Iterator[InternalRow] = { val getKey = GenerateUnsafeProjection.generate(keyExpressions, child.output) + val numOutputRows = longMetric("numOutputRows") + val numTotalStateRows = longMetric("numTotalStateRows") + val numUpdatedStateRows = longMetric("numUpdatedStateRows") + while (iter.hasNext) { val row = iter.next().asInstanceOf[UnsafeRow] val key = getKey(row) store.put(key.copy(), row.copy()) + numUpdatedStateRows += 1 } store.commit() - store.iterator().map(_._2.asInstanceOf[InternalRow]) + numTotalStateRows += store.numKeys() + store.iterator().map { case (k, v) => + numOutputRows += 1 + v.asInstanceOf[InternalRow] + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 333239f875bd3..9144736c940f5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.execution.{QueryExecution, SparkPlan} import org.apache.spark.sql.execution.command.ExplainCommand import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming._ @@ -57,6 +57,7 @@ class StreamExecution( extends StreamingQuery with Logging { import org.apache.spark.sql.streaming.StreamingQueryListener._ + import StreamMetrics._ private val 
pollingDelayMs = sparkSession.sessionState.conf.streamingPollingDelay @@ -105,11 +106,22 @@ class StreamExecution( var lastExecution: QueryExecution = null @volatile - var streamDeathCause: StreamingQueryException = null + private var streamDeathCause: StreamingQueryException = null /* Get the call site in the caller thread; will pass this into the micro batch thread */ private val callSite = Utils.getCallSite() + /** Metrics for this query */ + private val streamMetrics = + new StreamMetrics(uniqueSources.toSet, triggerClock, s"StructuredStreaming.$name") + + @volatile + private var currentStatus: StreamingQueryStatus = null + + /** Flag that signals whether any error with input metrics have already been logged */ + @volatile + private var metricWarningLogged: Boolean = false + /** * The thread that runs the micro-batches of this stream. Note that this thread must be * [[org.apache.spark.util.UninterruptibleThread]] to avoid potential deadlocks in using @@ -136,16 +148,14 @@ class StreamExecution( /** Whether the query is currently active or not */ override def isActive: Boolean = state == ACTIVE + /** Returns the current status of the query. */ + override def status: StreamingQueryStatus = currentStatus + /** Returns current status of all the sources. */ - override def sourceStatuses: Array[SourceStatus] = { - val localAvailableOffsets = availableOffsets - sources.map(s => - new SourceStatus(s.toString, localAvailableOffsets.get(s).map(_.toString))).toArray - } + override def sourceStatuses: Array[SourceStatus] = currentStatus.sourceStatuses.toArray /** Returns current status of the sink. */ - override def sinkStatus: SinkStatus = - new SinkStatus(sink.toString, committedOffsets.toCompositeOffset(sources).toString) + override def sinkStatus: SinkStatus = currentStatus.sinkStatus /** Returns the [[StreamingQueryException]] if the query was terminated by an exception. */ override def exception: Option[StreamingQueryException] = Option(streamDeathCause) @@ -176,7 +186,11 @@ class StreamExecution( // Mark ACTIVE and then post the event. QueryStarted event is synchronously sent to listeners, // so must mark this as ACTIVE first. state = ACTIVE - postEvent(new QueryStarted(this.toInfo)) // Assumption: Does not throw exception. + if (sparkSession.sessionState.conf.streamingMetricsEnabled) { + sparkSession.sparkContext.env.metricsSystem.registerSource(streamMetrics) + } + updateStatus() + postEvent(new QueryStarted(currentStatus)) // Assumption: Does not throw exception. 
// Unblock starting thread startLatch.countDown() @@ -185,25 +199,41 @@ class StreamExecution( SparkSession.setActiveSession(sparkSession) triggerExecutor.execute(() => { - if (isActive) { - if (currentBatchId < 0) { - // We'll do this initialization only once - populateStartOffsets() - logDebug(s"Stream running from $committedOffsets to $availableOffsets") + streamMetrics.reportTriggerStarted(currentBatchId) + streamMetrics.reportTriggerDetail(STATUS_MESSAGE, "Finding new data from sources") + updateStatus() + val isTerminated = reportTimeTaken(TRIGGER_LATENCY) { + if (isActive) { + if (currentBatchId < 0) { + // We'll do this initialization only once + populateStartOffsets() + logDebug(s"Stream running from $committedOffsets to $availableOffsets") + } else { + constructNextBatch() + } + if (dataAvailable) { + streamMetrics.reportTriggerDetail(IS_DATA_PRESENT_IN_TRIGGER, true) + streamMetrics.reportTriggerDetail(STATUS_MESSAGE, "Processing new data") + updateStatus() + runBatch() + // We'll increase currentBatchId after we complete processing current batch's data + currentBatchId += 1 + } else { + streamMetrics.reportTriggerDetail(IS_DATA_PRESENT_IN_TRIGGER, false) + streamMetrics.reportTriggerDetail(STATUS_MESSAGE, "No new data") + updateStatus() + Thread.sleep(pollingDelayMs) + } + true } else { - constructNextBatch() + false } - if (dataAvailable) { - runBatch() - // We'll increase currentBatchId after we complete processing current batch's data - currentBatchId += 1 - } else { - Thread.sleep(pollingDelayMs) - } - true - } else { - false } + // Update metrics and notify others + streamMetrics.reportTriggerFinished() + updateStatus() + postEvent(new QueryProgress(currentStatus)) + isTerminated }) } catch { case _: InterruptedException if state == TERMINATED => // interrupted by stop() @@ -221,8 +251,16 @@ class StreamExecution( } } finally { state = TERMINATED + + // Update metrics and status + streamMetrics.stop() + sparkSession.sparkContext.env.metricsSystem.removeSource(streamMetrics) + updateStatus() + + // Notify others sparkSession.streams.notifyQueryTermination(StreamExecution.this) - postEvent(new QueryTerminated(this.toInfo, exception.map(_.cause).map(Utils.exceptionString))) + postEvent( + new QueryTerminated(currentStatus, exception.map(_.cause).map(Utils.exceptionString))) terminationLatch.countDown() } } @@ -248,7 +286,6 @@ class StreamExecution( committedOffsets = lastOffsets.toStreamProgress(sources) logDebug(s"Resuming with committed offsets: $committedOffsets") } - case None => // We are starting this stream for the first time. logInfo(s"Starting new streaming query.") currentBatchId = 0 @@ -278,8 +315,14 @@ class StreamExecution( val hasNewData = { awaitBatchLock.lock() try { - val newData = uniqueSources.flatMap(s => s.getOffset.map(o => s -> o)) - availableOffsets ++= newData + reportTimeTaken(GET_OFFSET_LATENCY) { + val latestOffsets: Map[Source, Option[Offset]] = uniqueSources.map { s => + reportTimeTaken(s, SOURCE_GET_OFFSET_LATENCY) { + (s, s.getOffset) + } + }.toMap + availableOffsets ++= latestOffsets.filter { case (s, o) => o.nonEmpty }.mapValues(_.get) + } if (dataAvailable) { true @@ -292,16 +335,19 @@ class StreamExecution( } } if (hasNewData) { - assert(offsetLog.add(currentBatchId, availableOffsets.toCompositeOffset(sources)), - s"Concurrent update to the log. 
Multiple streaming jobs detected for $currentBatchId") - logInfo(s"Committed offsets for batch $currentBatchId.") - - // Now that we have logged the new batch, no further processing will happen for - // the previous batch, and it is safe to discard the old metadata. - // Note that purge is exclusive, i.e. it purges everything before currentBatchId. - // NOTE: If StreamExecution implements pipeline parallelism (multiple batches in - // flight at the same time), this cleanup logic will need to change. - offsetLog.purge(currentBatchId) + reportTimeTaken(OFFSET_WAL_WRITE_LATENCY) { + assert( + offsetLog.add(currentBatchId, availableOffsets.toCompositeOffset(sources)), + s"Concurrent update to the log. Multiple streaming jobs detected for $currentBatchId") + logInfo(s"Committed offsets for batch $currentBatchId.") + + // Now that we have logged the new batch, no further processing will happen for + // the previous batch, and it is safe to discard the old metadata. + // Note that purge is exclusive, i.e. it purges everything before currentBatchId. + // NOTE: If StreamExecution implements pipeline parallelism (multiple batches in + // flight at the same time), this cleanup logic will need to change. + offsetLog.purge(currentBatchId) + } } else { awaitBatchLock.lock() try { @@ -311,26 +357,30 @@ class StreamExecution( awaitBatchLock.unlock() } } + reportTimestamp(GET_OFFSET_TIMESTAMP) } /** * Processes any data available between `availableOffsets` and `committedOffsets`. */ private def runBatch(): Unit = { - val startTime = System.nanoTime() - // TODO: Move this to IncrementalExecution. // Request unprocessed data from all sources. - val newData = availableOffsets.flatMap { - case (source, available) + val newData = reportTimeTaken(GET_BATCH_LATENCY) { + availableOffsets.flatMap { + case (source, available) if committedOffsets.get(source).map(_ != available).getOrElse(true) => - val current = committedOffsets.get(source) - val batch = source.getBatch(current, available) - logDebug(s"Retrieving data from $source: $current -> $available") - Some(source -> batch) - case _ => None - }.toMap + val current = committedOffsets.get(source) + val batch = reportTimeTaken(source, SOURCE_GET_BATCH_LATENCY) { + source.getBatch(current, available) + } + logDebug(s"Retrieving data from $source: $current -> $available") + Some(source -> batch) + case _ => None + } + } + reportTimestamp(GET_BATCH_TIMESTAMP) // A list of attributes that will need to be updated. var replacements = new ArrayBuffer[(Attribute, Attribute)] @@ -351,25 +401,24 @@ class StreamExecution( // Rewire the plan to use the new attributes that were returned by the source. 
val replacementMap = AttributeMap(replacements) - val newPlan = withNewSources transformAllExpressions { + val triggerLogicalPlan = withNewSources transformAllExpressions { case a: Attribute if replacementMap.contains(a) => replacementMap(a) } - val optimizerStart = System.nanoTime() - lastExecution = new IncrementalExecution( - sparkSession, - newPlan, - outputMode, - checkpointFile("state"), - currentBatchId) - - lastExecution.executedPlan - val optimizerTime = (System.nanoTime() - optimizerStart).toDouble / 1000000 - logDebug(s"Optimized batch in ${optimizerTime}ms") + val executedPlan = reportTimeTaken(OPTIMIZER_LATENCY) { + lastExecution = new IncrementalExecution( + sparkSession, + triggerLogicalPlan, + outputMode, + checkpointFile("state"), + currentBatchId) + lastExecution.executedPlan // Force the lazy generation of execution plan + } val nextBatch = new Dataset(sparkSession, lastExecution, RowEncoder(lastExecution.analyzed.schema)) sink.addBatch(currentBatchId, nextBatch) + reportNumRows(executedPlan, triggerLogicalPlan, newData) awaitBatchLock.lock() try { @@ -379,11 +428,8 @@ class StreamExecution( awaitBatchLock.unlock() } - val batchTime = (System.nanoTime() - startTime).toDouble / 1000000 - logInfo(s"Completed up to $availableOffsets in ${batchTime}ms") // Update committed offsets. committedOffsets ++= availableOffsets - postEvent(new QueryProgress(this.toInfo)) } private def postEvent(event: StreamingQueryListener.Event) { @@ -516,12 +562,131 @@ class StreamExecution( """.stripMargin } - private def toInfo: StreamingQueryInfo = { - new StreamingQueryInfo( - this.name, - this.id, - this.sourceStatuses, - this.sinkStatus) + /** + * Report row metrics of the executed trigger + * @param triggerExecutionPlan Execution plan of the trigger + * @param triggerLogicalPlan Logical plan of the trigger, generated from the query logical plan + * @param sourceToDF Source to DataFrame returned by the source.getBatch in this trigger + */ + private def reportNumRows( + triggerExecutionPlan: SparkPlan, + triggerLogicalPlan: LogicalPlan, + sourceToDF: Map[Source, DataFrame]): Unit = { + // We want to associate execution plan leaves to sources that generate them, so that we match + // the their metrics (e.g. numOutputRows) to the sources. To do this we do the following. + // Consider the translation from the streaming logical plan to the final executed plan. + // + // streaming logical plan (with sources) <==> trigger's logical plan <==> executed plan + // + // 1. We keep track of streaming sources associated with each leaf in the trigger's logical plan + // - Each logical plan leaf will be associated with a single streaming source. + // - There can be multiple logical plan leaves associated with a streaming source. + // - There can be leaves not associated with any streaming source, because they were + // generated from a batch source (e.g. stream-batch joins) + // + // 2. Assuming that the executed plan has same number of leaves in the same order as that of + // the trigger logical plan, we associate executed plan leaves with corresponding + // streaming sources. + // + // 3. For each source, we sum the metrics of the associated execution plan leaves. 
+ // + val logicalPlanLeafToSource = sourceToDF.flatMap { case (source, df) => + df.logicalPlan.collectLeaves().map { leaf => leaf -> source } + } + val allLogicalPlanLeaves = triggerLogicalPlan.collectLeaves() // includes non-streaming sources + val allExecPlanLeaves = triggerExecutionPlan.collectLeaves() + val sourceToNumInputRows: Map[Source, Long] = + if (allLogicalPlanLeaves.size == allExecPlanLeaves.size) { + val execLeafToSource = allLogicalPlanLeaves.zip(allExecPlanLeaves).flatMap { + case (lp, ep) => logicalPlanLeafToSource.get(lp).map { source => ep -> source } + } + val sourceToNumInputRows = execLeafToSource.map { case (execLeaf, source) => + val numRows = execLeaf.metrics.get("numOutputRows").map(_.value).getOrElse(0L) + source -> numRows + } + sourceToNumInputRows.groupBy(_._1).mapValues(_.map(_._2).sum) // sum up rows for each source + } else { + if (!metricWarningLogged) { + def toString[T](seq: Seq[T]): String = s"(size = ${seq.size}), ${seq.mkString(", ")}" + logWarning( + "Could not report metrics as number leaves in trigger logical plan did not match that" + + s" of the execution plan:\n" + + s"logical plan leaves: ${toString(allLogicalPlanLeaves)}\n" + + s"execution plan leaves: ${toString(allExecPlanLeaves)}\n") + metricWarningLogged = true + } + Map.empty + } + val numOutputRows = triggerExecutionPlan.metrics.get("numOutputRows").map(_.value) + val stateNodes = triggerExecutionPlan.collect { + case p if p.isInstanceOf[StateStoreSaveExec] => p + } + + streamMetrics.reportNumInputRows(sourceToNumInputRows) + stateNodes.zipWithIndex.foreach { case (s, i) => + streamMetrics.reportTriggerDetail( + NUM_TOTAL_STATE_ROWS(i + 1), + s.metrics.get("numTotalStateRows").map(_.value).getOrElse(0L)) + streamMetrics.reportTriggerDetail( + NUM_UPDATED_STATE_ROWS(i + 1), + s.metrics.get("numUpdatedStateRows").map(_.value).getOrElse(0L)) + } + updateStatus() + } + + private def reportTimeTaken[T](triggerDetailKey: String)(body: => T): T = { + val startTime = triggerClock.getTimeMillis() + val result = body + val endTime = triggerClock.getTimeMillis() + val timeTaken = math.max(endTime - startTime, 0) + streamMetrics.reportTriggerDetail(triggerDetailKey, timeTaken) + updateStatus() + if (triggerDetailKey == TRIGGER_LATENCY) { + logInfo(s"Completed up to $availableOffsets in $timeTaken ms") + } + result + } + + private def reportTimeTaken[T](source: Source, triggerDetailKey: String)(body: => T): T = { + val startTime = triggerClock.getTimeMillis() + val result = body + val endTime = triggerClock.getTimeMillis() + streamMetrics.reportSourceTriggerDetail( + source, triggerDetailKey, math.max(endTime - startTime, 0)) + updateStatus() + result + } + + private def reportTimestamp(triggerDetailKey: String): Unit = { + streamMetrics.reportTriggerDetail(triggerDetailKey, triggerClock.getTimeMillis) + updateStatus() + } + + private def updateStatus(): Unit = { + val localAvailableOffsets = availableOffsets + val sourceStatuses = sources.map { s => + SourceStatus( + s.toString, + localAvailableOffsets.get(s).map(_.toString).getOrElse("-"), // TODO: use json if available + streamMetrics.currentSourceInputRate(s), + streamMetrics.currentSourceProcessingRate(s), + streamMetrics.currentSourceTriggerDetails(s)) + }.toArray + val sinkStatus = SinkStatus( + sink.toString, + committedOffsets.toCompositeOffset(sources).toString) + + currentStatus = + StreamingQueryStatus( + name = name, + id = id, + timestamp = triggerClock.getTimeMillis(), + inputRate = streamMetrics.currentInputRate(), + processingRate = 
streamMetrics.currentProcessingRate(), + latency = streamMetrics.currentLatency(), + sourceStatuses = sourceStatuses, + sinkStatus = sinkStatus, + triggerDetails = streamMetrics.currentTriggerDetails()) } trait State diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetrics.scala new file mode 100644 index 0000000000000..e98d1883e4596 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetrics.scala @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming + +import java.{util => ju} + +import scala.collection.mutable + +import com.codahale.metrics.{Gauge, MetricRegistry} + +import org.apache.spark.internal.Logging +import org.apache.spark.metrics.source.{Source => CodahaleSource} +import org.apache.spark.util.Clock + +/** + * Class that manages all the metrics related to a StreamingQuery. It does the following. + * - Calculates metrics (rates, latencies, etc.) based on information reported by StreamExecution. + * - Allows the current metric values to be queried + * - Serves some of the metrics through Codahale/DropWizard metrics + * + * @param sources Unique set of sources in a query + * @param triggerClock Clock used for triggering in StreamExecution + * @param codahaleSourceName Root name for all the Codahale metrics + */ +class StreamMetrics(sources: Set[Source], triggerClock: Clock, codahaleSourceName: String) + extends CodahaleSource with Logging { + + import StreamMetrics._ + + // Trigger infos + private val triggerDetails = new mutable.HashMap[String, String] + private val sourceTriggerDetails = new mutable.HashMap[Source, mutable.HashMap[String, String]] + + // Rate estimators for sources and sinks + private val inputRates = new mutable.HashMap[Source, RateCalculator] + private val processingRates = new mutable.HashMap[Source, RateCalculator] + + // Number of input rows in the current trigger + private val numInputRows = new mutable.HashMap[Source, Long] + private var currentTriggerStartTimestamp: Long = -1 + private var previousTriggerStartTimestamp: Long = -1 + private var latency: Option[Double] = None + + override val sourceName: String = codahaleSourceName + override val metricRegistry: MetricRegistry = new MetricRegistry + + // =========== Initialization =========== + + // Metric names should not have . 
in them, so that all the metrics of a query are identified + // together in Ganglia as a single metric group + registerGauge("inputRate-total", currentInputRate) + registerGauge("processingRate-total", () => currentProcessingRate) + registerGauge("latency", () => currentLatency().getOrElse(-1.0)) + + sources.foreach { s => + inputRates.put(s, new RateCalculator) + processingRates.put(s, new RateCalculator) + sourceTriggerDetails.put(s, new mutable.HashMap[String, String]) + + registerGauge(s"inputRate-${s.toString}", () => currentSourceInputRate(s)) + registerGauge(s"processingRate-${s.toString}", () => currentSourceProcessingRate(s)) + } + + // =========== Setter methods =========== + + def reportTriggerStarted(triggerId: Long): Unit = synchronized { + numInputRows.clear() + triggerDetails.clear() + sourceTriggerDetails.values.foreach(_.clear()) + + reportTriggerDetail(TRIGGER_ID, triggerId) + sources.foreach(s => reportSourceTriggerDetail(s, TRIGGER_ID, triggerId)) + reportTriggerDetail(IS_TRIGGER_ACTIVE, true) + currentTriggerStartTimestamp = triggerClock.getTimeMillis() + reportTriggerDetail(START_TIMESTAMP, currentTriggerStartTimestamp) + } + + def reportTriggerDetail[T](key: String, value: T): Unit = synchronized { + triggerDetails.put(key, value.toString) + } + + def reportSourceTriggerDetail[T](source: Source, key: String, value: T): Unit = synchronized { + sourceTriggerDetails(source).put(key, value.toString) + } + + def reportNumInputRows(inputRows: Map[Source, Long]): Unit = synchronized { + numInputRows ++= inputRows + } + + def reportTriggerFinished(): Unit = synchronized { + require(currentTriggerStartTimestamp >= 0) + val currentTriggerFinishTimestamp = triggerClock.getTimeMillis() + reportTriggerDetail(FINISH_TIMESTAMP, currentTriggerFinishTimestamp) + triggerDetails.remove(STATUS_MESSAGE) + reportTriggerDetail(IS_TRIGGER_ACTIVE, false) + + // Report number of rows + val totalNumInputRows = numInputRows.values.sum + reportTriggerDetail(NUM_INPUT_ROWS, totalNumInputRows) + numInputRows.foreach { case (s, r) => + reportSourceTriggerDetail(s, NUM_SOURCE_INPUT_ROWS, r) + } + + val currentTriggerDuration = currentTriggerFinishTimestamp - currentTriggerStartTimestamp + val previousInputIntervalOption = if (previousTriggerStartTimestamp >= 0) { + Some(currentTriggerStartTimestamp - previousTriggerStartTimestamp) + } else None + + // Update input rate = num rows received by each source during the previous trigger interval + // Interval is measures as interval between start times of previous and current trigger. + // + // TODO: Instead of trigger start, we should use time when getOffset was called on each source + // as this may be different for each source if there are many sources in the query plan + // and getOffset is called serially on them. 
+ if (previousInputIntervalOption.nonEmpty) { + sources.foreach { s => + inputRates(s).update(numInputRows.getOrElse(s, 0), previousInputIntervalOption.get) + } + } + + // Update processing rate = num rows processed for each source in current trigger duration + sources.foreach { s => + processingRates(s).update(numInputRows.getOrElse(s, 0), currentTriggerDuration) + } + + // Update latency = if data present, 0.5 * previous trigger interval + current trigger duration + if (previousInputIntervalOption.nonEmpty && totalNumInputRows > 0) { + latency = Some((previousInputIntervalOption.get.toDouble / 2) + currentTriggerDuration) + } else { + latency = None + } + + previousTriggerStartTimestamp = currentTriggerStartTimestamp + currentTriggerStartTimestamp = -1 + } + + // =========== Getter methods =========== + + def currentInputRate(): Double = synchronized { + // Since we are calculating source input rates using the same time interval for all sources + // it is fine to calculate total input rate as the sum of per source input rate. + inputRates.map(_._2.currentRate).sum + } + + def currentSourceInputRate(source: Source): Double = synchronized { + inputRates(source).currentRate + } + + def currentProcessingRate(): Double = synchronized { + // Since we are calculating source processing rates using the same time interval for all sources + // it is fine to calculate total processing rate as the sum of per source processing rate. + processingRates.map(_._2.currentRate).sum + } + + def currentSourceProcessingRate(source: Source): Double = synchronized { + processingRates(source).currentRate + } + + def currentLatency(): Option[Double] = synchronized { latency } + + def currentTriggerDetails(): Map[String, String] = synchronized { triggerDetails.toMap } + + def currentSourceTriggerDetails(source: Source): Map[String, String] = synchronized { + sourceTriggerDetails(source).toMap + } + + // =========== Other methods =========== + + private def registerGauge[T](name: String, f: () => T)(implicit num: Numeric[T]): Unit = { + synchronized { + metricRegistry.register(name, new Gauge[T] { + override def getValue: T = f() + }) + } + } + + def stop(): Unit = synchronized { + triggerDetails.clear() + inputRates.valuesIterator.foreach { _.stop() } + processingRates.valuesIterator.foreach { _.stop() } + latency = None + } +} + +object StreamMetrics extends Logging { + /** Simple utility class to calculate rate while avoiding DivideByZero */ + class RateCalculator { + @volatile private var rate: Option[Double] = None + + def update(numRows: Long, timeGapMs: Long): Unit = { + if (timeGapMs > 0) { + rate = Some(numRows.toDouble * 1000 / timeGapMs) + } else { + rate = None + logDebug(s"Rate updates cannot with zero or negative time gap $timeGapMs") + } + } + + def currentRate: Double = rate.getOrElse(0.0) + + def stop(): Unit = { rate = None } + } + + + val TRIGGER_ID = "triggerId" + val IS_TRIGGER_ACTIVE = "isTriggerActive" + val IS_DATA_PRESENT_IN_TRIGGER = "isDataPresentInTrigger" + val STATUS_MESSAGE = "statusMessage" + + val START_TIMESTAMP = "timestamp.triggerStart" + val GET_OFFSET_TIMESTAMP = "timestamp.afterGetOffset" + val GET_BATCH_TIMESTAMP = "timestamp.afterGetBatch" + val FINISH_TIMESTAMP = "timestamp.triggerFinish" + + val GET_OFFSET_LATENCY = "latency.getOffset.total" + val GET_BATCH_LATENCY = "latency.getBatch.total" + val OFFSET_WAL_WRITE_LATENCY = "latency.offsetLogWrite" + val OPTIMIZER_LATENCY = "latency.optimizer" + val TRIGGER_LATENCY = "latency.fullTrigger" + val SOURCE_GET_OFFSET_LATENCY = 
"latency.getOffset.source" + val SOURCE_GET_BATCH_LATENCY = "latency.getBatch.source" + + val NUM_INPUT_ROWS = "numRows.input.total" + val NUM_SOURCE_INPUT_ROWS = "numRows.input.source" + def NUM_TOTAL_STATE_ROWS(aggId: Int): String = s"numRows.state.aggregation$aggId.total" + def NUM_UPDATED_STATE_ROWS(aggId: Int): String = s"numRows.state.aggregation$aggId.updated" +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index 5052c4d50c5ed..788fcd0361bee 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -112,6 +112,11 @@ case class MemoryStream[A : Encoder](id: Int, sqlContext: SQLContext) } override def stop() {} + + def reset(): Unit = synchronized { + batches.clear() + currentOffset = new LongOffset(-1) + } } /** @@ -165,6 +170,8 @@ class MemorySink(val schema: StructType, outputMode: OutputMode) extends Sink wi logDebug(s"Skipping already committed batch: $batchId") } } + + override def toString(): String = "MemorySink" } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala index bec966b15ed0f..7d71f5242c27d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala @@ -197,6 +197,8 @@ private[state] class HDFSBackedStateStoreProvider( allUpdates.values().asScala.toIterator } + override def numKeys(): Long = mapToUpdate.size() + /** * Whether all updates have been committed */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala index a67fdceb3cee6..7132e284c28f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala @@ -77,6 +77,9 @@ trait StateStore { */ def updates(): Iterator[StoreUpdate] + /** Number of keys in the state store */ + def numKeys(): Long + /** * Whether all updates have been committed */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 192083e2ea5f5..e671604c39855 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -569,6 +569,12 @@ object SQLConf { .timeConf(TimeUnit.MILLISECONDS) .createWithDefault(10L) + val STREAMING_METRICS_ENABLED = + SQLConfigBuilder("spark.sql.streaming.metricsEnabled") + .doc("Whether Dropwizard/Codahale metrics will be reported for active streaming queries.") + .booleanConf + .createWithDefault(false) + val NDV_MAX_ERROR = SQLConfigBuilder("spark.sql.statistics.ndv.maxError") .internal() @@ -635,6 +641,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { def streamingPollingDelay: Long = getConf(STREAMING_POLLING_DELAY) + def streamingMetricsEnabled: Boolean = getConf(STREAMING_METRICS_ENABLED) + def filesMaxPartitionBytes: Long = 
getConf(FILES_MAX_PARTITION_BYTES) def filesOpenCostInBytes: Long = getConf(FILES_OPEN_COST_IN_BYTES) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala index de1efe961f8bd..c9911665f7d72 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala @@ -18,17 +18,33 @@ package org.apache.spark.sql.streaming import org.apache.spark.annotation.Experimental -import org.apache.spark.sql.execution.streaming.Sink +import org.apache.spark.sql.streaming.StreamingQueryStatus.indent /** * :: Experimental :: - * Status and metrics of a streaming [[Sink]]. + * Status and metrics of a streaming sink. * - * @param description Description of the source corresponding to this status - * @param offsetDesc Description of the current offset up to which data has been written by the sink + * @param description Description of the source corresponding to this status. + * @param offsetDesc Description of the current offsets up to which data has been written + * by the sink. * @since 2.0.0 */ @Experimental -class SinkStatus private[sql]( +class SinkStatus private( val description: String, - val offsetDesc: String) + val offsetDesc: String) { + + override def toString: String = + "SinkStatus:" + indent(prettyString) + + private[sql] def prettyString: String = { + s"""$description + |Committed offsets: $offsetDesc + |""".stripMargin + } +} + +/** Companion object, primarily for creating SinkStatus instances internally */ +private[sql] object SinkStatus { + def apply(desc: String, offsetDesc: String): SinkStatus = new SinkStatus(desc, offsetDesc) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/SourceStatus.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/SourceStatus.scala index bd0c8485e4fdd..6ace4833be22f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/SourceStatus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/SourceStatus.scala @@ -17,18 +17,60 @@ package org.apache.spark.sql.streaming +import java.{util => ju} + +import scala.collection.JavaConverters._ + import org.apache.spark.annotation.Experimental -import org.apache.spark.sql.execution.streaming.Source +import org.apache.spark.sql.streaming.StreamingQueryStatus.indent /** * :: Experimental :: - * Status and metrics of a streaming [[Source]]. + * Status and metrics of a streaming Source. * - * @param description Description of the source corresponding to this status - * @param offsetDesc Description of the current [[Source]] offset if known + * @param description Description of the source corresponding to this status. + * @param offsetDesc Description of the current offset if known. + * @param inputRate Current rate (rows/sec) at which data is being generated by the source. + * @param processingRate Current rate (rows/sec) at which the query is processing data from + * the source. + * @param triggerDetails Low-level details of the currently active trigger (e.g. number of + * rows processed in trigger, latency of intermediate steps, etc.). + * If no trigger is active, then it will have details of the last completed + * trigger. 
* @since 2.0.0 */ @Experimental -class SourceStatus private[sql] ( +class SourceStatus private( val description: String, - val offsetDesc: Option[String]) + val offsetDesc: String, + val inputRate: Double, + val processingRate: Double, + val triggerDetails: ju.Map[String, String]) { + + override def toString: String = + "SourceStatus:" + indent(prettyString) + + private[sql] def prettyString: String = { + val triggerDetailsLines = + triggerDetails.asScala.map { case (k, v) => s"$k: $v" } + s"""$description + |Available offset: $offsetDesc + |Input rate: $inputRate rows/sec + |Processing rate: $processingRate rows/sec + |Trigger details: + |""".stripMargin + indent(triggerDetailsLines) + + } +} + +/** Companion object, primarily for creating SourceStatus instances internally */ +private[sql] object SourceStatus { + def apply( + desc: String, + offsetDesc: String, + inputRate: Double, + processingRate: Double, + triggerDetails: Map[String, String]): SourceStatus = { + new SourceStatus(desc, offsetDesc, inputRate, processingRate, triggerDetails.asJava) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQuery.scala index 91f0a1e3446a1..0a85414451981 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQuery.scala @@ -62,13 +62,24 @@ trait StreamingQuery { */ def exception: Option[StreamingQueryException] + /** + * Returns the current status of the query. + * @since 2.0.2 + */ + def status: StreamingQueryStatus + /** * Returns current status of all the sources. * @since 2.0.0 */ + @deprecated("use status.sourceStatuses", "2.0.2") def sourceStatuses: Array[SourceStatus] - /** Returns current status of the sink. */ + /** + * Returns current status of the sink. + * @since 2.0.0 + */ + @deprecated("use status.sinkStatus", "2.0.2") def sinkStatus: SinkStatus /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryInfo.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryInfo.scala deleted file mode 100644 index 1af2668817eae..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryInfo.scala +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.streaming - -import org.apache.spark.annotation.Experimental - -/** - * :: Experimental :: - * A class used to report information about the progress of a [[StreamingQuery]]. - * - * @param name The [[StreamingQuery]] name. This name is unique across all active queries. - * @param id The [[StreamingQuery]] id. 
This id is unique across - * all queries that have been started in the current process. - * @param sourceStatuses The current statuses of the [[StreamingQuery]]'s sources. - * @param sinkStatus The current status of the [[StreamingQuery]]'s sink. - */ -@Experimental -class StreamingQueryInfo private[sql]( - val name: String, - val id: Long, - val sourceStatuses: Seq[SourceStatus], - val sinkStatus: SinkStatus) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala index 8a8855d85a4c7..69790e33b2168 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala @@ -84,7 +84,7 @@ object StreamingQueryListener { * @since 2.0.0 */ @Experimental - class QueryStarted private[sql](val queryInfo: StreamingQueryInfo) extends Event + class QueryStarted private[sql](val queryStatus: StreamingQueryStatus) extends Event /** * :: Experimental :: @@ -92,19 +92,19 @@ object StreamingQueryListener { * @since 2.0.0 */ @Experimental - class QueryProgress private[sql](val queryInfo: StreamingQueryInfo) extends Event + class QueryProgress private[sql](val queryStatus: StreamingQueryStatus) extends Event /** * :: Experimental :: * Event representing that termination of a query * - * @param queryInfo Information about the status of the query. + * @param queryStatus Information about the status of the query. * @param exception The exception message of the [[StreamingQuery]] if the query was terminated * with an exception. Otherwise, it will be `None`. * @since 2.0.0 */ @Experimental class QueryTerminated private[sql]( - val queryInfo: StreamingQueryInfo, + val queryStatus: StreamingQueryStatus, val exception: Option[String]) extends Event } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryStatus.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryStatus.scala new file mode 100644 index 0000000000000..47689928730d0 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryStatus.scala @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming + +import java.{util => ju} + +import scala.collection.JavaConverters._ + +import org.apache.spark.annotation.Experimental +import org.apache.spark.sql.execution.streaming.{CompositeOffset, LongOffset} + +/** + * :: Experimental :: + * A class used to report information about the progress of a [[StreamingQuery]]. + * + * @param name Name of the query. This name is unique across all active queries. + * @param id Id of the query. 
This id is unique across + * all queries that have been started in the current process. + * @param timestamp Timestamp (ms) of when this status was generated. + * @param inputRate Current rate (rows/sec) at which data is being generated by all the sources. + * @param processingRate Current rate (rows/sec) at which the query is processing data from + * all the sources. + * @param latency Current average latency between the data being available in source and the sink + * writing the corresponding output. + * @param sourceStatuses Current statuses of the sources. + * @param sinkStatus Current status of the sink. + * @param triggerDetails Low-level details of the currently active trigger (e.g. number of + * rows processed in trigger, latency of intermediate steps, etc.). + * If no trigger is active, then it will have details of the last completed + * trigger. + * @since 2.0.0 + */ +@Experimental +class StreamingQueryStatus private( + val name: String, + val id: Long, + val timestamp: Long, + val inputRate: Double, + val processingRate: Double, + val latency: Option[Double], + val sourceStatuses: Array[SourceStatus], + val sinkStatus: SinkStatus, + val triggerDetails: ju.Map[String, String]) { + + import StreamingQueryStatus._ + + override def toString: String = { + val sourceStatusLines = sourceStatuses.zipWithIndex.map { case (s, i) => + s"Source ${i + 1}:" + indent(s.prettyString) + } + val sinkStatusLines = sinkStatus.prettyString + val triggerDetailsLines = triggerDetails.asScala.map { case (k, v) => s"$k: $v" }.toSeq.sorted + val numSources = sourceStatuses.length + val numSourcesString = s"$numSources source" + { if (numSources > 1) "s" else "" } + + val allLines = s""" + |Query name: $name + |Query id: $id + |Status timestamp: $timestamp + |Input rate: $inputRate rows/sec + |Processing rate: $processingRate rows/sec + |Latency: ${latency.getOrElse("-")} ms + |Trigger details: + |${indent(triggerDetailsLines)} + |Source statuses [$numSourcesString]: + |${indent(sourceStatusLines)} + |Sink status: ${indent(sinkStatusLines)}""".stripMargin + + s"StreamingQueryStatus:${indent(allLines)}" + } +} + +/** Companion object, primarily for creating StreamingQueryStatus instances internally */ +private[sql] object StreamingQueryStatus { + def apply( + name: String, + id: Long, + timestamp: Long, + inputRate: Double, + processingRate: Double, + latency: Option[Double], + sourceStatuses: Array[SourceStatus], + sinkStatus: SinkStatus, + triggerDetails: Map[String, String]): StreamingQueryStatus = { + new StreamingQueryStatus(name, id, timestamp, inputRate, processingRate, + latency, sourceStatuses, sinkStatus, triggerDetails.asJava) + } + + def indent(strings: Iterable[String]): String = strings.map(indent).mkString("\n") + def indent(string: String): String = string.split("\n").map(" " + _).mkString("\n") + + /** Create an instance of status for python testing */ + def testStatus(): StreamingQueryStatus = { + import org.apache.spark.sql.execution.streaming.StreamMetrics._ + StreamingQueryStatus( + name = "query", + id = 1, + timestamp = 123, + inputRate = 15.5, + processingRate = 23.5, + latency = Some(345), + sourceStatuses = Array( + SourceStatus( + desc = "MySource1", + offsetDesc = LongOffset(0).toString, + inputRate = 15.5, + processingRate = 23.5, + triggerDetails = Map( + NUM_SOURCE_INPUT_ROWS -> "100", + SOURCE_GET_OFFSET_LATENCY -> "10", + SOURCE_GET_BATCH_LATENCY -> "20"))), + sinkStatus = SinkStatus( + desc = "MySink", + offsetDesc = CompositeOffset(Some(LongOffset(1)) :: None ::
Nil).toString), + triggerDetails = Map( + TRIGGER_ID -> "5", + IS_TRIGGER_ACTIVE -> "true", + IS_DATA_PRESENT_IN_TRIGGER -> "true", + GET_OFFSET_LATENCY -> "10", + GET_BATCH_LATENCY -> "20", + NUM_INPUT_ROWS -> "100" + )) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index bba40c6510cfb..229d8814e0143 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.metric import org.apache.spark.SparkFunSuite import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.execution.SparkPlanInfo import org.apache.spark.sql.execution.ui.SparkPlanGraph import org.apache.spark.sql.functions._ @@ -85,6 +86,22 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { } } + test("LocalTableScanExec computes metrics in collect and take") { + val df1 = spark.createDataset(Seq(1, 2, 3)) + val logical = df1.queryExecution.logical + require(logical.isInstanceOf[LocalRelation]) + df1.collect() + val metrics1 = df1.queryExecution.executedPlan.collectLeaves().head.metrics + assert(metrics1.contains("numOutputRows")) + assert(metrics1("numOutputRows").value === 3) + + val df2 = spark.createDataset(Seq(1, 2, 3)).limit(2) + df2.collect() + val metrics2 = df2.queryExecution.executedPlan.collectLeaves().head.metrics + assert(metrics2.contains("numOutputRows")) + assert(metrics2("numOutputRows").value === 2) + } + test("Filter metrics") { // Assume the execution plan is // PhysicalRDD(nodeId = 1) -> Filter(nodeId = 0) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/StreamMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/StreamMetricsSuite.scala new file mode 100644 index 0000000000000..938423db64745 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/StreamMetricsSuite.scala @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.streaming + +import org.scalactic.TolerantNumerics + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.util.ManualClock + +class StreamMetricsSuite extends SparkFunSuite { + import StreamMetrics._ + + // To make === between double tolerate inexact values + implicit val doubleEquality = TolerantNumerics.tolerantDoubleEquality(0.01) + + test("rates, latencies, trigger details - basic life cycle") { + val sm = newStreamMetrics(source) + assert(sm.currentInputRate() === 0.0) + assert(sm.currentProcessingRate() === 0.0) + assert(sm.currentSourceInputRate(source) === 0.0) + assert(sm.currentSourceProcessingRate(source) === 0.0) + assert(sm.currentLatency() === None) + assert(sm.currentTriggerDetails().isEmpty) + + // When trigger started, the rates should not change, but should return + // reported trigger details + sm.reportTriggerStarted(1) + sm.reportTriggerDetail("key", "value") + sm.reportSourceTriggerDetail(source, "key2", "value2") + assert(sm.currentInputRate() === 0.0) + assert(sm.currentProcessingRate() === 0.0) + assert(sm.currentSourceInputRate(source) === 0.0) + assert(sm.currentSourceProcessingRate(source) === 0.0) + assert(sm.currentLatency() === None) + assert(sm.currentTriggerDetails() === + Map(TRIGGER_ID -> "1", IS_TRIGGER_ACTIVE -> "true", + START_TIMESTAMP -> "0", "key" -> "value")) + assert(sm.currentSourceTriggerDetails(source) === + Map(TRIGGER_ID -> "1", "key2" -> "value2")) + + // Finishing the trigger should calculate the rates, except input rate which needs + // to have another trigger interval + sm.reportNumInputRows(Map(source -> 100L)) // 100 input rows, 10 output rows + clock.advance(1000) + sm.reportTriggerFinished() + assert(sm.currentInputRate() === 0.0) + assert(sm.currentProcessingRate() === 100.0) // 100 input rows processed in 1 sec + assert(sm.currentSourceInputRate(source) === 0.0) + assert(sm.currentSourceProcessingRate(source) === 100.0) + assert(sm.currentLatency() === None) + assert(sm.currentTriggerDetails() === + Map(TRIGGER_ID -> "1", IS_TRIGGER_ACTIVE -> "false", + START_TIMESTAMP -> "0", FINISH_TIMESTAMP -> "1000", + NUM_INPUT_ROWS -> "100", "key" -> "value")) + assert(sm.currentSourceTriggerDetails(source) === + Map(TRIGGER_ID -> "1", NUM_SOURCE_INPUT_ROWS -> "100", "key2" -> "value2")) + + // After another trigger starts, the rates and latencies should not change until + // new rows are reported + clock.advance(1000) + sm.reportTriggerStarted(2) + assert(sm.currentInputRate() === 0.0) + assert(sm.currentProcessingRate() === 100.0) + assert(sm.currentSourceInputRate(source) === 0.0) + assert(sm.currentSourceProcessingRate(source) === 100.0) + assert(sm.currentLatency() === None) + + // Reporting new rows should update the rates and latencies + sm.reportNumInputRows(Map(source -> 200L)) // 200 input rows + clock.advance(500) + sm.reportTriggerFinished() + assert(sm.currentInputRate() === 100.0) // 200 input rows generated in 2 seconds b/w starts + assert(sm.currentProcessingRate() === 400.0) // 200 output rows processed in 0.5 sec + assert(sm.currentSourceInputRate(source) === 100.0) + assert(sm.currentSourceProcessingRate(source) === 400.0) + assert(sm.currentLatency().get === 1500.0) // 2000 ms / 2 + 500 ms + + // Rates should be set to 0 after stop + sm.stop() + assert(sm.currentInputRate() === 0.0) + assert(sm.currentProcessingRate() === 0.0) + assert(sm.currentSourceInputRate(source) 
=== 0.0) + assert(sm.currentSourceProcessingRate(source) === 0.0) + assert(sm.currentLatency() === None) + assert(sm.currentTriggerDetails().isEmpty) + } + + test("rates and latencies - after trigger with no data") { + val sm = newStreamMetrics(source) + // Trigger 1 with data + sm.reportTriggerStarted(1) + sm.reportNumInputRows(Map(source -> 100L)) // 100 input rows + clock.advance(1000) + sm.reportTriggerFinished() + + // Trigger 2 with data + clock.advance(1000) + sm.reportTriggerStarted(2) + sm.reportNumInputRows(Map(source -> 200L)) // 200 input rows + clock.advance(500) + sm.reportTriggerFinished() + + // Make sure that all rates are set + require(sm.currentInputRate() === 100.0) // 200 input rows generated in 2 seconds b/w starts + require(sm.currentProcessingRate() === 400.0) // 200 output rows processed in 0.5 sec + require(sm.currentSourceInputRate(source) === 100.0) + require(sm.currentSourceProcessingRate(source) === 400.0) + require(sm.currentLatency().get === 1500.0) // 2000 ms / 2 + 500 ms + + // Trigger 3 with no data + clock.advance(500) + sm.reportTriggerStarted(3) + clock.advance(500) + sm.reportTriggerFinished() + + // Rates are set to zero and latency is set to None + assert(sm.currentInputRate() === 0.0) + assert(sm.currentProcessingRate() === 0.0) + assert(sm.currentSourceInputRate(source) === 0.0) + assert(sm.currentSourceProcessingRate(source) === 0.0) + assert(sm.currentLatency() === None) + sm.stop() + } + + test("rates - after trigger with multiple sources, and one source having no info") { + val source1 = TestSource(1) + val source2 = TestSource(2) + val sm = newStreamMetrics(source1, source2) + // Trigger 1 with data + sm.reportTriggerStarted(1) + sm.reportNumInputRows(Map(source1 -> 100L, source2 -> 100L)) + clock.advance(1000) + sm.reportTriggerFinished() + + // Trigger 2 with data + clock.advance(1000) + sm.reportTriggerStarted(2) + sm.reportNumInputRows(Map(source1 -> 200L, source2 -> 200L)) + clock.advance(500) + sm.reportTriggerFinished() + + // Make sure that all rates are set + assert(sm.currentInputRate() === 200.0) // 200*2 input rows generated in 2 seconds b/w starts + assert(sm.currentProcessingRate() === 800.0) // 200*2 output rows processed in 0.5 sec + assert(sm.currentSourceInputRate(source1) === 100.0) + assert(sm.currentSourceInputRate(source2) === 100.0) + assert(sm.currentSourceProcessingRate(source1) === 400.0) + assert(sm.currentSourceProcessingRate(source2) === 400.0) + + // Trigger 3 with data from source1 only + clock.advance(500) + sm.reportTriggerStarted(3) + clock.advance(500) + sm.reportNumInputRows(Map(source1 -> 200L)) + sm.reportTriggerFinished() + + // Rates for source2 (which reported no data) drop to zero; source1 rates are updated + assert(sm.currentInputRate() === 200.0) + assert(sm.currentProcessingRate() === 400.0) + assert(sm.currentSourceInputRate(source1) === 200.0) + assert(sm.currentSourceInputRate(source2) === 0.0) + assert(sm.currentSourceProcessingRate(source1) === 400.0) + assert(sm.currentSourceProcessingRate(source2) === 0.0) + sm.stop() + } + + test("registered Codahale metrics") { + import scala.collection.JavaConverters._ + val sm = newStreamMetrics(source) + val gaugeNames = sm.metricRegistry.getGauges().keySet().asScala + + // so that all metrics are considered as a single metric group in Ganglia + assert(!gaugeNames.exists(_.contains("."))) + assert(gaugeNames === Set( + "inputRate-total", + "inputRate-source0", + "processingRate-total", + "processingRate-source0", + "latency")) + } + + private def newStreamMetrics(sources: Source*): StreamMetrics = { + new
StreamMetrics(sources.toSet, clock, "test") + } + + private val clock = new ManualClock() + private val source = TestSource(0) + + case class TestSource(id: Int) extends Source { + override def schema: StructType = StructType(Array.empty[StructField]) + override def getOffset: Option[Offset] = Some(new LongOffset(0)) + override def getBatch(start: Option[Offset], end: Offset): DataFrame = { null } + override def stop() {} + override def toString(): String = s"source$id" + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/TextSocketStreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/TextSocketStreamSuite.scala index 6b0ba7acb4804..5174a0415304c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/TextSocketStreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/TextSocketStreamSuite.scala @@ -156,6 +156,30 @@ class TextSocketStreamSuite extends StreamTest with SharedSQLContext with Before } } + test("input row metrics") { + serverThread = new ServerThread() + serverThread.start() + + val provider = new TextSocketSourceProvider + val parameters = Map("host" -> "localhost", "port" -> serverThread.port.toString) + source = provider.createSource(sqlContext, "", None, "", parameters) + + failAfter(streamingTimeout) { + serverThread.enqueue("hello") + while (source.getOffset.isEmpty) { + Thread.sleep(10) + } + val batch = source.getBatch(None, source.getOffset.get).as[String] + batch.collect() + val numRowsMetric = + batch.queryExecution.executedPlan.collectLeaves().head.metrics.get("numOutputRows") + assert(numRowsMetric.nonEmpty) + assert(numRowsMetric.get.value === 1) + source.stop() + source = null + } + } + private class ServerThread extends Thread with Logging { private val serverSocket = new ServerSocket(0) private val messageQueue = new LinkedBlockingQueue[String]() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala index 984b84fd13fbd..06f1bd6c3bcc7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala @@ -74,6 +74,7 @@ class StateStoreSuite extends SparkFunSuite with BeforeAndAfter with PrivateMeth // Verify state after updating put(store, "a", 1) + assert(store.numKeys() === 1) intercept[IllegalStateException] { store.iterator() } @@ -85,7 +86,9 @@ class StateStoreSuite extends SparkFunSuite with BeforeAndAfter with PrivateMeth // Make updates, commit and then verify state put(store, "b", 2) put(store, "aa", 3) + assert(store.numKeys() === 3) remove(store, _.startsWith("a")) + assert(store.numKeys() === 1) assert(store.commit() === 1) assert(store.hasCommitted) @@ -107,7 +110,9 @@ class StateStoreSuite extends SparkFunSuite with BeforeAndAfter with PrivateMeth val reloadedProvider = new HDFSBackedStateStoreProvider( store.id, keySchema, valueSchema, StateStoreConf.empty, new Configuration) val reloadedStore = reloadedProvider.getStore(1) + assert(reloadedStore.numKeys() === 1) put(reloadedStore, "c", 4) + assert(reloadedStore.numKeys() === 2) assert(reloadedStore.commit() === 2) assert(rowsToSet(reloadedStore.iterator()) === Set("b" -> 2, "c" -> 4)) assert(getDataFromFiles(provider) === Set("b" -> 2, "c" -> 4)) diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index 7f9c981a4e9c9..aabdccaaf319d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -998,6 +998,20 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } } } + + test("input row metrics") { + withTempDirs { case (src, tmp) => + val input = spark.readStream.format("text").load(src.getCanonicalPath) + testStream(input)( + AddTextFileData("100", src, tmp), + CheckAnswer("100"), + AssertOnLastQueryStatus { status => + assert(status.triggerDetails.get("numRows.input.total") === "1") + assert(status.sourceStatuses(0).processingRate > 0.0) + } + ) + } + } } class FileStreamSourceStressTestSuite extends FileStreamSourceTest { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index fa13d385cce75..3b9d3786349ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -28,6 +28,8 @@ import scala.util.control.NonFatal import org.scalatest.Assertions import org.scalatest.concurrent.{Eventually, Timeouts} +import org.scalatest.concurrent.AsyncAssertions.Waiter +import org.scalatest.concurrent.Eventually._ import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.exceptions.TestFailedDueToTimeoutException import org.scalatest.time.Span @@ -38,6 +40,7 @@ import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, Ro import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.streaming.StreamingQueryListener._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.{Clock, ManualClock, SystemClock, Utils} @@ -198,6 +201,10 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { } } + case class AssertOnLastQueryStatus(condition: StreamingQueryStatus => Unit) + extends StreamAction + + /** * Executes the specified actions on the given streaming DataFrame and provides helpful * error messages in the case of failures or incorrect answers. 
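As an illustration of the status API exercised by the tests above, here is a minimal, hypothetical monitoring snippet (not part of the patch). The socket source, its host/port options, and the console sink are assumptions chosen only to make the example self-contained; the fields read here (inputRate, processingRate, sourceStatuses, offsetDesc) are the ones introduced by the diffs in this patch.

import org.apache.spark.sql.SparkSession

object QueryStatusExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("query-status-example").getOrCreate()

    // Any streaming source would do; a socket source is assumed here for illustration.
    val lines = spark.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", "9999")
      .load()

    val query = lines.writeStream
      .format("console")
      .start()

    // Periodically poll the aggregate status and the per-source statuses added by this patch.
    while (query.isActive) {
      val status = query.status
      println(s"input: ${status.inputRate} rows/sec, processing: ${status.processingRate} rows/sec")
      status.sourceStatuses.foreach { s =>
        println(s"  ${s.description} at offset ${s.offsetDesc}")
      }
      Thread.sleep(1000)
    }
  }
}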
@@ -299,9 +306,12 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { val testThread = Thread.currentThread() val metadataRoot = Utils.createTempDir(namePrefix = "streaming.metadata").getCanonicalPath + val statusCollector = new QueryStatusCollector try { + spark.streams.addListener(statusCollector) startedTest.foreach { action => + logInfo(s"Processing test stream action: $action") action match { case StartStream(trigger, triggerClock) => verify(currentStream == null, "stream already running") @@ -399,6 +409,13 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { val streamToAssert = Option(currentStream).getOrElse(lastStream) verify({ a.run(); true }, s"Assert failed: ${a.message}") + case a: AssertOnLastQueryStatus => + Eventually.eventually(timeout(streamingTimeout)) { + require(statusCollector.lastTriggerStatus.nonEmpty) + } + val status = statusCollector.lastTriggerStatus.get + verify({ a.condition(status); true }, "Assert on last query status failed") + case a: AddData => try { // Add data and get the source where it was added, and the expected offset of the @@ -473,6 +490,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { if (currentStream != null && currentStream.microBatchThread.isAlive) { currentStream.stop() } + spark.streams.removeListener(statusCollector) } } @@ -606,4 +624,58 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { } } } + + + class QueryStatusCollector extends StreamingQueryListener { + // to catch errors in the async listener events + @volatile private var asyncTestWaiter = new Waiter + + @volatile var startStatus: StreamingQueryStatus = null + @volatile var terminationStatus: StreamingQueryStatus = null + @volatile var terminationException: Option[String] = null + + private val progressStatuses = new mutable.ArrayBuffer[StreamingQueryStatus] + + /** Get the info of the last trigger that processed data */ + def lastTriggerStatus: Option[StreamingQueryStatus] = synchronized { + progressStatuses.filter { i => + i.triggerDetails.get("isTriggerActive").toBoolean == false && + i.triggerDetails.get("isDataPresentInTrigger").toBoolean == true + }.lastOption + } + + def reset(): Unit = { + startStatus = null + terminationStatus = null + progressStatuses.clear() + asyncTestWaiter = new Waiter + } + + def checkAsyncErrors(): Unit = { + asyncTestWaiter.await(timeout(10 seconds)) + } + + + override def onQueryStarted(queryStarted: QueryStarted): Unit = { + asyncTestWaiter { + startStatus = queryStarted.queryStatus + } + } + + override def onQueryProgress(queryProgress: QueryProgress): Unit = { + asyncTestWaiter { + assert(startStatus != null, "onQueryProgress called before onQueryStarted") + synchronized { progressStatuses += queryProgress.queryStatus } + } + } + + override def onQueryTerminated(queryTerminated: QueryTerminated): Unit = { + asyncTestWaiter { + assert(startStatus != null, "onQueryTerminated called before onQueryStarted") + terminationStatus = queryTerminated.queryStatus + terminationException = queryTerminated.exception + } + asyncTestWaiter.dismiss() + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index 8681199817fe6..e59b5491f90b6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -22,6 +22,7 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkException import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.InternalOutputModes._ +import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.expressions.scalalang.typed @@ -129,6 +130,59 @@ class StreamingAggregationSuite extends StreamTest with BeforeAndAfterAll { ) } + test("state metrics") { + val inputData = MemoryStream[Int] + + val aggregated = + inputData.toDS() + .flatMap(x => Seq(x, x + 1)) + .toDF("value") + .groupBy($"value") + .agg(count("*")) + .as[(Int, Long)] + + implicit class RichStreamExecution(query: StreamExecution) { + def stateNodes: Seq[SparkPlan] = { + query.lastExecution.executedPlan.collect { + case p if p.isInstanceOf[StateStoreSaveExec] => p + } + } + } + + // Test with Update mode + testStream(aggregated, Update)( + AddData(inputData, 1), + CheckLastBatch((1, 1), (2, 1)), + AssertOnQuery { _.stateNodes.size === 1 }, + AssertOnQuery { _.stateNodes.head.metrics.get("numOutputRows").get.value === 2 }, + AssertOnQuery { _.stateNodes.head.metrics.get("numUpdatedStateRows").get.value === 2 }, + AssertOnQuery { _.stateNodes.head.metrics.get("numTotalStateRows").get.value === 2 }, + AddData(inputData, 2, 3), + CheckLastBatch((2, 2), (3, 2), (4, 1)), + AssertOnQuery { _.stateNodes.size === 1 }, + AssertOnQuery { _.stateNodes.head.metrics.get("numOutputRows").get.value === 3 }, + AssertOnQuery { _.stateNodes.head.metrics.get("numUpdatedStateRows").get.value === 3 }, + AssertOnQuery { _.stateNodes.head.metrics.get("numTotalStateRows").get.value === 4 } + ) + + // Test with Complete mode + inputData.reset() + testStream(aggregated, Complete)( + AddData(inputData, 1), + CheckLastBatch((1, 1), (2, 1)), + AssertOnQuery { _.stateNodes.size === 1 }, + AssertOnQuery { _.stateNodes.head.metrics.get("numOutputRows").get.value === 2 }, + AssertOnQuery { _.stateNodes.head.metrics.get("numUpdatedStateRows").get.value === 2 }, + AssertOnQuery { _.stateNodes.head.metrics.get("numTotalStateRows").get.value === 2 }, + AddData(inputData, 2, 3), + CheckLastBatch((1, 1), (2, 2), (3, 2), (4, 1)), + AssertOnQuery { _.stateNodes.size === 1 }, + AssertOnQuery { _.stateNodes.head.metrics.get("numOutputRows").get.value === 4 }, + AssertOnQuery { _.stateNodes.head.metrics.get("numUpdatedStateRows").get.value === 3 }, + AssertOnQuery { _.stateNodes.head.metrics.get("numTotalStateRows").get.value === 4 } + ) + } + test("multiple keys") { val inputData = MemoryStream[Int] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index 831543a47420a..6256385dfd0e4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -17,92 +17,97 @@ package org.apache.spark.sql.streaming -import java.util.concurrent.ConcurrentLinkedQueue - +import org.scalactic.TolerantNumerics import org.scalatest.BeforeAndAfter import org.scalatest.PrivateMethodTester._ -import org.scalatest.concurrent.AsyncAssertions.Waiter -import org.scalatest.concurrent.Eventually._ -import 
org.scalatest.concurrent.PatienceConfiguration.Timeout -import org.scalatest.time.SpanSugar._ import org.apache.spark.SparkException +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.util.JsonProtocol +import org.apache.spark.sql.functions._ +import org.apache.spark.util.{JsonProtocol, ManualClock} class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { import testImplicits._ - import StreamingQueryListener._ + import StreamingQueryListenerSuite._ + + // To make === between double tolerate inexact values + implicit val doubleEquality = TolerantNumerics.tolerantDoubleEquality(0.01) after { spark.streams.active.foreach(_.stop()) assert(spark.streams.active.isEmpty) assert(addedListeners.isEmpty) // Make sure we don't leak any events to the next test - spark.sparkContext.listenerBus.waitUntilEmpty(10000) } - test("single listener") { - val listener = new QueryStatusCollector - val input = MemoryStream[Int] - withListenerAdded(listener) { - testStream(input.toDS)( - StartStream(), - AssertOnQuery("Incorrect query status in onQueryStarted") { query => - val status = listener.startStatus - assert(status != null) - assert(status.name === query.name) - assert(status.id === query.id) - assert(status.sourceStatuses.size === 1) - assert(status.sourceStatuses(0).description.contains("Memory")) - - // The source and sink offsets must be None as this must be called before the - // batches have started - assert(status.sourceStatuses(0).offsetDesc === None) - assert(status.sinkStatus.offsetDesc === CompositeOffset(None :: Nil).toString) - - // No progress events or termination events - assert(listener.progressStatuses.isEmpty) - assert(listener.terminationStatus === null) - true - }, - AddDataMemory(input, Seq(1, 2, 3)), - CheckAnswer(1, 2, 3), - AssertOnQuery("Incorrect query status in onQueryProgress") { query => - eventually(Timeout(streamingTimeout)) { + test("single listener, check trigger statuses") { + import StreamingQueryListenerSuite._ + clock = new ManualClock() + + /** Custom MemoryStream that waits for manual clock to reach a time */ + val inputData = new MemoryStream[Int](0, sqlContext) { + // Wait for manual clock to be 100 first time there is data + override def getOffset: Option[Offset] = { + val offset = super.getOffset + if (offset.nonEmpty) { + clock.waitTillTime(100) + } + offset + } - // There should be only on progress event as batch has been processed - assert(listener.progressStatuses.size === 1) - val status = listener.progressStatuses.peek() - assert(status != null) - assert(status.name === query.name) - assert(status.id === query.id) - assert(status.sourceStatuses(0).offsetDesc === Some(LongOffset(0).toString)) - assert(status.sinkStatus.offsetDesc === CompositeOffset.fill(LongOffset(0)).toString) + // Wait for manual clock to be 300 first time there is data + override def getBatch(start: Option[Offset], end: Offset): DataFrame = { + clock.waitTillTime(300) + super.getBatch(start, end) + } + } - // No termination events - assert(listener.terminationStatus === null) - } - true - }, - StopStream, - AssertOnQuery("Incorrect query status in onQueryTerminated") { query => - eventually(Timeout(streamingTimeout)) { - val status = listener.terminationStatus - assert(status != null) - assert(status.name === query.name) - assert(status.id === query.id) - assert(status.sourceStatuses(0).offsetDesc === Some(LongOffset(0).toString)) - assert(status.sinkStatus.offsetDesc === 
CompositeOffset.fill(LongOffset(0)).toString) - assert(listener.terminationException === None) - } - listener.checkAsyncErrors() - true - } - ) + // This is to make sure thatquery waits for manual clock to be 600 first time there is data + val mapped = inputData.toDS().agg(count("*")).as[Long].coalesce(1).map { x => + clock.waitTillTime(600) + x } + + testStream(mapped, OutputMode.Complete)( + StartStream(triggerClock = clock), + AddData(inputData, 1, 2), + AdvanceManualClock(100), // unblock getOffset, will block on getBatch + AdvanceManualClock(200), // unblock getBatch, will block on computation + AdvanceManualClock(300), // unblock computation + AssertOnQuery { _ => clock.getTimeMillis() === 600 }, + AssertOnLastQueryStatus { status: StreamingQueryStatus => + // Check the correctness of the trigger info of the last completed batch reported by + // onQueryProgress + assert(status.triggerDetails.get("triggerId") == "0") + assert(status.triggerDetails.get("isTriggerActive") === "false") + assert(status.triggerDetails.get("isDataPresentInTrigger") === "true") + + assert(status.triggerDetails.get("timestamp.triggerStart") === "0") + assert(status.triggerDetails.get("timestamp.afterGetOffset") === "100") + assert(status.triggerDetails.get("timestamp.afterGetBatch") === "300") + assert(status.triggerDetails.get("timestamp.triggerFinish") === "600") + + assert(status.triggerDetails.get("latency.getOffset.total") === "100") + assert(status.triggerDetails.get("latency.getBatch.total") === "200") + assert(status.triggerDetails.get("latency.optimizer") === "0") + assert(status.triggerDetails.get("latency.offsetLogWrite") === "0") + assert(status.triggerDetails.get("latency.fullTrigger") === "600") + + assert(status.triggerDetails.get("numRows.input.total") === "2") + assert(status.triggerDetails.get("numRows.state.aggregation1.total") === "1") + assert(status.triggerDetails.get("numRows.state.aggregation1.updated") === "1") + + assert(status.sourceStatuses.length === 1) + assert(status.sourceStatuses(0).triggerDetails.get("triggerId") === "0") + assert(status.sourceStatuses(0).triggerDetails.get("latency.getOffset.source") === "100") + assert(status.sourceStatuses(0).triggerDetails.get("latency.getBatch.source") === "200") + assert(status.sourceStatuses(0).triggerDetails.get("numRows.input.source") === "2") + }, + CheckAnswer(2) + ) } test("adding and removing listener") { @@ -172,56 +177,37 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { } test("QueryStarted serialization") { - val queryStartedInfo = new StreamingQueryInfo( - "name", - 1, - Seq(new SourceStatus("source1", None), new SourceStatus("source2", None)), - new SinkStatus("sink", CompositeOffset(None :: None :: Nil).toString)) - val queryStarted = new StreamingQueryListener.QueryStarted(queryStartedInfo) + val queryStarted = new StreamingQueryListener.QueryStarted(StreamingQueryStatus.testStatus) val json = JsonProtocol.sparkEventToJson(queryStarted) val newQueryStarted = JsonProtocol.sparkEventFromJson(json) .asInstanceOf[StreamingQueryListener.QueryStarted] - assertStreamingQueryInfoEquals(queryStarted.queryInfo, newQueryStarted.queryInfo) + assertStreamingQueryInfoEquals(queryStarted.queryStatus, newQueryStarted.queryStatus) } test("QueryProgress serialization") { - val queryProcessInfo = new StreamingQueryInfo( - "name", - 1, - Seq( - new SourceStatus("source1", Some(LongOffset(0).toString)), - new SourceStatus("source2", Some(LongOffset(1).toString))), - new SinkStatus("sink", new 
CompositeOffset(Array(None, Some(LongOffset(1)))).toString)) - val queryProcess = new StreamingQueryListener.QueryProgress(queryProcessInfo) + val queryProcess = new StreamingQueryListener.QueryProgress(StreamingQueryStatus.testStatus) val json = JsonProtocol.sparkEventToJson(queryProcess) val newQueryProcess = JsonProtocol.sparkEventFromJson(json) .asInstanceOf[StreamingQueryListener.QueryProgress] - assertStreamingQueryInfoEquals(queryProcess.queryInfo, newQueryProcess.queryInfo) + assertStreamingQueryInfoEquals(queryProcess.queryStatus, newQueryProcess.queryStatus) } test("QueryTerminated serialization") { - val queryTerminatedInfo = new StreamingQueryInfo( - "name", - 1, - Seq( - new SourceStatus("source1", Some(LongOffset(0).toString)), - new SourceStatus("source2", Some(LongOffset(1).toString))), - new SinkStatus("sink", new CompositeOffset(Array(None, Some(LongOffset(1)))).toString)) val exception = new RuntimeException("exception") val queryQueryTerminated = new StreamingQueryListener.QueryTerminated( - queryTerminatedInfo, + StreamingQueryStatus.testStatus, Some(exception.getMessage)) val json = JsonProtocol.sparkEventToJson(queryQueryTerminated) val newQueryTerminated = JsonProtocol.sparkEventFromJson(json) .asInstanceOf[StreamingQueryListener.QueryTerminated] - assertStreamingQueryInfoEquals(queryQueryTerminated.queryInfo, newQueryTerminated.queryInfo) + assertStreamingQueryInfoEquals(queryQueryTerminated.queryStatus, newQueryTerminated.queryStatus) assert(queryQueryTerminated.exception === newQueryTerminated.exception) } private def assertStreamingQueryInfoEquals( - expected: StreamingQueryInfo, - actual: StreamingQueryInfo): Unit = { + expected: StreamingQueryStatus, + actual: StreamingQueryStatus): Unit = { assert(expected.name === actual.name) assert(expected.sourceStatuses.size === actual.sourceStatuses.size) expected.sourceStatuses.zip(actual.sourceStatuses).foreach { @@ -243,7 +229,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { private def withListenerAdded(listener: StreamingQueryListener)(body: => Unit): Unit = { try { - failAfter(1 minute) { + failAfter(streamingTimeout) { spark.streams.addListener(listener) body } @@ -258,49 +244,9 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { val listenerBus = spark.streams invokePrivate listenerBusMethod() listenerBus.listeners.toArray.map(_.asInstanceOf[StreamingQueryListener]) } +} - class QueryStatusCollector extends StreamingQueryListener { - // to catch errors in the async listener events - @volatile private var asyncTestWaiter = new Waiter - - @volatile var startStatus: StreamingQueryInfo = null - @volatile var terminationStatus: StreamingQueryInfo = null - @volatile var terminationException: Option[String] = null - - val progressStatuses = new ConcurrentLinkedQueue[StreamingQueryInfo] - - def reset(): Unit = { - startStatus = null - terminationStatus = null - progressStatuses.clear() - asyncTestWaiter = new Waiter - } - - def checkAsyncErrors(): Unit = { - asyncTestWaiter.await(timeout(streamingTimeout)) - } - - - override def onQueryStarted(queryStarted: QueryStarted): Unit = { - asyncTestWaiter { - startStatus = queryStarted.queryInfo - } - } - - override def onQueryProgress(queryProgress: QueryProgress): Unit = { - asyncTestWaiter { - assert(startStatus != null, "onQueryProgress called before onQueryStarted") - progressStatuses.add(queryProgress.queryInfo) - } - } - - override def onQueryTerminated(queryTerminated: QueryTerminated): Unit = { - asyncTestWaiter 
{ - assert(startStatus != null, "onQueryTerminated called before onQueryStarted") - terminationStatus = queryTerminated.queryInfo - terminationException = queryTerminated.exception - } - asyncTestWaiter.dismiss() - } - } +object StreamingQueryListenerSuite { + // Singleton reference to clock that does not get serialized in task closures + @volatile var clock: ManualClock = null } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 88f1f188ab2af..9f8e2db966367 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -17,18 +17,27 @@ package org.apache.spark.sql.streaming +import org.scalactic.TolerantNumerics +import org.scalatest.concurrent.Eventually._ import org.scalatest.BeforeAndAfter +import org.apache.spark.internal.Logging +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.streaming.StreamingQueryListener._ +import org.apache.spark.sql.types.StructType import org.apache.spark.SparkException -import org.apache.spark.sql.execution.streaming.{CompositeOffset, LongOffset, MemoryStream, StreamExecution} +import org.apache.spark.sql.execution.streaming._ import org.apache.spark.util.Utils -class StreamingQuerySuite extends StreamTest with BeforeAndAfter { +class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging { import AwaitTerminationTester._ import testImplicits._ + // To make === between double tolerate inexact values + implicit val doubleEquality = TolerantNumerics.tolerantDoubleEquality(0.01) + after { sqlContext.streams.active.foreach(_.stop()) } @@ -100,31 +109,145 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter { ) } - testQuietly("source and sink statuses") { + testQuietly("query statuses") { val inputData = MemoryStream[Int] val mapped = inputData.toDS().map(6 / _) - testStream(mapped)( - AssertOnQuery(_.sourceStatuses.length === 1), + AssertOnQuery(q => q.status.name === q.name), + AssertOnQuery(q => q.status.id === q.id), + AssertOnQuery(_.status.timestamp <= System.currentTimeMillis), + AssertOnQuery(_.status.inputRate === 0.0), + AssertOnQuery(_.status.processingRate === 0.0), + AssertOnQuery(_.status.sourceStatuses.length === 1), + AssertOnQuery(_.status.sourceStatuses(0).description.contains("Memory")), + AssertOnQuery(_.status.sourceStatuses(0).offsetDesc === "-"), + AssertOnQuery(_.status.sourceStatuses(0).inputRate === 0.0), + AssertOnQuery(_.status.sourceStatuses(0).processingRate === 0.0), + AssertOnQuery(_.status.sinkStatus.description.contains("Memory")), + AssertOnQuery(_.status.sinkStatus.offsetDesc === CompositeOffset(None :: Nil).toString), AssertOnQuery(_.sourceStatuses(0).description.contains("Memory")), - AssertOnQuery(_.sourceStatuses(0).offsetDesc === None), + AssertOnQuery(_.sourceStatuses(0).offsetDesc === "-"), + AssertOnQuery(_.sourceStatuses(0).inputRate === 0.0), + AssertOnQuery(_.sourceStatuses(0).processingRate === 0.0), AssertOnQuery(_.sinkStatus.description.contains("Memory")), AssertOnQuery(_.sinkStatus.offsetDesc === new CompositeOffset(None :: Nil).toString), + AddData(inputData, 1, 2), CheckAnswer(6, 3), - AssertOnQuery(_.sourceStatuses(0).offsetDesc === Some(LongOffset(0).toString)), + AssertOnQuery(_.status.timestamp <= System.currentTimeMillis), + AssertOnQuery(_.status.inputRate >= 0.0), + AssertOnQuery(_.status.processingRate 
>= 0.0), + AssertOnQuery(_.status.sourceStatuses.length === 1), + AssertOnQuery(_.status.sourceStatuses(0).description.contains("Memory")), + AssertOnQuery(_.status.sourceStatuses(0).offsetDesc === LongOffset(0).toString), + AssertOnQuery(_.status.sourceStatuses(0).inputRate >= 0.0), + AssertOnQuery(_.status.sourceStatuses(0).processingRate >= 0.0), + AssertOnQuery(_.status.sinkStatus.description.contains("Memory")), + AssertOnQuery(_.status.sinkStatus.offsetDesc === + CompositeOffset.fill(LongOffset(0)).toString), + AssertOnQuery(_.sourceStatuses(0).offsetDesc === LongOffset(0).toString), + AssertOnQuery(_.sourceStatuses(0).inputRate >= 0.0), + AssertOnQuery(_.sourceStatuses(0).processingRate >= 0.0), AssertOnQuery(_.sinkStatus.offsetDesc === CompositeOffset.fill(LongOffset(0)).toString), + AddData(inputData, 1, 2), CheckAnswer(6, 3, 6, 3), - AssertOnQuery(_.sourceStatuses(0).offsetDesc === Some(LongOffset(1).toString)), + AssertOnQuery(_.status.sourceStatuses(0).offsetDesc === LongOffset(1).toString), + AssertOnQuery(_.status.sinkStatus.offsetDesc === + CompositeOffset.fill(LongOffset(1)).toString), + AssertOnQuery(_.sourceStatuses(0).offsetDesc === LongOffset(1).toString), AssertOnQuery(_.sinkStatus.offsetDesc === CompositeOffset.fill(LongOffset(1)).toString), + + StopStream, + AssertOnQuery(_.status.inputRate === 0.0), + AssertOnQuery(_.status.processingRate === 0.0), + AssertOnQuery(_.status.sourceStatuses.length === 1), + AssertOnQuery(_.status.sourceStatuses(0).offsetDesc === LongOffset(1).toString), + AssertOnQuery(_.status.sourceStatuses(0).inputRate === 0.0), + AssertOnQuery(_.status.sourceStatuses(0).processingRate === 0.0), + AssertOnQuery(_.status.sinkStatus.offsetDesc === + CompositeOffset.fill(LongOffset(1)).toString), + AssertOnQuery(_.sourceStatuses(0).offsetDesc === LongOffset(1).toString), + AssertOnQuery(_.sourceStatuses(0).inputRate === 0.0), + AssertOnQuery(_.sourceStatuses(0).processingRate === 0.0), + AssertOnQuery(_.sinkStatus.offsetDesc === CompositeOffset.fill(LongOffset(1)).toString), + AssertOnQuery(_.status.triggerDetails.isEmpty), + + StartStream(), AddData(inputData, 0), ExpectFailure[SparkException], - AssertOnQuery(_.sourceStatuses(0).offsetDesc === Some(LongOffset(2).toString)), + AssertOnQuery(_.status.inputRate === 0.0), + AssertOnQuery(_.status.processingRate === 0.0), + AssertOnQuery(_.status.sourceStatuses.length === 1), + AssertOnQuery(_.status.sourceStatuses(0).offsetDesc === LongOffset(2).toString), + AssertOnQuery(_.status.sourceStatuses(0).inputRate === 0.0), + AssertOnQuery(_.status.sourceStatuses(0).processingRate === 0.0), + AssertOnQuery(_.status.sinkStatus.offsetDesc === + CompositeOffset.fill(LongOffset(1)).toString), + AssertOnQuery(_.sourceStatuses(0).offsetDesc === LongOffset(2).toString), + AssertOnQuery(_.sourceStatuses(0).inputRate === 0.0), + AssertOnQuery(_.sourceStatuses(0).processingRate === 0.0), AssertOnQuery(_.sinkStatus.offsetDesc === CompositeOffset.fill(LongOffset(1)).toString) ) } + test("codahale metrics") { + val inputData = MemoryStream[Int] + + /** Whether metrics of a query is registered for reporting */ + def isMetricsRegistered(query: StreamingQuery): Boolean = { + val sourceName = s"StructuredStreaming.${query.name}" + val sources = spark.sparkContext.env.metricsSystem.getSourcesByName(sourceName) + require(sources.size <= 1) + sources.nonEmpty + } + // Disabled by default + assert(spark.conf.get("spark.sql.streaming.metricsEnabled").toBoolean === false) + + withSQLConf("spark.sql.streaming.metricsEnabled" -> 
"false") { + testStream(inputData.toDF)( + AssertOnQuery { q => !isMetricsRegistered(q) }, + StopStream, + AssertOnQuery { q => !isMetricsRegistered(q) } + ) + } + + // Registered when enabled + withSQLConf("spark.sql.streaming.metricsEnabled" -> "true") { + testStream(inputData.toDF)( + AssertOnQuery { q => isMetricsRegistered(q) }, + StopStream, + AssertOnQuery { q => !isMetricsRegistered(q) } + ) + } + } + + test("input row calculation with mixed batch and streaming sources") { + val streamingTriggerDF = spark.createDataset(1 to 10).toDF + val streamingInputDF = createSingleTriggerStreamingDF(streamingTriggerDF).toDF("value") + val staticInputDF = spark.createDataFrame(Seq(1 -> "1", 2 -> "2")).toDF("value", "anotherValue") + + // Trigger input has 10 rows, static input has 2 rows, + // therefore after the first trigger, the calculated input rows should be 10 + val status = getFirstTriggerStatus(streamingInputDF.join(staticInputDF, "value")) + assert(status.triggerDetails.get("numRows.input.total") === "10") + assert(status.sourceStatuses.size === 1) + assert(status.sourceStatuses(0).triggerDetails.get("numRows.input.source") === "10") + } + + test("input row calculation with trigger DF having multiple leaves") { + val streamingTriggerDF = + spark.createDataset(1 to 5).toDF.union(spark.createDataset(6 to 10).toDF) + require(streamingTriggerDF.logicalPlan.collectLeaves().size > 1) + val streamingInputDF = createSingleTriggerStreamingDF(streamingTriggerDF) + + // After the first trigger, the calculated input rows should be 10 + val status = getFirstTriggerStatus(streamingInputDF) + assert(status.triggerDetails.get("numRows.input.total") === "10") + assert(status.sourceStatuses.size === 1) + assert(status.sourceStatuses(0).triggerDetails.get("numRows.input.source") === "10") + } + testQuietly("StreamExecution metadata garbage collection") { val inputData = MemoryStream[Int] val mapped = inputData.toDS().map(6 / _) @@ -149,6 +272,45 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter { ) } + /** Create a streaming DF that only execute one batch in which it returns the given static DF */ + private def createSingleTriggerStreamingDF(triggerDF: DataFrame): DataFrame = { + require(!triggerDF.isStreaming) + // A streaming Source that generate only on trigger and returns the given Dataframe as batch + val source = new Source() { + override def schema: StructType = triggerDF.schema + override def getOffset: Option[Offset] = Some(LongOffset(0)) + override def getBatch(start: Option[Offset], end: Offset): DataFrame = triggerDF + override def stop(): Unit = {} + } + StreamingExecutionRelation(source) + } + + /** Returns the query status at the end of the first trigger of streaming DF */ + private def getFirstTriggerStatus(streamingDF: DataFrame): StreamingQueryStatus = { + // A StreamingQueryListener that gets the query status after the first completed trigger + val listener = new StreamingQueryListener { + @volatile var firstStatus: StreamingQueryStatus = null + override def onQueryStarted(queryStarted: QueryStarted): Unit = { } + override def onQueryProgress(queryProgress: QueryProgress): Unit = { + if (firstStatus == null) firstStatus = queryProgress.queryStatus + } + override def onQueryTerminated(queryTerminated: QueryTerminated): Unit = { } + } + + try { + spark.streams.addListener(listener) + val q = streamingDF.writeStream.format("memory").queryName("test").start() + q.processAllAvailable() + eventually(timeout(streamingTimeout)) { + assert(listener.firstStatus != null) + } + 
listener.firstStatus + } finally { + spark.streams.active.map(_.stop()) + spark.streams.removeListener(listener) + } + } + /** * A [[StreamAction]] to test the behavior of `StreamingQuery.awaitTermination()`. * From adc112429d6fe671e6e8294824a0e41a2b1ec2e0 Mon Sep 17 00:00:00 2001 From: petermaxlee Date: Thu, 13 Oct 2016 14:16:39 -0700 Subject: [PATCH 005/162] [SPARK-17661][SQL] Consolidate various listLeafFiles implementations ## What changes were proposed in this pull request? There are 4 listLeafFiles-related functions in Spark: - ListingFileCatalog.listLeafFiles (which calls HadoopFsRelation.listLeafFilesInParallel if the number of paths passed in is greater than a threshold; if it is lower, then it has its own serial version implemented) - HadoopFsRelation.listLeafFiles (called only by HadoopFsRelation.listLeafFilesInParallel) - HadoopFsRelation.listLeafFilesInParallel (called only by ListingFileCatalog.listLeafFiles) It is actually very confusing and error prone because there are effectively two distinct implementations for the serial version of listing leaf files. As an example, SPARK-17599 updated only one of the code path and ignored the other one. This code can be improved by: - Move all file listing code into ListingFileCatalog, since it is the only class that needs this. - Keep only one function for listing files in serial. ## How was this patch tested? This change should be covered by existing unit and integration tests. I also moved a test case for HadoopFsRelation.shouldFilterOut from HadoopFsRelationSuite to ListingFileCatalogSuite. Author: petermaxlee Closes #15235 from petermaxlee/SPARK-17661. --- .../datasources/ListingFileCatalog.scala | 231 +++++++++++++----- .../datasources/fileSourceInterfaces.scala | 154 ------------ .../datasources/HadoopFsRelationSuite.scala | 11 - .../datasources/ListingFileCatalogSuite.scala | 34 +++ 4 files changed, 206 insertions(+), 224 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala index 32532084236cf..a68ae523e0faa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala @@ -21,11 +21,14 @@ import java.io.FileNotFoundException import scala.collection.mutable -import org.apache.hadoop.fs.{FileStatus, LocatedFileStatus, Path} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs._ import org.apache.hadoop.mapred.{FileInputFormat, JobConf} +import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.StructType +import org.apache.spark.util.SerializableConfiguration /** @@ -82,73 +85,183 @@ class ListingFileCatalog( * This is publicly visible for testing. */ def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { - if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { - HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sparkSession) - } else { - // Right now, the number of paths is less than the value of - // parallelPartitionDiscoveryThreshold. So, we will list file statues at the driver. 
- // If there is any child that has more files than the threshold, we will use parallel - // listing. - - // Dummy jobconf to get to the pathFilter defined in configuration - val jobConf = new JobConf(hadoopConf, this.getClass) - val pathFilter = FileInputFormat.getInputPathFilter(jobConf) - - val statuses: Seq[FileStatus] = paths.flatMap { path => - val fs = path.getFileSystem(hadoopConf) - logTrace(s"Listing $path on driver") - - val childStatuses = { - try { - val stats = fs.listStatus(path) - if (pathFilter != null) stats.filter(f => pathFilter.accept(f.getPath)) else stats - } catch { - case _: FileNotFoundException => - logWarning(s"The directory $path was not found. Was it deleted very recently?") - Array.empty[FileStatus] - } - } + val files = + if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { + ListingFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession) + } else { + ListingFileCatalog.listLeafFilesInSerial(paths, hadoopConf) + } + + mutable.LinkedHashSet(files: _*) + } + + override def equals(other: Any): Boolean = other match { + case hdfs: ListingFileCatalog => paths.toSet == hdfs.paths.toSet + case _ => false + } + + override def hashCode(): Int = paths.toSet.hashCode() +} + + +object ListingFileCatalog extends Logging { + + /** A serializable variant of HDFS's BlockLocation. */ + private case class SerializableBlockLocation( + names: Array[String], + hosts: Array[String], + offset: Long, + length: Long) + + /** A serializable variant of HDFS's FileStatus. */ + private case class SerializableFileStatus( + path: String, + length: Long, + isDir: Boolean, + blockReplication: Short, + blockSize: Long, + modificationTime: Long, + accessTime: Long, + blockLocations: Array[SerializableBlockLocation]) + + /** + * List a collection of path recursively. + */ + private def listLeafFilesInSerial( + paths: Seq[Path], + hadoopConf: Configuration): Seq[FileStatus] = { + // Dummy jobconf to get to the pathFilter defined in configuration + val jobConf = new JobConf(hadoopConf, this.getClass) + val filter = FileInputFormat.getInputPathFilter(jobConf) + + paths.flatMap { path => + val fs = path.getFileSystem(hadoopConf) + listLeafFiles0(fs, path, filter) + } + } - childStatuses.map { - case f: LocatedFileStatus => f - - // NOTE: - // - // - Although S3/S3A/S3N file system can be quite slow for remote file metadata - // operations, calling `getFileBlockLocations` does no harm here since these file system - // implementations don't actually issue RPC for this method. - // - // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not - // be a big deal since we always use to `listLeafFilesInParallel` when the number of - // paths exceeds threshold. - case f => - if (f.isDirectory ) { - // If f is a directory, we do not need to call getFileBlockLocations (SPARK-14959). - f - } else { - HadoopFsRelation.createLocatedFileStatus(f, fs.getFileBlockLocations(f, 0, f.getLen)) + /** + * List a collection of path recursively in parallel (using Spark executors). + * Each task launched will use [[listLeafFilesInSerial]] to list. 
+ */ + private def listLeafFilesInParallel( + paths: Seq[Path], + hadoopConf: Configuration, + sparkSession: SparkSession): Seq[FileStatus] = { + assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) + logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") + + val sparkContext = sparkSession.sparkContext + val serializableConfiguration = new SerializableConfiguration(hadoopConf) + val serializedPaths = paths.map(_.toString) + + // Set the number of parallelism to prevent following file listing from generating many tasks + // in case of large #defaultParallelism. + val numParallelism = Math.min(paths.size, 10000) + + val statuses = sparkContext + .parallelize(serializedPaths, numParallelism) + .mapPartitions { paths => + val hadoopConf = serializableConfiguration.value + listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator + }.map { status => + // Turn FileStatus into SerializableFileStatus so we can send it back to the driver + val blockLocations = status match { + case f: LocatedFileStatus => + f.getBlockLocations.map { loc => + SerializableBlockLocation( + loc.getNames, + loc.getHosts, + loc.getOffset, + loc.getLength) } + + case _ => + Array.empty[SerializableBlockLocation] } - }.filterNot { status => - val name = status.getPath.getName - HadoopFsRelation.shouldFilterOut(name) - } - val (dirs, files) = statuses.partition(_.isDirectory) + SerializableFileStatus( + status.getPath.toString, + status.getLen, + status.isDirectory, + status.getReplication, + status.getBlockSize, + status.getModificationTime, + status.getAccessTime, + blockLocations) + }.collect() - // It uses [[LinkedHashSet]] since the order of files can affect the results. (SPARK-11500) - if (dirs.isEmpty) { - mutable.LinkedHashSet(files: _*) - } else { - mutable.LinkedHashSet(files: _*) ++ listLeafFiles(dirs.map(_.getPath)) + // Turn SerializableFileStatus back to Status + statuses.map { f => + val blockLocations = f.blockLocations.map { loc => + new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) } + new LocatedFileStatus( + new FileStatus( + f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)), + blockLocations) } } - override def equals(other: Any): Boolean = other match { - case hdfs: ListingFileCatalog => paths.toSet == hdfs.paths.toSet - case _ => false + /** + * List a single path, provided as a FileStatus, in serial. + */ + private def listLeafFiles0( + fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = { + logTrace(s"Listing $path") + val name = path.getName.toLowerCase + if (shouldFilterOut(name)) { + Seq.empty[FileStatus] + } else { + // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist + // Note that statuses only include FileStatus for the files and dirs directly under path, + // and does not include anything else recursively. + val statuses = try fs.listStatus(path) catch { + case _: FileNotFoundException => + logWarning(s"The directory $path was not found. 
Was it deleted very recently?") + Array.empty[FileStatus] + } + + val allLeafStatuses = { + val (dirs, files) = statuses.partition(_.isDirectory) + val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter)) + if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats + } + + allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map { + case f: LocatedFileStatus => + f + + // NOTE: + // + // - Although S3/S3A/S3N file systems can be quite slow for remote file metadata + // operations, calling `getFileBlockLocations` does no harm here since these file system + // implementations don't actually issue RPC for this method. + // + // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not + // be a big deal since we always use `listLeafFilesInParallel` when the number of + // paths exceeds the threshold. + case f => + // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), + // which is very slow on some file systems (RawLocalFileSystem, which launches a + // subprocess and parses the stdout). + val locations = fs.getFileBlockLocations(f, 0, f.getLen) + val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, + f.getModificationTime, 0, null, null, null, null, f.getPath, locations) + if (f.isSymlink) { + lfs.setSymlink(f.getSymlink) + } + lfs + } + } } - override def hashCode(): Int = paths.toSet.hashCode() + /** Checks if we should filter out this path name. */ + def shouldFilterOut(pathName: String): Boolean = { + // We filter everything that starts with _ and ., except _common_metadata and _metadata + // because Parquet needs to find those metadata files from leaf files returned by this method. + // We should refactor this logic to not mix metadata files with data files.
+ ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && + !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala index 5cc5f32e6e809..69dd622ce4a54 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala @@ -17,16 +17,12 @@ package org.apache.spark.sql.execution.datasources -import scala.collection.mutable - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.io.compress.{CompressionCodecFactory, SplittableCompressionCodec} -import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.annotation.Experimental -import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.catalog.BucketSpec @@ -35,7 +31,6 @@ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjectio import org.apache.spark.sql.execution.FileRelation import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, Filter} import org.apache.spark.sql.types.StructType -import org.apache.spark.util.SerializableConfiguration /** * ::Experimental:: @@ -352,152 +347,3 @@ trait FileCatalog { /** Refresh the file listing */ def refresh(): Unit } - - -/** - * Helper methods for gathering metadata from HDFS. - */ -object HadoopFsRelation extends Logging { - - /** Checks if we should filter out this path name. */ - def shouldFilterOut(pathName: String): Boolean = { - // We filter everything that starts with _ and ., except _common_metadata and _metadata - // because Parquet needs to find those metadata files from leaf files returned by this method. - // We should refactor this logic to not mix metadata files with data files. - ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && - !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") - } - - /** - * Create a LocatedFileStatus using FileStatus and block locations. - */ - def createLocatedFileStatus(f: FileStatus, locations: Array[BlockLocation]): LocatedFileStatus = { - // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), which is - // very slow on some file system (RawLocalFileSystem, which is launch a subprocess and parse the - // stdout). - val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, - f.getModificationTime, 0, null, null, null, null, f.getPath, locations) - if (f.isSymlink) { - lfs.setSymlink(f.getSymlink) - } - lfs - } - - // We don't filter files/directories whose name start with "_" except "_temporary" here, as - // specific data sources may take advantages over them (e.g. Parquet _metadata and - // _common_metadata files). "_temporary" directories are explicitly ignored since failed - // tasks/jobs may leave partial/corrupted data files there. Files and directories whose name - // start with "." are also ignored. 
- def listLeafFiles(fs: FileSystem, status: FileStatus, filter: PathFilter): Array[FileStatus] = { - logTrace(s"Listing ${status.getPath}") - val name = status.getPath.getName.toLowerCase - if (shouldFilterOut(name)) { - Array.empty[FileStatus] - } else { - val statuses = { - val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDirectory) - val stats = files ++ dirs.flatMap(dir => listLeafFiles(fs, dir, filter)) - if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats - } - // statuses do not have any dirs. - statuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map { - case f: LocatedFileStatus => f - - // NOTE: - // - // - Although S3/S3A/S3N file system can be quite slow for remote file metadata - // operations, calling `getFileBlockLocations` does no harm here since these file system - // implementations don't actually issue RPC for this method. - // - // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not - // be a big deal since we always use to `listLeafFilesInParallel` when the number of - // paths exceeds threshold. - case f => createLocatedFileStatus(f, fs.getFileBlockLocations(f, 0, f.getLen)) - } - } - } - - // `FileStatus` is Writable but not serializable. What make it worse, somehow it doesn't play - // well with `SerializableWritable`. So there seems to be no way to serialize a `FileStatus`. - // Here we use `FakeFileStatus` to extract key components of a `FileStatus` to serialize it from - // executor side and reconstruct it on driver side. - case class FakeBlockLocation( - names: Array[String], - hosts: Array[String], - offset: Long, - length: Long) - - case class FakeFileStatus( - path: String, - length: Long, - isDir: Boolean, - blockReplication: Short, - blockSize: Long, - modificationTime: Long, - accessTime: Long, - blockLocations: Array[FakeBlockLocation]) - - def listLeafFilesInParallel( - paths: Seq[Path], - hadoopConf: Configuration, - sparkSession: SparkSession): mutable.LinkedHashSet[FileStatus] = { - assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) - logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") - - val sparkContext = sparkSession.sparkContext - val serializableConfiguration = new SerializableConfiguration(hadoopConf) - val serializedPaths = paths.map(_.toString) - - // Set the number of parallelism to prevent following file listing from generating many tasks - // in case of large #defaultParallelism. 
- val numParallelism = Math.min(paths.size, 10000) - - val fakeStatuses = sparkContext - .parallelize(serializedPaths, numParallelism) - .mapPartitions { paths => - // Dummy jobconf to get to the pathFilter defined in configuration - // It's very expensive to create a JobConf(ClassUtil.findContainingJar() is slow) - val jobConf = new JobConf(serializableConfiguration.value, this.getClass) - val pathFilter = FileInputFormat.getInputPathFilter(jobConf) - paths.map(new Path(_)).flatMap { path => - val fs = path.getFileSystem(serializableConfiguration.value) - listLeafFiles(fs, fs.getFileStatus(path), pathFilter) - } - }.map { status => - val blockLocations = status match { - case f: LocatedFileStatus => - f.getBlockLocations.map { loc => - FakeBlockLocation( - loc.getNames, - loc.getHosts, - loc.getOffset, - loc.getLength) - } - - case _ => - Array.empty[FakeBlockLocation] - } - - FakeFileStatus( - status.getPath.toString, - status.getLen, - status.isDirectory, - status.getReplication, - status.getBlockSize, - status.getModificationTime, - status.getAccessTime, - blockLocations) - }.collect() - - val hadoopFakeStatuses = fakeStatuses.map { f => - val blockLocations = f.blockLocations.map { loc => - new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) - } - new LocatedFileStatus( - new FileStatus( - f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)), - blockLocations) - } - mutable.LinkedHashSet(hadoopFakeStatuses: _*) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala index 3c68dc8bb98d8..89d57653adcbd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala @@ -39,15 +39,4 @@ class HadoopFsRelationSuite extends QueryTest with SharedSQLContext { assert(df.queryExecution.logical.statistics.sizeInBytes === BigInt(totalSize)) } } - - test("file filtering") { - assert(!HadoopFsRelation.shouldFilterOut("abcd")) - assert(HadoopFsRelation.shouldFilterOut(".ab")) - assert(HadoopFsRelation.shouldFilterOut("_cd")) - - assert(!HadoopFsRelation.shouldFilterOut("_metadata")) - assert(!HadoopFsRelation.shouldFilterOut("_common_metadata")) - assert(HadoopFsRelation.shouldFilterOut("_ab_metadata")) - assert(HadoopFsRelation.shouldFilterOut("_cd_common_metadata")) - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala new file mode 100644 index 0000000000000..f15730aeb11f2 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.SparkFunSuite + +class ListingFileCatalogSuite extends SparkFunSuite { + + test("file filtering") { + assert(!ListingFileCatalog.shouldFilterOut("abcd")) + assert(ListingFileCatalog.shouldFilterOut(".ab")) + assert(ListingFileCatalog.shouldFilterOut("_cd")) + + assert(!ListingFileCatalog.shouldFilterOut("_metadata")) + assert(!ListingFileCatalog.shouldFilterOut("_common_metadata")) + assert(ListingFileCatalog.shouldFilterOut("_ab_metadata")) + assert(ListingFileCatalog.shouldFilterOut("_cd_common_metadata")) + } +} From 9dc0ca060d5925cd666b34021e62f7b38bb3aabb Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Thu, 13 Oct 2016 17:48:09 -0700 Subject: [PATCH 006/162] [SPARK-17368][SQL] Add support for value class serialization and deserialization ## What changes were proposed in this pull request? Value classes were unsupported because catalyst data types were obtained through reflection on erased types, which would resolve to a value class' wrapped type and hence lead to unavailable methods during code generation. E.g. the following class ```scala case class Foo(x: Int) extends AnyVal ``` would be seen as an `int` in catalyst and will cause instance cast failures when generated java code tries to treat it as a `Foo`. This patch simply removes the erasure step when getting data types for catalyst. ## How was this patch tested? Additional tests in `ExpressionEncoderSuite`. Author: Jakob Odersky Closes #15284 from jodersky/value-classes. --- .../spark/sql/catalyst/ScalaReflection.scala | 2 +- .../catalyst/encoders/ExpressionEncoderSuite.scala | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 7923cfce82100..31c6e5def143b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -628,7 +628,7 @@ object ScalaReflection extends ScalaReflection { /* * Retrieves the runtime class corresponding to the provided type. 
*/ - def getClassFromType(tpe: Type): Class[_] = mirror.runtimeClass(tpe.erasure.typeSymbol.asClass) + def getClassFromType(tpe: Type): Class[_] = mirror.runtimeClass(tpe.typeSymbol.asClass) case class Schema(dataType: DataType, nullable: Boolean) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoderSuite.scala index 4df9062018995..4d896c2e38f10 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoderSuite.scala @@ -66,8 +66,6 @@ case class RepeatedData( mapFieldNull: scala.collection.Map[Int, java.lang.Long], structField: PrimitiveData) -case class SpecificCollection(l: List[Int]) - /** For testing Kryo serialization based encoder. */ class KryoSerializable(val value: Int) { override def hashCode(): Int = value @@ -107,6 +105,12 @@ class UDTForCaseClass extends UserDefinedType[UDTCaseClass] { } } +case class PrimitiveValueClass(wrapped: Int) extends AnyVal +case class ReferenceValueClass(wrapped: ReferenceValueClass.Container) extends AnyVal +object ReferenceValueClass { + case class Container(data: Int) +} + class ExpressionEncoderSuite extends PlanTest with AnalysisTest { OuterScopes.addOuterScope(this) @@ -290,6 +294,12 @@ class ExpressionEncoderSuite extends PlanTest with AnalysisTest { ExpressionEncoder.tuple(intEnc, ExpressionEncoder.tuple(intEnc, longEnc)) } + encodeDecodeTest( + PrimitiveValueClass(42), "primitive value class") + + encodeDecodeTest( + ReferenceValueClass(ReferenceValueClass.Container(1)), "reference value class") + productTest(("UDT", new ExamplePoint(0.1, 0.2))) test("nullable of encoder schema") { From 44cbb61b34a98e3e0d8e2543a4eb6e950e0019a5 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 13 Oct 2016 19:44:24 -0700 Subject: [PATCH 007/162] [SPARK-15957][FOLLOW-UP][ML][PYSPARK] Add Python API for RFormula forceIndexLabel. ## What changes were proposed in this pull request? Follow-up work of #13675, add Python API for ```RFormula forceIndexLabel```. ## How was this patch tested? Unit test. Author: Yanbo Liang Closes #15430 from yanboliang/spark-15957-python. 
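For illustration, a minimal usage sketch of the new parameter (it mirrors the unit test added below; it assumes an active SparkSession bound to `spark`, and the toy DataFrame is invented for the example):

```python
from pyspark.ml.feature import RFormula

# Toy data: numeric label "y", numeric feature "x", string feature "s".
df = spark.createDataFrame(
    [(1.0, 1.0, "a"), (0.0, 2.0, "b"), (1.0, 0.0, "a")], ["y", "x", "s"])

# By default a numeric label column is passed through unchanged.
rf = RFormula(formula="y ~ x + s")
rf.fit(df).transform(df).head().label       # 1.0

# forceIndexLabel=True string-indexes the label even though it is numeric.
rf2 = RFormula(formula="y ~ x + s", forceIndexLabel=True)
rf2.fit(df).transform(df).head().label      # 0.0
```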
--- python/pyspark/ml/feature.py | 31 +++++++++++++++++++++++++++---- python/pyspark/ml/tests.py | 16 ++++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 64b21caa616ec..a33c3e79453e1 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2494,21 +2494,30 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaM formula = Param(Params._dummy(), "formula", "R model formula", typeConverter=TypeConverters.toString) + forceIndexLabel = Param(Params._dummy(), "forceIndexLabel", + "Force to index label whether it is numeric or string", + typeConverter=TypeConverters.toBoolean) + @keyword_only - def __init__(self, formula=None, featuresCol="features", labelCol="label"): + def __init__(self, formula=None, featuresCol="features", labelCol="label", + forceIndexLabel=False): """ - __init__(self, formula=None, featuresCol="features", labelCol="label") + __init__(self, formula=None, featuresCol="features", labelCol="label", \ + forceIndexLabel=False) """ super(RFormula, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid) + self._setDefault(forceIndexLabel=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only @since("1.5.0") - def setParams(self, formula=None, featuresCol="features", labelCol="label"): + def setParams(self, formula=None, featuresCol="features", labelCol="label", + forceIndexLabel=False): """ - setParams(self, formula=None, featuresCol="features", labelCol="label") + setParams(self, formula=None, featuresCol="features", labelCol="label", \ + forceIndexLabel=False) Sets params for RFormula. """ kwargs = self.setParams._input_kwargs @@ -2528,6 +2537,20 @@ def getFormula(self): """ return self.getOrDefault(self.formula) + @since("2.1.0") + def setForceIndexLabel(self, value): + """ + Sets the value of :py:attr:`forceIndexLabel`. + """ + return self._set(forceIndexLabel=value) + + @since("2.1.0") + def getForceIndexLabel(self): + """ + Gets the value of :py:attr:`forceIndexLabel`. + """ + return self.getOrDefault(self.forceIndexLabel) + def _create_model(self, java_model): return RFormulaModel(java_model) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index e233549850888..9d46cc3b4ae64 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -477,6 +477,22 @@ def test_count_vectorizer_with_binary(self): feature, expected = r self.assertEqual(feature, expected) + def test_rformula_force_index_label(self): + df = self.spark.createDataFrame([ + (1.0, 1.0, "a"), + (0.0, 2.0, "b"), + (1.0, 0.0, "a")], ["y", "x", "s"]) + # Does not index label by default since it's numeric type. + rf = RFormula(formula="y ~ x + s") + model = rf.fit(df) + transformedDF = model.transform(df) + self.assertEqual(transformedDF.head().label, 1.0) + # Force to index label. + rf2 = RFormula(formula="y ~ x + s").setForceIndexLabel(True) + model2 = rf2.fit(df) + transformedDF2 = model2.transform(df) + self.assertEqual(transformedDF2.head().label, 0.0) + class HasInducedError(Params): From 8543996c3f44098a521fc6b90ca0bb575f606e2a Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Fri, 14 Oct 2016 12:35:59 +0800 Subject: [PATCH 008/162] [SPARK-17927][SQL] Remove dead code in WriterContainer. ## What changes were proposed in this pull request? speculationEnabled and DATASOURCE_OUTPUTPATH seem like just dead code. ## How was this patch tested? 
Tests should fail if they are not dead code. Author: Reynold Xin Closes #15477 from rxin/SPARK-17927. --- .../sql/execution/datasources/WriterContainer.scala | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index 7880c7cfa16f8..253aa4405defa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -49,7 +49,6 @@ private[datasources] case class WriteRelation( object WriterContainer { val DATASOURCE_WRITEJOBUUID = "spark.sql.sources.writeJobUUID" - val DATASOURCE_OUTPUTPATH = "spark.sql.sources.output.path" } private[datasources] abstract class BaseWriterContainer( @@ -73,9 +72,6 @@ private[datasources] abstract class BaseWriterContainer( // This is only used on driver side. @transient private val jobContext: JobContext = job - private val speculationEnabled: Boolean = - relation.sparkSession.sparkContext.conf.getBoolean("spark.speculation", defaultValue = false) - // The following fields are initialized and used on both driver and executor side. @transient protected var outputCommitter: OutputCommitter = _ @transient private var jobId: JobID = _ @@ -247,8 +243,6 @@ private[datasources] class DefaultWriterContainer( def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { executorSideSetup(taskContext) - val configuration = taskAttemptContext.getConfiguration - configuration.set(WriterContainer.DATASOURCE_OUTPUTPATH, outputPath) var writer = newOutputWriter(getWorkPath) writer.initConverter(dataSchema) @@ -353,15 +347,10 @@ private[datasources] class DynamicPartitionWriterContainer( private def newOutputWriter( key: InternalRow, getPartitionString: UnsafeProjection): OutputWriter = { - val configuration = taskAttemptContext.getConfiguration val path = if (partitionColumns.nonEmpty) { val partitionPath = getPartitionString(key).getString(0) - configuration.set( - WriterContainer.DATASOURCE_OUTPUTPATH, - new Path(outputPath, partitionPath).toString) new Path(getWorkPath, partitionPath).toString } else { - configuration.set(WriterContainer.DATASOURCE_OUTPUTPATH, outputPath) getWorkPath } val bucketId = getBucketIdFromKey(key) From 6c29b3de763115d8676ed91f896e75c490e8c5b2 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Fri, 14 Oct 2016 14:14:52 +0800 Subject: [PATCH 009/162] [SPARK-17925][SQL] Break fileSourceInterfaces.scala into multiple pieces ## What changes were proposed in this pull request? This patch does a few changes to the file structure of data sources: - Break fileSourceInterfaces.scala into multiple pieces (HadoopFsRelation, FileFormat, OutputWriter) - Move ParquetOutputWriter into its own file I created this as a separate patch so it'd be easier to review my future PRs that focus on refactoring this internal logic. This patch only moves code around, and has no logic changes. ## How was this patch tested? N/A - should be covered by existing tests. Author: Reynold Xin Closes #15473 from rxin/SPARK-17925. 
--- ...ourceInterfaces.scala => FileFormat.scala} | 143 +------------- .../datasources/HadoopFsRelation.scala | 77 ++++++++ .../execution/datasources/OutputWriter.scala | 101 ++++++++++ .../parquet/ParquetFileFormat.scala | 144 -------------- .../parquet/ParquetOutputWriter.scala | 178 ++++++++++++++++++ 5 files changed, 359 insertions(+), 284 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/{fileSourceInterfaces.scala => FileFormat.scala} (59%) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala similarity index 59% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala index 69dd622ce4a54..bde2d2b89d56f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala @@ -20,152 +20,15 @@ package org.apache.spark.sql.execution.datasources import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.io.compress.{CompressionCodecFactory, SplittableCompressionCodec} -import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} +import org.apache.hadoop.mapreduce.Job -import org.apache.spark.annotation.Experimental import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} -import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection -import org.apache.spark.sql.execution.FileRelation -import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, Filter} +import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType -/** - * ::Experimental:: - * A factory that produces [[OutputWriter]]s. A new [[OutputWriterFactory]] is created on driver - * side for each write job issued when writing to a [[HadoopFsRelation]], and then gets serialized - * to executor side to create actual [[OutputWriter]]s on the fly. - * - * @since 1.4.0 - */ -@Experimental -abstract class OutputWriterFactory extends Serializable { - /** - * When writing to a [[HadoopFsRelation]], this method gets called by each task on executor side - * to instantiate new [[OutputWriter]]s. - * - * @param path Path of the file to which this [[OutputWriter]] is supposed to write. Note that - * this may not point to the final output file. For example, `FileOutputFormat` writes to - * temporary directories and then merge written files back to the final destination. In - * this case, `path` points to a temporary output file under the temporary directory. - * @param dataSchema Schema of the rows to be written. Partition columns are not included in the - * schema if the relation being written is partitioned. - * @param context The Hadoop MapReduce task context. 
- * @since 1.4.0 - */ - def newInstance( - path: String, - bucketId: Option[Int], // TODO: This doesn't belong here... - dataSchema: StructType, - context: TaskAttemptContext): OutputWriter - - /** - * Returns a new instance of [[OutputWriter]] that will write data to the given path. - * This method gets called by each task on executor to write [[InternalRow]]s to - * format-specific files. Compared to the other `newInstance()`, this is a newer API that - * passes only the path that the writer must write to. The writer must write to the exact path - * and not modify it (do not add subdirectories, extensions, etc.). All other - * file-format-specific information needed to create the writer must be passed - * through the [[OutputWriterFactory]] implementation. - * @since 2.0.0 - */ - def newWriter(path: String): OutputWriter = { - throw new UnsupportedOperationException("newInstance with just path not supported") - } -} - -/** - * ::Experimental:: - * [[OutputWriter]] is used together with [[HadoopFsRelation]] for persisting rows to the - * underlying file system. Subclasses of [[OutputWriter]] must provide a zero-argument constructor. - * An [[OutputWriter]] instance is created and initialized when a new output file is opened on - * executor side. This instance is used to persist rows to this single output file. - * - * @since 1.4.0 - */ -@Experimental -abstract class OutputWriter { - /** - * Persists a single row. Invoked on the executor side. When writing to dynamically partitioned - * tables, dynamic partition columns are not included in rows to be written. - * - * @since 1.4.0 - */ - def write(row: Row): Unit - - /** - * Closes the [[OutputWriter]]. Invoked on the executor side after all rows are persisted, before - * the task output is committed. - * - * @since 1.4.0 - */ - def close(): Unit - - private var converter: InternalRow => Row = _ - - protected[sql] def initConverter(dataSchema: StructType) = { - converter = - CatalystTypeConverters.createToScalaConverter(dataSchema).asInstanceOf[InternalRow => Row] - } - - protected[sql] def writeInternal(row: InternalRow): Unit = { - write(converter(row)) - } -} - -/** - * Acts as a container for all of the metadata required to read from a datasource. All discovery, - * resolution and merging logic for schemas and partitions has been removed. - * - * @param location A [[FileCatalog]] that can enumerate the locations of all the files that comprise - * this relation. - * @param partitionSchema The schema of the columns (if any) that are used to partition the relation - * @param dataSchema The schema of any remaining columns. Note that if any partition columns are - * present in the actual data files as well, they are preserved. - * @param bucketSpec Describes the bucketing (hash-partitioning of the files by some column values). - * @param fileFormat A file format that can be used to read and write the data in files. - * @param options Configuration used when reading / writing data. 
- */ -case class HadoopFsRelation( - location: FileCatalog, - partitionSchema: StructType, - dataSchema: StructType, - bucketSpec: Option[BucketSpec], - fileFormat: FileFormat, - options: Map[String, String])(val sparkSession: SparkSession) - extends BaseRelation with FileRelation { - - override def sqlContext: SQLContext = sparkSession.sqlContext - - val schema: StructType = { - val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet - StructType(dataSchema ++ partitionSchema.filterNot { column => - dataSchemaColumnNames.contains(column.name.toLowerCase) - }) - } - - def partitionSchemaOption: Option[StructType] = - if (partitionSchema.isEmpty) None else Some(partitionSchema) - def partitionSpec: PartitionSpec = location.partitionSpec() - - def refresh(): Unit = location.refresh() - - override def toString: String = { - fileFormat match { - case source: DataSourceRegister => source.shortName() - case _ => "HadoopFiles" - } - } - - /** Returns the list of files that will be read when scanning this relation. */ - override def inputFiles: Array[String] = - location.allFiles().map(_.getPath.toUri.toString).toArray - - override def sizeInBytes: Long = location.allFiles().map(_.getLen).sum -} - /** * Used to read and write data stored in files to/from the [[InternalRow]] format. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala new file mode 100644 index 0000000000000..c7ebe0b76a150 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.sql.{SparkSession, SQLContext} +import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.execution.FileRelation +import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister} +import org.apache.spark.sql.types.StructType + + +/** + * Acts as a container for all of the metadata required to read from a datasource. All discovery, + * resolution and merging logic for schemas and partitions has been removed. + * + * @param location A [[FileCatalog]] that can enumerate the locations of all the files that comprise + * this relation. + * @param partitionSchema The schema of the columns (if any) that are used to partition the relation + * @param dataSchema The schema of any remaining columns. Note that if any partition columns are + * present in the actual data files as well, they are preserved. + * @param bucketSpec Describes the bucketing (hash-partitioning of the files by some column values). 
+ * @param fileFormat A file format that can be used to read and write the data in files. + * @param options Configuration used when reading / writing data. + */ +case class HadoopFsRelation( + location: FileCatalog, + partitionSchema: StructType, + dataSchema: StructType, + bucketSpec: Option[BucketSpec], + fileFormat: FileFormat, + options: Map[String, String])(val sparkSession: SparkSession) + extends BaseRelation with FileRelation { + + override def sqlContext: SQLContext = sparkSession.sqlContext + + val schema: StructType = { + val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet + StructType(dataSchema ++ partitionSchema.filterNot { column => + dataSchemaColumnNames.contains(column.name.toLowerCase) + }) + } + + def partitionSchemaOption: Option[StructType] = + if (partitionSchema.isEmpty) None else Some(partitionSchema) + + def partitionSpec: PartitionSpec = location.partitionSpec() + + def refresh(): Unit = location.refresh() + + override def toString: String = { + fileFormat match { + case source: DataSourceRegister => source.shortName() + case _ => "HadoopFiles" + } + } + + /** Returns the list of files that will be read when scanning this relation. */ + override def inputFiles: Array[String] = + location.allFiles().map(_.getPath.toUri.toString).toArray + + override def sizeInBytes: Long = location.allFiles().map(_.getLen).sum +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala new file mode 100644 index 0000000000000..d2eec7b1413f8 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.hadoop.mapreduce.TaskAttemptContext + +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} +import org.apache.spark.sql.types.StructType + + +/** + * A factory that produces [[OutputWriter]]s. A new [[OutputWriterFactory]] is created on driver + * side for each write job issued when writing to a [[HadoopFsRelation]], and then gets serialized + * to executor side to create actual [[OutputWriter]]s on the fly. + */ +abstract class OutputWriterFactory extends Serializable { + /** + * When writing to a [[HadoopFsRelation]], this method gets called by each task on executor side + * to instantiate new [[OutputWriter]]s. + * + * @param path Path of the file to which this [[OutputWriter]] is supposed to write. Note that + * this may not point to the final output file. 
For example, `FileOutputFormat` writes to + * temporary directories and then merge written files back to the final destination. In + * this case, `path` points to a temporary output file under the temporary directory. + * @param dataSchema Schema of the rows to be written. Partition columns are not included in the + * schema if the relation being written is partitioned. + * @param context The Hadoop MapReduce task context. + * @since 1.4.0 + */ + def newInstance( + path: String, + bucketId: Option[Int], // TODO: This doesn't belong here... + dataSchema: StructType, + context: TaskAttemptContext): OutputWriter + + /** + * Returns a new instance of [[OutputWriter]] that will write data to the given path. + * This method gets called by each task on executor to write InternalRows to + * format-specific files. Compared to the other `newInstance()`, this is a newer API that + * passes only the path that the writer must write to. The writer must write to the exact path + * and not modify it (do not add subdirectories, extensions, etc.). All other + * file-format-specific information needed to create the writer must be passed + * through the [[OutputWriterFactory]] implementation. + * @since 2.0.0 + */ + def newWriter(path: String): OutputWriter = { + throw new UnsupportedOperationException("newInstance with just path not supported") + } +} + + +/** + * [[OutputWriter]] is used together with [[HadoopFsRelation]] for persisting rows to the + * underlying file system. Subclasses of [[OutputWriter]] must provide a zero-argument constructor. + * An [[OutputWriter]] instance is created and initialized when a new output file is opened on + * executor side. This instance is used to persist rows to this single output file. + */ +abstract class OutputWriter { + /** + * Persists a single row. Invoked on the executor side. When writing to dynamically partitioned + * tables, dynamic partition columns are not included in rows to be written. + * + * @since 1.4.0 + */ + def write(row: Row): Unit + + /** + * Closes the [[OutputWriter]]. Invoked on the executor side after all rows are persisted, before + * the task output is committed. + * + * @since 1.4.0 + */ + def close(): Unit + + private var converter: InternalRow => Row = _ + + protected[sql] def initConverter(dataSchema: StructType) = { + converter = + CatalystTypeConverters.createToScalaConverter(dataSchema).asInstanceOf[InternalRow => Row] + } + + protected[sql] def writeInternal(row: InternalRow): Unit = { + write(converter(row)) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 4a308ff1a32f8..6faafed1e6290 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -425,150 +425,6 @@ class ParquetFileFormat } } -/** - * A factory for generating OutputWriters for writing parquet files. This implemented is different - * from the [[ParquetOutputWriter]] as this does not use any [[OutputCommitter]]. It simply - * writes the data to the path used to generate the output writer. Callers of this factory - * has to ensure which files are to be considered as committed. 
- */ -private[parquet] class ParquetOutputWriterFactory( - sqlConf: SQLConf, - dataSchema: StructType, - hadoopConf: Configuration, - options: Map[String, String]) extends OutputWriterFactory { - - private val serializableConf: SerializableConfiguration = { - val job = Job.getInstance(hadoopConf) - val conf = ContextUtil.getConfiguration(job) - val parquetOptions = new ParquetOptions(options, sqlConf) - - // We're not really using `ParquetOutputFormat[Row]` for writing data here, because we override - // it in `ParquetOutputWriter` to support appending and dynamic partitioning. The reason why - // we set it here is to setup the output committer class to `ParquetOutputCommitter`, which is - // bundled with `ParquetOutputFormat[Row]`. - job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]]) - - ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport]) - - // We want to clear this temporary metadata from saving into Parquet file. - // This metadata is only useful for detecting optional columns when pushing down filters. - val dataSchemaToWrite = StructType.removeMetadata( - StructType.metadataKeyForOptionalField, - dataSchema).asInstanceOf[StructType] - ParquetWriteSupport.setSchema(dataSchemaToWrite, conf) - - // Sets flags for `CatalystSchemaConverter` (which converts Catalyst schema to Parquet schema) - // and `CatalystWriteSupport` (writing actual rows to Parquet files). - conf.set( - SQLConf.PARQUET_BINARY_AS_STRING.key, - sqlConf.isParquetBinaryAsString.toString) - - conf.set( - SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, - sqlConf.isParquetINT96AsTimestamp.toString) - - conf.set( - SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key, - sqlConf.writeLegacyParquetFormat.toString) - - // Sets compression scheme - conf.set(ParquetOutputFormat.COMPRESSION, parquetOptions.compressionCodec) - new SerializableConfiguration(conf) - } - - /** - * Returns a [[OutputWriter]] that writes data to the give path without using - * [[OutputCommitter]]. - */ - override def newWriter(path: String): OutputWriter = new OutputWriter { - - // Create TaskAttemptContext that is used to pass on Configuration to the ParquetRecordWriter - private val hadoopTaskAttemptId = new TaskAttemptID(new TaskID(new JobID, TaskType.MAP, 0), 0) - private val hadoopAttemptContext = new TaskAttemptContextImpl( - serializableConf.value, hadoopTaskAttemptId) - - // Instance of ParquetRecordWriter that does not use OutputCommitter - private val recordWriter = createNoCommitterRecordWriter(path, hadoopAttemptContext) - - override def write(row: Row): Unit = { - throw new UnsupportedOperationException("call writeInternal") - } - - protected[sql] override def writeInternal(row: InternalRow): Unit = { - recordWriter.write(null, row) - } - - override def close(): Unit = recordWriter.close(hadoopAttemptContext) - } - - /** Create a [[ParquetRecordWriter]] that writes the given path without using OutputCommitter */ - private def createNoCommitterRecordWriter( - path: String, - hadoopAttemptContext: TaskAttemptContext): RecordWriter[Void, InternalRow] = { - // Custom ParquetOutputFormat that disable use of committer and writes to the given path - val outputFormat = new ParquetOutputFormat[InternalRow]() { - override def getOutputCommitter(c: TaskAttemptContext): OutputCommitter = { null } - override def getDefaultWorkFile(c: TaskAttemptContext, ext: String): Path = { new Path(path) } - } - outputFormat.getRecordWriter(hadoopAttemptContext) - } - - /** Disable the use of the older API. 
*/ - def newInstance( - path: String, - bucketId: Option[Int], - dataSchema: StructType, - context: TaskAttemptContext): OutputWriter = { - throw new UnsupportedOperationException( - "this version of newInstance not supported for " + - "ParquetOutputWriterFactory") - } -} - - -// NOTE: This class is instantiated and used on executor side only, no need to be serializable. -private[parquet] class ParquetOutputWriter( - path: String, - bucketId: Option[Int], - context: TaskAttemptContext) - extends OutputWriter { - - private val recordWriter: RecordWriter[Void, InternalRow] = { - val outputFormat = { - new ParquetOutputFormat[InternalRow]() { - // Here we override `getDefaultWorkFile` for two reasons: - // - // 1. To allow appending. We need to generate unique output file names to avoid - // overwriting existing files (either exist before the write job, or are just written - // by other tasks within the same write job). - // - // 2. To allow dynamic partitioning. Default `getDefaultWorkFile` uses - // `FileOutputCommitter.getWorkPath()`, which points to the base directory of all - // partitions in the case of dynamic partitioning. - override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) - val taskAttemptId = context.getTaskAttemptID - val split = taskAttemptId.getTaskID.getId - val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") - // It has the `.parquet` extension at the end because (de)compression tools - // such as gunzip would not be able to decompress this as the compression - // is not applied on this whole file but on each "page" in Parquet format. - new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$bucketString$extension") - } - } - } - - outputFormat.getRecordWriter(context) - } - - override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal") - - override def writeInternal(row: InternalRow): Unit = recordWriter.write(null, row) - - override def close(): Unit = recordWriter.close(context) -} - object ParquetFileFormat extends Logging { private[parquet] def readSchema( footers: Seq[Footer], sparkSession: SparkSession): Option[StructType] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala new file mode 100644 index 0000000000000..f89ce05d82d90 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + package org.apache.spark.sql.execution.datasources.parquet + + import org.apache.hadoop.conf.Configuration + import org.apache.hadoop.fs.Path + import org.apache.hadoop.mapreduce._ + import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl + import org.apache.parquet.hadoop.{ParquetOutputFormat, ParquetRecordWriter} + import org.apache.parquet.hadoop.util.ContextUtil + + import org.apache.spark.sql.Row + import org.apache.spark.sql.catalyst.InternalRow + import org.apache.spark.sql.execution.datasources.{BucketingUtils, OutputWriter, OutputWriterFactory, WriterContainer} + import org.apache.spark.sql.internal.SQLConf + import org.apache.spark.sql.types.StructType + import org.apache.spark.util.SerializableConfiguration + + + /** + * A factory for generating OutputWriters for writing parquet files. This implementation is different + * from the [[ParquetOutputWriter]] as this does not use any [[OutputCommitter]]. It simply + * writes the data to the path used to generate the output writer. Callers of this factory + * have to ensure which files are to be considered as committed. + */ + private[parquet] class ParquetOutputWriterFactory( + sqlConf: SQLConf, + dataSchema: StructType, + hadoopConf: Configuration, + options: Map[String, String]) + extends OutputWriterFactory { + + private val serializableConf: SerializableConfiguration = { + val job = Job.getInstance(hadoopConf) + val conf = ContextUtil.getConfiguration(job) + val parquetOptions = new ParquetOptions(options, sqlConf) + + // We're not really using `ParquetOutputFormat[Row]` for writing data here, because we override + // it in `ParquetOutputWriter` to support appending and dynamic partitioning. The reason why + // we set it here is to set up the output committer class to `ParquetOutputCommitter`, which is + // bundled with `ParquetOutputFormat[Row]`. + job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]]) + + ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport]) + + // We want to clear this temporary metadata from saving into Parquet file. + // This metadata is only useful for detecting optional columns when pushing down filters. + val dataSchemaToWrite = StructType.removeMetadata( + StructType.metadataKeyForOptionalField, + dataSchema).asInstanceOf[StructType] + ParquetWriteSupport.setSchema(dataSchemaToWrite, conf) + + // Sets flags for `CatalystSchemaConverter` (which converts Catalyst schema to Parquet schema) + // and `CatalystWriteSupport` (writing actual rows to Parquet files). + conf.set( + SQLConf.PARQUET_BINARY_AS_STRING.key, + sqlConf.isParquetBinaryAsString.toString) + + conf.set( + SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, + sqlConf.isParquetINT96AsTimestamp.toString) + + conf.set( + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key, + sqlConf.writeLegacyParquetFormat.toString) + + // Sets compression scheme + conf.set(ParquetOutputFormat.COMPRESSION, parquetOptions.compressionCodec) + new SerializableConfiguration(conf) + } + + /** + * Returns a [[OutputWriter]] that writes data to the given path without using + * [[OutputCommitter]].
+ */ + override def newWriter(path: String): OutputWriter = new OutputWriter { + + // Create TaskAttemptContext that is used to pass on Configuration to the ParquetRecordWriter + private val hadoopTaskAttemptId = new TaskAttemptID(new TaskID(new JobID, TaskType.MAP, 0), 0) + private val hadoopAttemptContext = new TaskAttemptContextImpl( + serializableConf.value, hadoopTaskAttemptId) + + // Instance of ParquetRecordWriter that does not use OutputCommitter + private val recordWriter = createNoCommitterRecordWriter(path, hadoopAttemptContext) + + override def write(row: Row): Unit = { + throw new UnsupportedOperationException("call writeInternal") + } + + protected[sql] override def writeInternal(row: InternalRow): Unit = { + recordWriter.write(null, row) + } + + override def close(): Unit = recordWriter.close(hadoopAttemptContext) + } + + /** Create a [[ParquetRecordWriter]] that writes the given path without using OutputCommitter */ + private def createNoCommitterRecordWriter( + path: String, + hadoopAttemptContext: TaskAttemptContext): RecordWriter[Void, InternalRow] = { + // Custom ParquetOutputFormat that disable use of committer and writes to the given path + val outputFormat = new ParquetOutputFormat[InternalRow]() { + override def getOutputCommitter(c: TaskAttemptContext): OutputCommitter = { null } + override def getDefaultWorkFile(c: TaskAttemptContext, ext: String): Path = { new Path(path) } + } + outputFormat.getRecordWriter(hadoopAttemptContext) + } + + /** Disable the use of the older API. */ + def newInstance( + path: String, + bucketId: Option[Int], + dataSchema: StructType, + context: TaskAttemptContext): OutputWriter = { + throw new UnsupportedOperationException( + "this version of newInstance not supported for " + + "ParquetOutputWriterFactory") + } +} + + +// NOTE: This class is instantiated and used on executor side only, no need to be serializable. +private[parquet] class ParquetOutputWriter( + path: String, + bucketId: Option[Int], + context: TaskAttemptContext) + extends OutputWriter { + + private val recordWriter: RecordWriter[Void, InternalRow] = { + val outputFormat = { + new ParquetOutputFormat[InternalRow]() { + // Here we override `getDefaultWorkFile` for two reasons: + // + // 1. To allow appending. We need to generate unique output file names to avoid + // overwriting existing files (either exist before the write job, or are just written + // by other tasks within the same write job). + // + // 2. To allow dynamic partitioning. Default `getDefaultWorkFile` uses + // `FileOutputCommitter.getWorkPath()`, which points to the base directory of all + // partitions in the case of dynamic partitioning. + override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { + val configuration = context.getConfiguration + val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) + val taskAttemptId = context.getTaskAttemptID + val split = taskAttemptId.getTaskID.getId + val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") + // It has the `.parquet` extension at the end because (de)compression tools + // such as gunzip would not be able to decompress this as the compression + // is not applied on this whole file but on each "page" in Parquet format. 
+ new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$bucketString$extension") + } + } + } + + outputFormat.getRecordWriter(context) + } + + override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal") + + override def writeInternal(row: InternalRow): Unit = recordWriter.write(null, row) + + override def close(): Unit = recordWriter.close(context) +} From 2fb12b0a33deeeadfac451095f64dea6c967caac Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 14 Oct 2016 15:53:50 +0800 Subject: [PATCH 010/162] [SPARK-17903][SQL] MetastoreRelation should talk to external catalog instead of hive client ## What changes were proposed in this pull request? `HiveExternalCatalog` should be the only interface to talk to the hive metastore. In `MetastoreRelation` we can just use `ExternalCatalog` instead of `HiveClient` to interact with hive metastore, and add missing API in `ExternalCatalog`. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #15460 from cloud-fan/relation. --- .../catalyst/catalog/ExternalCatalog.scala | 13 +++++++++++++ .../catalyst/catalog/InMemoryCatalog.scala | 8 ++++++++ .../spark/sql/hive/HiveExternalCatalog.scala | 8 ++++++++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 7 +++---- .../spark/sql/hive/MetastoreRelation.scala | 19 ++++++++++++------- .../apache/spark/sql/hive/TableReader.scala | 3 +-- .../spark/sql/hive/client/HiveClient.scala | 15 +++------------ .../sql/hive/client/HiveClientImpl.scala | 10 ++++++---- .../sql/hive/HiveExternalCatalogSuite.scala | 9 +++++++++ .../sql/hive/MetastoreRelationSuite.scala | 2 +- .../spark/sql/hive/client/VersionsSuite.scala | 4 ++-- 11 files changed, 66 insertions(+), 32 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index dd93b467eeeb2..348d3d0be2152 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.catalog import org.apache.spark.sql.catalyst.analysis.{FunctionAlreadyExistsException, NoSuchDatabaseException, NoSuchFunctionException} +import org.apache.spark.sql.catalyst.expressions.Expression /** @@ -196,6 +197,18 @@ abstract class ExternalCatalog { table: String, partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] + /** + * List the metadata of selected partitions according to the given partition predicates. 
+ * + * @param db database name + * @param table table name + * @param predicates partition predicated + */ + def listPartitionsByFilter( + db: String, + table: String, + predicates: Seq[Expression]): Seq[CatalogTablePartition] + // -------------------------------------------------------------------------- // Functions // -------------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 3e31127118b44..49280f82e20be 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -28,6 +28,7 @@ import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.analysis._ +import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.util.StringUtils /** @@ -477,6 +478,13 @@ class InMemoryCatalog( catalog(db).tables(table).partitions.values.toSeq } + override def listPartitionsByFilter( + db: String, + table: String, + predicates: Seq[Expression]): Seq[CatalogTablePartition] = { + throw new UnsupportedOperationException("listPartitionsByFilter is not implemented.") + } + // -------------------------------------------------------------------------- // Functions // -------------------------------------------------------------------------- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 237b829da882f..b5d93c3d7c804 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Statistics} import org.apache.spark.sql.execution.command.{ColumnStatStruct, DDLUtils} import org.apache.spark.sql.execution.datasources.CaseInsensitiveMap @@ -646,6 +647,13 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat client.getPartitions(db, table, partialSpec) } + override def listPartitionsByFilter( + db: String, + table: String, + predicates: Seq[Expression]): Seq[CatalogTablePartition] = { + client.getPartitionsByFilter(db, table, predicates) + } + // -------------------------------------------------------------------------- // Functions // -------------------------------------------------------------------------- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 8410a2e4a47ca..c44f0adda44c0 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -44,8 +44,6 @@ import org.apache.spark.sql.types._ */ private[hive] class 
HiveMetastoreCatalog(sparkSession: SparkSession) extends Logging { private val sessionState = sparkSession.sessionState.asInstanceOf[HiveSessionState] - private val client = - sparkSession.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client /** A fully qualified identifier for a table (i.e., database.tableName) */ case class QualifiedTableName(database: String, name: String) @@ -104,7 +102,8 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log def hiveDefaultTableFilePath(tableIdent: TableIdentifier): String = { // Code based on: hiveWarehouse.getTablePath(currentDatabase, tableName) val QualifiedTableName(dbName, tblName) = getQualifiedTableName(tableIdent) - new Path(new Path(client.getDatabase(dbName).locationUri), tblName).toString + val dbLocation = sparkSession.sharedState.externalCatalog.getDatabase(dbName).locationUri + new Path(new Path(dbLocation), tblName).toString } def lookupRelation( @@ -129,7 +128,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log } else { val qualifiedTable = MetastoreRelation( - qualifiedTableName.database, qualifiedTableName.name)(table, client, sparkSession) + qualifiedTableName.database, qualifiedTableName.name)(table, sparkSession) alias.map(a => SubqueryAlias(a, qualifiedTable, None)).getOrElse(qualifiedTable) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala index 33f0ecff63529..da809cf991de2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala @@ -43,7 +43,6 @@ private[hive] case class MetastoreRelation( databaseName: String, tableName: String) (val catalogTable: CatalogTable, - @transient private val client: HiveClient, @transient private val sparkSession: SparkSession) extends LeafNode with MultiInstanceRelation with FileRelation with CatalogRelation { @@ -59,7 +58,7 @@ private[hive] case class MetastoreRelation( Objects.hashCode(databaseName, tableName, output) } - override protected def otherCopyArgs: Seq[AnyRef] = catalogTable :: client :: sparkSession :: Nil + override protected def otherCopyArgs: Seq[AnyRef] = catalogTable :: sparkSession :: Nil private def toHiveColumn(c: StructField): FieldSchema = { new FieldSchema(c.name, c.dataType.catalogString, c.getComment.orNull) @@ -146,11 +145,18 @@ private[hive] case class MetastoreRelation( // When metastore partition pruning is turned off, we cache the list of all partitions to // mimic the behavior of Spark < 1.5 - private lazy val allPartitions: Seq[CatalogTablePartition] = client.getPartitions(catalogTable) + private lazy val allPartitions: Seq[CatalogTablePartition] = { + sparkSession.sharedState.externalCatalog.listPartitions( + catalogTable.database, + catalogTable.identifier.table) + } def getHiveQlPartitions(predicates: Seq[Expression] = Nil): Seq[Partition] = { val rawPartitions = if (sparkSession.sessionState.conf.metastorePartitionPruning) { - client.getPartitionsByFilter(catalogTable, predicates) + sparkSession.sharedState.externalCatalog.listPartitionsByFilter( + catalogTable.database, + catalogTable.identifier.table, + predicates) } else { allPartitions } @@ -234,8 +240,7 @@ private[hive] case class MetastoreRelation( val columnOrdinals = AttributeMap(attributes.zipWithIndex) override def inputFiles: Array[String] = { - val partLocations = client - 
.getPartitionsByFilter(catalogTable, Nil) + val partLocations = allPartitions .flatMap(_.storage.locationUri) .toArray if (partLocations.nonEmpty) { @@ -248,6 +253,6 @@ private[hive] case class MetastoreRelation( } override def newInstance(): MetastoreRelation = { - MetastoreRelation(databaseName, tableName)(catalogTable, client, sparkSession) + MetastoreRelation(databaseName, tableName)(catalogTable, sparkSession) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala index 2a54163a04e9b..aaf30f41f29c2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala @@ -149,8 +149,7 @@ class HadoopTableReader( * subdirectory of each partition being read. If None, then all files are accepted. */ def makeRDDForPartitionedTable( - partitionToDeserializer: Map[HivePartition, - Class[_ <: Deserializer]], + partitionToDeserializer: Map[HivePartition, Class[_ <: Deserializer]], filterOpt: Option[PathFilter]): RDD[InternalRow] = { // SPARK-5068:get FileStatus and do the filtering locally when the path is not exists diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala index 984d23bb09dbd..9ee3d629c9977 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala @@ -172,24 +172,15 @@ private[hive] trait HiveClient { * Returns the partitions for the given table that match the supplied partition spec. * If no partition spec is specified, all partitions are returned. */ - final def getPartitions( + def getPartitions( db: String, table: String, - partialSpec: Option[TablePartitionSpec]): Seq[CatalogTablePartition] = { - getPartitions(getTable(db, table), partialSpec) - } - - /** - * Returns the partitions for the given table that match the supplied partition spec. - * If no partition spec is specified, all partitions are returned. - */ - def getPartitions( - table: CatalogTable, partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] /** Returns partitions filtered by predicates for the given table. */ def getPartitionsByFilter( - table: CatalogTable, + db: String, + table: String, predicates: Seq[Expression]): Seq[CatalogTablePartition] /** Loads a static partition into an existing table. */ diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index dd33d750a4d45..5c8f7ff1af9fa 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -525,9 +525,10 @@ private[hive] class HiveClientImpl( * If no partition spec is specified, all partitions are returned. 
*/ override def getPartitions( - table: CatalogTable, + db: String, + table: String, spec: Option[TablePartitionSpec]): Seq[CatalogTablePartition] = withHiveState { - val hiveTable = toHiveTable(table) + val hiveTable = toHiveTable(getTable(db, table)) spec match { case None => shim.getAllPartitions(client, hiveTable).map(fromHivePartition) case Some(s) => client.getPartitions(hiveTable, s.asJava).asScala.map(fromHivePartition) @@ -535,9 +536,10 @@ private[hive] class HiveClientImpl( } override def getPartitionsByFilter( - table: CatalogTable, + db: String, + table: String, predicates: Seq[Expression]): Seq[CatalogTablePartition] = withHiveState { - val hiveTable = toHiveTable(table) + val hiveTable = toHiveTable(getTable(db, table)) shim.getPartitionsByFilter(client, hiveTable, predicates).map(fromHivePartition) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala index 26c2549820de6..efa0beb85030b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala @@ -21,6 +21,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.catalyst.dsl.expressions._ /** * Test suite for the [[HiveExternalCatalog]]. @@ -43,4 +44,12 @@ class HiveExternalCatalogSuite extends ExternalCatalogSuite { externalCatalog.client.reset() } + import utils._ + + test("list partitions by filter") { + val catalog = newBasicCatalog() + val selectedPartitions = catalog.listPartitionsByFilter("db2", "tbl2", Seq('a.int === 1)) + assert(selectedPartitions.length == 1) + assert(selectedPartitions.head.spec == part1.spec) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala index 2f3055dcac4c5..c28e41a85c39d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala @@ -29,7 +29,7 @@ class MetastoreRelationSuite extends SparkFunSuite { tableType = CatalogTableType.VIEW, storage = CatalogStorageFormat.empty, schema = StructType(StructField("a", IntegerType, true) :: Nil)) - val relation = MetastoreRelation("db", "test")(table, null, null) + val relation = MetastoreRelation("db", "test")(table, null) // No exception should be thrown relation.makeCopy(Array("db", "test")) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index 9a10957c8efa5..c158bf1ab09cb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -295,12 +295,12 @@ class VersionsSuite extends SparkFunSuite with Logging { } test(s"$version: getPartitions(catalogTable)") { - assert(2 == client.getPartitions(client.getTable("default", "src_part")).size) + assert(2 == client.getPartitions("default", "src_part").size) } test(s"$version: getPartitionsByFilter") { // Only one partition [1, 1] for key2 == 1 - val result = client.getPartitionsByFilter(client.getTable("default", "src_part"), + val result = client.getPartitionsByFilter("default", "src_part", 
Seq(EqualTo(AttributeReference("key2", IntegerType)(), Literal(1)))) // Hive 0.12 doesn't support getPartitionsByFilter, it ignores the filter condition. From 1db8feab8c564053c05e8bdc1a7f5026fd637d4f Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 14 Oct 2016 04:17:03 -0700 Subject: [PATCH 011/162] [SPARK-15402][ML][PYSPARK] PySpark ml.evaluation should support save/load ## What changes were proposed in this pull request? Since ```ml.evaluation``` has supported save/load at Scala side, supporting it at Python side is very straightforward and easy. ## How was this patch tested? Add python doctest. Author: Yanbo Liang Closes #13194 from yanboliang/spark-15402. --- python/pyspark/ml/evaluation.py | 45 ++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index 1fe8772da772a..7aa16fa5b90f2 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -22,6 +22,7 @@ from pyspark.ml.param import Param, Params, TypeConverters from pyspark.ml.param.shared import HasLabelCol, HasPredictionCol, HasRawPredictionCol from pyspark.ml.common import inherit_doc +from pyspark.ml.util import JavaMLReadable, JavaMLWritable __all__ = ['Evaluator', 'BinaryClassificationEvaluator', 'RegressionEvaluator', 'MulticlassClassificationEvaluator'] @@ -103,7 +104,8 @@ def isLargerBetter(self): @inherit_doc -class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol): +class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol, + JavaMLReadable, JavaMLWritable): """ .. note:: Experimental @@ -121,6 +123,11 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction 0.70... >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"}) 0.83... + >>> bce_path = temp_path + "/bce" + >>> evaluator.save(bce_path) + >>> evaluator2 = BinaryClassificationEvaluator.load(bce_path) + >>> str(evaluator2.getRawPredictionCol()) + 'raw' .. versionadded:: 1.4.0 """ @@ -172,7 +179,8 @@ def setParams(self, rawPredictionCol="rawPrediction", labelCol="label", @inherit_doc -class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol): +class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, + JavaMLReadable, JavaMLWritable): """ .. note:: Experimental @@ -190,6 +198,11 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol): 0.993... >>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"}) 2.649... + >>> re_path = temp_path + "/re" + >>> evaluator.save(re_path) + >>> evaluator2 = RegressionEvaluator.load(re_path) + >>> str(evaluator2.getPredictionCol()) + 'raw' .. versionadded:: 1.4.0 """ @@ -244,7 +257,8 @@ def setParams(self, predictionCol="prediction", labelCol="label", @inherit_doc -class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol): +class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, + JavaMLReadable, JavaMLWritable): """ .. note:: Experimental @@ -260,6 +274,11 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio 0.66... >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"}) 0.66... + >>> mce_path = temp_path + "/mce" + >>> evaluator.save(mce_path) + >>> evaluator2 = MulticlassClassificationEvaluator.load(mce_path) + >>> str(evaluator2.getPredictionCol()) + 'prediction' .. 
versionadded:: 1.5.0 """ @@ -311,19 +330,27 @@ def setParams(self, predictionCol="prediction", labelCol="label", if __name__ == "__main__": import doctest + import tempfile + import pyspark.ml.evaluation from pyspark.sql import SparkSession - globs = globals().copy() + globs = pyspark.ml.evaluation.__dict__.copy() # The small batch size here ensures that we see multiple batches, # even in these small test examples: spark = SparkSession.builder\ .master("local[2]")\ .appName("ml.evaluation tests")\ .getOrCreate() - sc = spark.sparkContext - globs['sc'] = sc globs['spark'] = spark - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() + temp_path = tempfile.mkdtemp() + globs['temp_path'] = temp_path + try: + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + spark.stop() + finally: + from shutil import rmtree + try: + rmtree(temp_path) + except OSError: + pass if failure_count: exit(-1) From a1b136d05c6c458ae8211b0844bfc98d7693fa42 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 14 Oct 2016 04:25:14 -0700 Subject: [PATCH 012/162] [SPARK-14634][ML] Add BisectingKMeansSummary ## What changes were proposed in this pull request? Add BisectingKMeansSummary ## How was this patch tested? unit test Author: Zheng RuiFeng Closes #12394 from zhengruifeng/biKMSummary. --- .../spark/ml/clustering/BisectingKMeans.scala | 74 ++++++++++++++++++- .../ml/clustering/BisectingKMeansSuite.scala | 18 ++++- .../ml/clustering/GaussianMixtureSuite.scala | 2 +- .../spark/ml/clustering/KMeansSuite.scala | 2 +- 4 files changed, 91 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index a97bd0fb16fd7..add8ee2a4ff8e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.clustering import org.apache.hadoop.fs.Path +import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.{Vector, VectorUDT} @@ -127,6 +128,29 @@ class BisectingKMeansModel private[ml] ( @Since("2.0.0") override def write: MLWriter = new BisectingKMeansModel.BisectingKMeansModelWriter(this) + + private var trainingSummary: Option[BisectingKMeansSummary] = None + + private[clustering] def setSummary(summary: BisectingKMeansSummary): this.type = { + this.trainingSummary = Some(summary) + this + } + + /** + * Return true if there exists summary of model. + */ + @Since("2.1.0") + def hasSummary: Boolean = trainingSummary.nonEmpty + + /** + * Gets summary of model on training set. An exception is + * thrown if `trainingSummary == None`. 
+ */ + @Since("2.1.0") + def summary: BisectingKMeansSummary = trainingSummary.getOrElse { + throw new SparkException( + s"No training summary available for the ${this.getClass.getSimpleName}") + } } object BisectingKMeansModel extends MLReadable[BisectingKMeansModel] { @@ -228,14 +252,22 @@ class BisectingKMeans @Since("2.0.0") ( case Row(point: Vector) => OldVectors.fromML(point) } + val instr = Instrumentation.create(this, rdd) + instr.logParams(featuresCol, predictionCol, k, maxIter, seed, minDivisibleClusterSize) + val bkm = new MLlibBisectingKMeans() .setK($(k)) .setMaxIterations($(maxIter)) .setMinDivisibleClusterSize($(minDivisibleClusterSize)) .setSeed($(seed)) val parentModel = bkm.run(rdd) - val model = new BisectingKMeansModel(uid, parentModel) - copyValues(model.setParent(this)) + val model = copyValues(new BisectingKMeansModel(uid, parentModel).setParent(this)) + val summary = new BisectingKMeansSummary( + model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) + model.setSummary(summary) + val m = model.setSummary(summary) + instr.logSuccess(m) + m } @Since("2.0.0") @@ -251,3 +283,41 @@ object BisectingKMeans extends DefaultParamsReadable[BisectingKMeans] { @Since("2.0.0") override def load(path: String): BisectingKMeans = super.load(path) } + + +/** + * :: Experimental :: + * Summary of BisectingKMeans. + * + * @param predictions [[DataFrame]] produced by [[BisectingKMeansModel.transform()]] + * @param predictionCol Name for column of predicted clusters in `predictions` + * @param featuresCol Name for column of features in `predictions` + * @param k Number of clusters + */ +@Since("2.1.0") +@Experimental +class BisectingKMeansSummary private[clustering] ( + @Since("2.1.0") @transient val predictions: DataFrame, + @Since("2.1.0") val predictionCol: String, + @Since("2.1.0") val featuresCol: String, + @Since("2.1.0") val k: Int) extends Serializable { + + /** + * Cluster centers of the transformed data. + */ + @Since("2.1.0") + @transient lazy val cluster: DataFrame = predictions.select(predictionCol) + + /** + * Size of (number of data points in) each cluster. 
+ */ + @Since("2.1.0") + lazy val clusterSizes: Array[Long] = { + val sizes = Array.fill[Long](k)(0) + cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach { + case Row(cluster: Int, count: Long) => sizes(cluster) = count + } + sizes + } + +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala index 4f7d4418a8d09..f2368a9f8dad5 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala @@ -68,7 +68,7 @@ class BisectingKMeansSuite } } - test("fit & transform") { + test("fit, transform and summary") { val predictionColName = "bisecting_kmeans_prediction" val bkm = new BisectingKMeans().setK(k).setPredictionCol(predictionColName).setSeed(1) val model = bkm.fit(dataset) @@ -85,6 +85,22 @@ class BisectingKMeansSuite assert(clusters === Set(0, 1, 2, 3, 4)) assert(model.computeCost(dataset) < 0.1) assert(model.hasParent) + + // Check validity of model summary + val numRows = dataset.count() + assert(model.hasSummary) + val summary: BisectingKMeansSummary = model.summary + assert(summary.predictionCol === predictionColName) + assert(summary.featuresCol === "features") + assert(summary.predictions.count() === numRows) + for (c <- Array(predictionColName, "features")) { + assert(summary.predictions.columns.contains(c)) + } + assert(summary.cluster.columns === Array(predictionColName)) + val clusterSizes = summary.clusterSizes + assert(clusterSizes.length === k) + assert(clusterSizes.sum === numRows) + assert(clusterSizes.forall(_ >= 0)) } test("read/write") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala index 04366f5250287..003fa6abf6597 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala @@ -70,7 +70,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext } } - test("fit, transform, and summary") { + test("fit, transform and summary") { val predictionColName = "gm_prediction" val probabilityColName = "gm_probability" val gm = new GaussianMixture().setK(k).setMaxIter(2).setPredictionCol(predictionColName) diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala index c9ba5a288aadf..ca392653557c4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala @@ -82,7 +82,7 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR } } - test("fit, transform, and summary") { + test("fit, transform and summary") { val predictionColName = "kmeans_prediction" val kmeans = new KMeans().setK(k).setPredictionCol(predictionColName).setSeed(1) val model = kmeans.fit(dataset) From c8b612decba28e51789891f7881b6d4ebc50e2bb Mon Sep 17 00:00:00 2001 From: Peng Date: Fri, 14 Oct 2016 12:48:57 +0100 Subject: [PATCH 013/162] [SPARK-17870][MLLIB][ML] Change statistic to pValue for SelectKBest and SelectPercentile because of DoF difference ## What changes were proposed in this pull request? 
The feature selection method ChiSquareSelector selects features based on ChiSquareTestResult.statistic (the chi-square value), keeping the features with the largest chi-square values. However, the degrees of freedom (df) of the chi-square values returned by Statistics.chiSqTest(RDD) can differ between features, and chi-square values with different df are not comparable, so selection cannot be based on the raw chi-square value. We therefore change statistic to pValue for SelectKBest and SelectPercentile. ## How was this patch tested? Changed existing tests. Author: Peng Closes #15444 from mpjlu/chisqure-bug. --- .../org/apache/spark/mllib/feature/ChiSqSelector.scala | 4 ++-- .../org/apache/spark/ml/feature/ChiSqSelectorSuite.scala | 6 +++--- .../apache/spark/mllib/feature/ChiSqSelectorSuite.scala | 8 ++++---- python/pyspark/ml/feature.py | 4 ++-- python/pyspark/mllib/feature.py | 8 ++++---- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index c305b36278e87..f8276de4f23d4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -234,11 +234,11 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { val features = selectorType match { case ChiSqSelector.KBest => chiSqTestResult - .sortBy { case (res, _) => -res.statistic } + .sortBy { case (res, _) => res.pValue } .take(numTopFeatures) case ChiSqSelector.Percentile => chiSqTestResult - .sortBy { case (res, _) => -res.statistic } + .sortBy { case (res, _) => res.pValue } .take((chiSqTestResult.length * percentile).toInt) case ChiSqSelector.FPR => chiSqTestResult diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala index dfebfc87ea1d3..6af06d82d671a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala @@ -38,10 +38,10 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext ) val preFilteredData = Seq( - Vectors.dense(0.0), - Vectors.dense(6.0), Vectors.dense(8.0), - Vectors.dense(5.0) + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(8.0) ) val df = sc.parallelize(data.zip(preFilteredData)) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index ec23a4aa7364d..ac702b4b7c69e 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -54,10 +54,10 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(6.0))), - LabeledPoint(1.0, Vectors.dense(Array(8.0))), - LabeledPoint(2.0, Vectors.dense(Array(5.0)))) + Set(LabeledPoint(0.0, Vectors.dense(Array(8.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp =>
LabeledPoint(lp.label, model.transform(lp.features)) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index a33c3e79453e1..7683360664ebd 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2592,9 +2592,9 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja >>> selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures") >>> model = selector.fit(df) >>> model.transform(df).head().selectedFeatures - DenseVector([1.0]) + DenseVector([18.0]) >>> model.selectedFeatures - [3] + [2] >>> chiSqSelectorPath = temp_path + "/chi-sq-selector" >>> selector.save(chiSqSelectorPath) >>> loadedSelector = ChiSqSelector.load(chiSqSelectorPath) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 4aea81840a162..50ef7c7901c2c 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -288,15 +288,15 @@ class ChiSqSelector(object): ... ] >>> model = ChiSqSelector().setNumTopFeatures(1).fit(sc.parallelize(data)) >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0})) - SparseVector(1, {0: 6.0}) + SparseVector(1, {}) >>> model.transform(DenseVector([8.0, 9.0, 5.0])) - DenseVector([5.0]) + DenseVector([8.0]) >>> model = ChiSqSelector().setSelectorType("percentile").setPercentile(0.34).fit( ... sc.parallelize(data)) >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0})) - SparseVector(1, {0: 6.0}) + SparseVector(1, {}) >>> model.transform(DenseVector([8.0, 9.0, 5.0])) - DenseVector([5.0]) + DenseVector([8.0]) >>> data = [ ... LabeledPoint(0.0, SparseVector(4, {0: 8.0, 1: 7.0})), ... LabeledPoint(1.0, SparseVector(4, {1: 9.0, 2: 6.0, 3: 4.0})), From 28b645b1e643ae0f6c56cbe5a92356623306717f Mon Sep 17 00:00:00 2001 From: invkrh Date: Fri, 14 Oct 2016 12:52:08 +0100 Subject: [PATCH 014/162] [SPARK-17855][CORE] Remove query string from jar url ## What changes were proposed in this pull request? Spark-submit supports jar URLs with the http protocol. However, if the URL contains a query string, the `worker.DriverRunner.downloadUserJar()` method throws a "Did not see expected jar" exception, because it checks for the existence of a downloaded jar whose name still contains the query string. This is a problem when the jar is hosted on a web service that requires additional query parameters to retrieve the file. This PR simply strips the query string before checking that the jar exists on the worker. ## How was this patch tested? For now, this patch can only be tested manually: * Deploy a Spark cluster locally * Make sure the Apache httpd service is running * Save an uber jar, e.g. spark-job.jar, under `/var/www/html/` * Use http://localhost/spark-job.jar?param=1 as the jar URL when running `spark-submit` * The job should be launched Author: invkrh Closes #15420 from invkrh/spark-17855.
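As an illustration (not part of this patch), the sketch below shows the renaming idea: taking the last path segment of the URI drops any query string, so the expected local file name no longer contains `?param=1`. The object name is arbitrary and the URL is the hypothetical one from the manual test above.

```scala
import java.net.URI

object JarNameFromUrl {
  def main(args: Array[String]): Unit = {
    // Jar URL that carries a query string, as in the manual test scenario above.
    val jarUrl = "http://localhost/spark-job.jar?param=1"
    // getPath drops the query string; the last segment is the plain file name.
    val jarFileName = new URI(jarUrl).getPath.split("/").last
    println(jarFileName) // spark-job.jar
  }
}
```

This mirrors the approach taken in `downloadUserJar` in the diff below, where the expected file name is derived from the URI path rather than from the raw URL.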
--- .../spark/deploy/worker/DriverRunner.scala | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala index 289b0b93b0e84..e878c10183f61 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala @@ -18,12 +18,12 @@ package org.apache.spark.deploy.worker import java.io._ +import java.net.URI import java.nio.charset.StandardCharsets import scala.collection.JavaConverters._ import com.google.common.io.Files -import org.apache.hadoop.fs.Path import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.{DriverDescription, SparkHadoopUtil} @@ -147,30 +147,24 @@ private[deploy] class DriverRunner( * Will throw an exception if there are errors downloading the jar. */ private def downloadUserJar(driverDir: File): String = { - val jarPath = new Path(driverDesc.jarUrl) - val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) - val destPath = new File(driverDir.getAbsolutePath, jarPath.getName) - val jarFileName = jarPath.getName + val jarFileName = new URI(driverDesc.jarUrl).getPath.split("/").last val localJarFile = new File(driverDir, jarFileName) - val localJarFilename = localJarFile.getAbsolutePath - if (!localJarFile.exists()) { // May already exist if running multiple workers on one node - logInfo(s"Copying user jar $jarPath to $destPath") + logInfo(s"Copying user jar ${driverDesc.jarUrl} to $localJarFile") Utils.fetchFile( driverDesc.jarUrl, driverDir, conf, securityManager, - hadoopConf, + SparkHadoopUtil.get.newConfiguration(conf), System.currentTimeMillis(), useCache = false) + if (!localJarFile.exists()) { // Verify copy succeeded + throw new IOException( + s"Can not find expected jar $jarFileName which should have been loaded in $driverDir") + } } - - if (!localJarFile.exists()) { // Verify copy succeeded - throw new Exception(s"Did not see expected jar $jarFileName in $driverDir") - } - - localJarFilename + localJarFile.getAbsolutePath } private[worker] def prepareAndRunDriver(): Int = { From 7486442fe0b70f2aea21d569604e71d7ddf19a77 Mon Sep 17 00:00:00 2001 From: wangzhenhua Date: Fri, 14 Oct 2016 21:18:49 +0800 Subject: [PATCH 015/162] [SPARK-17073][SQL][FOLLOWUP] generate column-level statistics ## What changes were proposed in this pull request? This pr adds some test cases for statistics: case sensitive column names, non ascii column names, refresh table, and also improves some documentation. ## How was this patch tested? add test cases Author: wangzhenhua Closes #15360 from wzhfy/colStats2. 
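For readers unfamiliar with the statements these tests exercise, here is a rough sketch of how table-level and column-level statistics are generated; the table name `tbl` and column `key` are made up, and it assumes a Spark 2.1-style session where the `ANALYZE` commands are available (the tests in this patch run against the Hive-backed catalog).

```scala
import org.apache.spark.sql.SparkSession

object AnalyzeColumnsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("analyze-columns-sketch")
      .getOrCreate()

    // Hypothetical data source table used only for illustration.
    spark.sql("CREATE TABLE tbl (key INT) USING PARQUET")
    spark.sql("INSERT INTO tbl SELECT 1")

    // Table-level statistics (size in bytes, row count).
    spark.sql("ANALYZE TABLE tbl COMPUTE STATISTICS")
    // Column-level statistics for the listed columns; this is the code path
    // the new test cases cover.
    spark.sql("ANALYZE TABLE tbl COMPUTE STATISTICS FOR COLUMNS key")

    spark.stop()
  }
}
```

The computed statistics are stored in the catalog and read back on table lookup, which is what the refresh and case-sensitivity tests below verify.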
--- .../command/AnalyzeColumnCommand.scala | 53 ++--- .../apache/spark/sql/internal/SQLConf.scala | 3 +- .../spark/sql/hive/StatisticsSuite.scala | 198 +++++++++++++++--- 3 files changed, 197 insertions(+), 57 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index 7066378279971..488138709a12b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -59,10 +59,12 @@ case class AnalyzeColumnCommand( def updateStats(catalogTable: CatalogTable, newTotalSize: Long): Unit = { val (rowCount, columnStats) = computeColStats(sparkSession, relation) + // We also update table-level stats in order to keep them consistent with column-level stats. val statistics = Statistics( sizeInBytes = newTotalSize, rowCount = Some(rowCount), - colStats = columnStats ++ catalogTable.stats.map(_.colStats).getOrElse(Map())) + // Newly computed column stats should override the existing ones. + colStats = catalogTable.stats.map(_.colStats).getOrElse(Map()) ++ columnStats) sessionState.catalog.alterTable(catalogTable.copy(stats = Some(statistics))) // Refresh the cached data source table in the catalog. sessionState.catalog.refreshTable(tableIdentWithDB) @@ -90,8 +92,9 @@ case class AnalyzeColumnCommand( } } if (duplicatedColumns.nonEmpty) { - logWarning(s"Duplicated columns ${duplicatedColumns.mkString("(", ", ", ")")} detected " + - s"when analyzing columns ${columnNames.mkString("(", ", ", ")")}, ignoring them.") + logWarning("Duplicate column names were deduplicated in `ANALYZE TABLE` statement. " + + s"Input columns: ${columnNames.mkString("(", ", ", ")")}. " + + s"Duplicate columns: ${duplicatedColumns.mkString("(", ", ", ")")}.") } // Collect statistics per column. 
@@ -116,22 +119,24 @@ case class AnalyzeColumnCommand( } object ColumnStatStruct { - val zero = Literal(0, LongType) - val one = Literal(1, LongType) + private val zero = Literal(0, LongType) + private val one = Literal(1, LongType) - def numNulls(e: Expression): Expression = if (e.nullable) Sum(If(IsNull(e), one, zero)) else zero - def max(e: Expression): Expression = Max(e) - def min(e: Expression): Expression = Min(e) - def ndv(e: Expression, relativeSD: Double): Expression = { + private def numNulls(e: Expression): Expression = { + if (e.nullable) Sum(If(IsNull(e), one, zero)) else zero + } + private def max(e: Expression): Expression = Max(e) + private def min(e: Expression): Expression = Min(e) + private def ndv(e: Expression, relativeSD: Double): Expression = { // the approximate ndv should never be larger than the number of rows Least(Seq(HyperLogLogPlusPlus(e, relativeSD), Count(one))) } - def avgLength(e: Expression): Expression = Average(Length(e)) - def maxLength(e: Expression): Expression = Max(Length(e)) - def numTrues(e: Expression): Expression = Sum(If(e, one, zero)) - def numFalses(e: Expression): Expression = Sum(If(Not(e), one, zero)) + private def avgLength(e: Expression): Expression = Average(Length(e)) + private def maxLength(e: Expression): Expression = Max(Length(e)) + private def numTrues(e: Expression): Expression = Sum(If(e, one, zero)) + private def numFalses(e: Expression): Expression = Sum(If(Not(e), one, zero)) - def getStruct(exprs: Seq[Expression]): CreateStruct = { + private def getStruct(exprs: Seq[Expression]): CreateStruct = { CreateStruct(exprs.map { expr: Expression => expr.transformUp { case af: AggregateFunction => af.toAggregateExpression() @@ -139,19 +144,19 @@ object ColumnStatStruct { }) } - def numericColumnStat(e: Expression, relativeSD: Double): Seq[Expression] = { + private def numericColumnStat(e: Expression, relativeSD: Double): Seq[Expression] = { Seq(numNulls(e), max(e), min(e), ndv(e, relativeSD)) } - def stringColumnStat(e: Expression, relativeSD: Double): Seq[Expression] = { + private def stringColumnStat(e: Expression, relativeSD: Double): Seq[Expression] = { Seq(numNulls(e), avgLength(e), maxLength(e), ndv(e, relativeSD)) } - def binaryColumnStat(e: Expression): Seq[Expression] = { + private def binaryColumnStat(e: Expression): Seq[Expression] = { Seq(numNulls(e), avgLength(e), maxLength(e)) } - def booleanColumnStat(e: Expression): Seq[Expression] = { + private def booleanColumnStat(e: Expression): Seq[Expression] = { Seq(numNulls(e), numTrues(e), numFalses(e)) } @@ -162,14 +167,14 @@ object ColumnStatStruct { } } - def apply(e: Attribute, relativeSD: Double): CreateStruct = e.dataType match { + def apply(attr: Attribute, relativeSD: Double): CreateStruct = attr.dataType match { // Use aggregate functions to compute statistics we need. 
- case _: NumericType | TimestampType | DateType => getStruct(numericColumnStat(e, relativeSD)) - case StringType => getStruct(stringColumnStat(e, relativeSD)) - case BinaryType => getStruct(binaryColumnStat(e)) - case BooleanType => getStruct(booleanColumnStat(e)) + case _: NumericType | TimestampType | DateType => getStruct(numericColumnStat(attr, relativeSD)) + case StringType => getStruct(stringColumnStat(attr, relativeSD)) + case BinaryType => getStruct(binaryColumnStat(attr)) + case BooleanType => getStruct(booleanColumnStat(attr)) case otherType => throw new AnalysisException("Analyzing columns is not supported for column " + - s"${e.name} of data type: ${e.dataType}.") + s"${attr.name} of data type: ${attr.dataType}.") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e671604c39855..c8447651dd672 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -578,7 +578,8 @@ object SQLConf { val NDV_MAX_ERROR = SQLConfigBuilder("spark.sql.statistics.ndv.maxError") .internal() - .doc("The maximum estimation error allowed in HyperLogLog++ algorithm.") + .doc("The maximum estimation error allowed in HyperLogLog++ algorithm when generating " + + "column level statistics.") .doubleConf .createWithDefault(0.05) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 85228bb00123d..c351063a63ff8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -21,7 +21,7 @@ import java.io.{File, PrintWriter} import scala.reflect.ClassTag -import org.apache.spark.sql.{AnalysisException, QueryTest, Row, StatisticsTest} +import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Statistics} import org.apache.spark.sql.execution.command.{AnalyzeTableCommand, DDLUtils} @@ -358,53 +358,187 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils } } - test("generate column-level statistics and load them from hive metastore") { + private def getStatsBeforeAfterUpdate(isAnalyzeColumns: Boolean): (Statistics, Statistics) = { + val tableName = "tbl" + var statsBeforeUpdate: Statistics = null + var statsAfterUpdate: Statistics = null + withTable(tableName) { + val tableIndent = TableIdentifier(tableName, Some("default")) + val catalog = spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog] + sql(s"CREATE TABLE $tableName (key int) USING PARQUET") + sql(s"INSERT INTO $tableName SELECT 1") + if (isAnalyzeColumns) { + sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS key") + } else { + sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS") + } + // Table lookup will make the table cached. 
+ catalog.lookupRelation(tableIndent) + statsBeforeUpdate = catalog.getCachedDataSourceTable(tableIndent) + .asInstanceOf[LogicalRelation].catalogTable.get.stats.get + + sql(s"INSERT INTO $tableName SELECT 2") + if (isAnalyzeColumns) { + sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS key") + } else { + sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS") + } + catalog.lookupRelation(tableIndent) + statsAfterUpdate = catalog.getCachedDataSourceTable(tableIndent) + .asInstanceOf[LogicalRelation].catalogTable.get.stats.get + } + (statsBeforeUpdate, statsAfterUpdate) + } + + test("test refreshing table stats of cached data source table by `ANALYZE TABLE` statement") { + val (statsBeforeUpdate, statsAfterUpdate) = getStatsBeforeAfterUpdate(isAnalyzeColumns = false) + + assert(statsBeforeUpdate.sizeInBytes > 0) + assert(statsBeforeUpdate.rowCount == Some(1)) + + assert(statsAfterUpdate.sizeInBytes > statsBeforeUpdate.sizeInBytes) + assert(statsAfterUpdate.rowCount == Some(2)) + } + + test("test refreshing column stats of cached data source table by `ANALYZE TABLE` statement") { + val (statsBeforeUpdate, statsAfterUpdate) = getStatsBeforeAfterUpdate(isAnalyzeColumns = true) + + assert(statsBeforeUpdate.sizeInBytes > 0) + assert(statsBeforeUpdate.rowCount == Some(1)) + StatisticsTest.checkColStat( + dataType = IntegerType, + colStat = statsBeforeUpdate.colStats("key"), + expectedColStat = ColumnStat(InternalRow(0L, 1, 1, 1L)), + rsd = spark.sessionState.conf.ndvMaxError) + + assert(statsAfterUpdate.sizeInBytes > statsBeforeUpdate.sizeInBytes) + assert(statsAfterUpdate.rowCount == Some(2)) + StatisticsTest.checkColStat( + dataType = IntegerType, + colStat = statsAfterUpdate.colStats("key"), + expectedColStat = ColumnStat(InternalRow(0L, 2, 1, 2L)), + rsd = spark.sessionState.conf.ndvMaxError) + } + + private lazy val (testDataFrame, expectedColStatsSeq) = { import testImplicits._ val intSeq = Seq(1, 2) val stringSeq = Seq("a", "bb") + val binarySeq = Seq("a", "bb").map(_.getBytes) val booleanSeq = Seq(true, false) - val data = intSeq.indices.map { i => - (intSeq(i), stringSeq(i), booleanSeq(i)) + (intSeq(i), stringSeq(i), binarySeq(i), booleanSeq(i)) } - val tableName = "table" - withTable(tableName) { - val df = data.toDF("c1", "c2", "c3") - df.write.format("parquet").saveAsTable(tableName) - val expectedColStatsSeq = df.schema.map { f => - val colStat = f.dataType match { - case IntegerType => - ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong)) - case StringType => - ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble, - stringSeq.map(_.length).max.toInt, stringSeq.distinct.length.toLong)) - case BooleanType => - ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong, - booleanSeq.count(_.equals(false)).toLong)) - } - (f, colStat) + val df: DataFrame = data.toDF("c1", "c2", "c3", "c4") + val expectedColStatsSeq: Seq[(StructField, ColumnStat)] = df.schema.map { f => + val colStat = f.dataType match { + case IntegerType => + ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong)) + case StringType => + ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble, + stringSeq.map(_.length).max.toInt, stringSeq.distinct.length.toLong)) + case BinaryType => + ColumnStat(InternalRow(0L, binarySeq.map(_.length).sum / binarySeq.length.toDouble, + binarySeq.map(_.length).max.toInt)) + case BooleanType => + ColumnStat(InternalRow(0L, 
booleanSeq.count(_.equals(true)).toLong, + booleanSeq.count(_.equals(false)).toLong)) } + (f, colStat) + } + (df, expectedColStatsSeq) + } + + private def checkColStats( + tableName: String, + isDataSourceTable: Boolean, + expectedColStatsSeq: Seq[(StructField, ColumnStat)]): Unit = { + val readback = spark.table(tableName) + val stats = readback.queryExecution.analyzed.collect { + case rel: MetastoreRelation => + assert(!isDataSourceTable, "Expected a Hive serde table, but got a data source table") + rel.catalogTable.stats.get + case rel: LogicalRelation => + assert(isDataSourceTable, "Expected a data source table, but got a Hive serde table") + rel.catalogTable.get.stats.get + } + assert(stats.length == 1) + val columnStats = stats.head.colStats + assert(columnStats.size == expectedColStatsSeq.length) + expectedColStatsSeq.foreach { case (field, expectedColStat) => + StatisticsTest.checkColStat( + dataType = field.dataType, + colStat = columnStats(field.name), + expectedColStat = expectedColStat, + rsd = spark.sessionState.conf.ndvMaxError) + } + } - sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS c1, c2, c3") - val readback = spark.table(tableName) - val relations = readback.queryExecution.analyzed.collect { case rel: LogicalRelation => - val columnStats = rel.catalogTable.get.stats.get.colStats - expectedColStatsSeq.foreach { case (field, expectedColStat) => - assert(columnStats.contains(field.name)) - val colStat = columnStats(field.name) + test("generate and load column-level stats for data source table") { + val dsTable = "dsTable" + withTable(dsTable) { + testDataFrame.write.format("parquet").saveAsTable(dsTable) + sql(s"ANALYZE TABLE $dsTable COMPUTE STATISTICS FOR COLUMNS c1, c2, c3, c4") + checkColStats(dsTable, isDataSourceTable = true, expectedColStatsSeq) + } + } + + test("generate and load column-level stats for hive serde table") { + val hTable = "hTable" + val tmp = "tmp" + withTable(hTable, tmp) { + testDataFrame.write.format("parquet").saveAsTable(tmp) + sql(s"CREATE TABLE $hTable (c1 int, c2 string, c3 binary, c4 boolean) STORED AS TEXTFILE") + sql(s"INSERT INTO $hTable SELECT * FROM $tmp") + sql(s"ANALYZE TABLE $hTable COMPUTE STATISTICS FOR COLUMNS c1, c2, c3, c4") + checkColStats(hTable, isDataSourceTable = false, expectedColStatsSeq) + } + } + + // When caseSensitive is on, for columns with only case difference, they are different columns + // and we should generate column stats for all of them. 
+ private def checkCaseSensitiveColStats(columnName: String): Unit = { + val tableName = "tbl" + withTable(tableName) { + val column1 = columnName.toLowerCase + val column2 = columnName.toUpperCase + withSQLConf("spark.sql.caseSensitive" -> "true") { + sql(s"CREATE TABLE $tableName (`$column1` int, `$column2` double) USING PARQUET") + sql(s"INSERT INTO $tableName SELECT 1, 3.0") + sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS `$column1`, `$column2`") + val readback = spark.table(tableName) + val relations = readback.queryExecution.analyzed.collect { case rel: LogicalRelation => + val columnStats = rel.catalogTable.get.stats.get.colStats + assert(columnStats.size == 2) + StatisticsTest.checkColStat( + dataType = IntegerType, + colStat = columnStats(column1), + expectedColStat = ColumnStat(InternalRow(0L, 1, 1, 1L)), + rsd = spark.sessionState.conf.ndvMaxError) StatisticsTest.checkColStat( - dataType = field.dataType, - colStat = colStat, - expectedColStat = expectedColStat, + dataType = DoubleType, + colStat = columnStats(column2), + expectedColStat = ColumnStat(InternalRow(0L, 3.0d, 3.0d, 1L)), rsd = spark.sessionState.conf.ndvMaxError) + rel } - rel + assert(relations.size == 1) } - assert(relations.size == 1) } } + test("check column statistics for case sensitive column names") { + checkCaseSensitiveColStats(columnName = "c1") + } + + test("check column statistics for case sensitive non-ascii column names") { + // scalastyle:off + // non ascii characters are not allowed in the source code, so we disable the scalastyle. + checkCaseSensitiveColStats(columnName = "列c") + // scalastyle:on + } + test("estimates the size of a test MetastoreRelation") { val df = sql("""SELECT * FROM src""") val sizes = df.queryExecution.analyzed.collect { case mr: MetastoreRelation => From a0ebcb3a30ec64e01608ed6fa7b7ffb7acbd3af2 Mon Sep 17 00:00:00 2001 From: Dhruve Ashar Date: Fri, 14 Oct 2016 17:45:27 +0100 Subject: [PATCH 016/162] [DOC] Fix typo in sql hive doc Change is too trivial to file a JIRA. Author: Dhruve Ashar Closes #15485 from dhruve/master. --- docs/sql-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index d0f43ab0a9cc9..dcc828cc69fed 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -998,7 +998,7 @@ The following options can be used to configure the version of Hive that is used
  • A classpath in the standard format for the JVM. This classpath must include all of Hive and its dependencies, including the correct version of Hadoop. These jars only need to be present on the driver, but if you are running in yarn cluster mode then you must ensure - they are packaged with you application.
  • + they are packaged with your application. From fa37877af02a956203e8a00811b20f34af0278f7 Mon Sep 17 00:00:00 2001 From: Andrew Ash Date: Fri, 14 Oct 2016 18:13:19 +0100 Subject: [PATCH 017/162] Typo: form -> from ## What changes were proposed in this pull request? Minor typo fix ## How was this patch tested? Existing unit tests on Jenkins Author: Andrew Ash Closes #15486 from ash211/patch-8. --- .../src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index a716a916b7f7f..ac3358592202f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -363,7 +363,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * type. *
  • `quote` (default `"`): sets the single character used for escaping quoted values where * the separator can be part of the value. If you would like to turn off quotations, you need to - * set not `null` but an empty string. This behaviour is different form + * set not `null` but an empty string. This behaviour is different from * `com.databricks.spark.csv`.
*
  • `escape` (default `\`): sets the single character used for escaping quotes inside * an already quoted value.
From 05800b4b4e7873ebc445dfcd020b76d7539686e1 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Fri, 14 Oct 2016 12:39:25 -0700 Subject: [PATCH 018/162] [TEST] Ignore flaky test in StreamingQueryListenerSuite ## What changes were proposed in this pull request? Ignoring the flaky test introduced in #15307 https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7/1736/testReport/junit/org.apache.spark.sql.streaming/StreamingQueryListenerSuite/single_listener__check_trigger_statuses/ Author: Tathagata Das Closes #15491 from tdas/metrics-flaky-test. --- .../spark/sql/streaming/StreamingQueryListenerSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index 6256385dfd0e4..9e0eefbc58aa5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -43,7 +43,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { // Make sure we don't leak any events to the next test } - test("single listener, check trigger statuses") { + ignore("single listener, check trigger statuses") { import StreamingQueryListenerSuite._ clock = new ManualClock() From de1c1ca5c9d6064d3b7b3711e3bfb08fa018abe8 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 14 Oct 2016 20:21:03 +0000 Subject: [PATCH 019/162] [SPARK-17941][ML][TEST] Logistic regression tests should use sample weights. ## What changes were proposed in this pull request? The sample weight testing for logistic regression is not robust. The logistic regression suite already has many test cases comparing results to R glmnet. Since both libraries support sample weights, we should use sample weights in the tests to increase coverage for sample weighting. This patch doesn't really add any code and makes the testing more complete. Also fixed some errors in the R code referenced in the test suite. Changed `standardization=T` to `standardize=T` since the former is invalid. ## How was this patch tested? Existing unit tests are modified. No non-test code is touched. Author: sethah Closes #15488 from sethah/logreg_weight_tests.
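As background for the change, the following small sketch (with illustrative data and column names, not taken from the suite) shows the API knob the updated tests exercise: setting `weightCol` so that the fit optimizes a weighted objective, which can then be compared against a weighted `glmnet` fit in R.

```scala
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object WeightedLogRegSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("weighted-logreg-sketch")
      .getOrCreate()
    import spark.implicits._

    // Tiny hand-made dataset with a per-row sample weight column.
    val df = Seq(
      (1.0, 0.3, Vectors.dense(0.0, 1.1)),
      (0.0, 1.0, Vectors.dense(2.0, 1.0)),
      (1.0, 2.5, Vectors.dense(0.5, 3.0)),
      (0.0, 0.7, Vectors.dense(1.5, 0.2))
    ).toDF("label", "weight", "features")

    // setWeightCol makes the estimator weight each example's contribution
    // to the loss, analogous to the `weights` argument of glmnet.
    val lr = new LogisticRegression()
      .setWeightCol("weight")
      .setFitIntercept(true)

    val model = lr.fit(df)
    println(s"coefficients = ${model.coefficients}, intercept = ${model.intercept}")

    spark.stop()
  }
}
```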
--- .../LogisticRegressionSuite.scala | 1493 +++++++++-------- 1 file changed, 748 insertions(+), 745 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 42b56754e0835..bc631dc6d3149 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -25,14 +25,14 @@ import scala.util.control.Breaks._ import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.classification.LogisticRegressionSuite._ -import org.apache.spark.ml.feature.LabeledPoint +import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, SparseVector, Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} -import org.apache.spark.sql.functions.{col, lit} +import org.apache.spark.sql.functions.{col, lit, rand} import org.apache.spark.sql.types.LongType class LogisticRegressionSuite @@ -40,6 +40,7 @@ class LogisticRegressionSuite import testImplicits._ + private val seed = 42 @transient var smallBinaryDataset: Dataset[_] = _ @transient var smallMultinomialDataset: Dataset[_] = _ @transient var binaryDataset: Dataset[_] = _ @@ -49,7 +50,7 @@ class LogisticRegressionSuite override def beforeAll(): Unit = { super.beforeAll() - smallBinaryDataset = generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42).toDF() + smallBinaryDataset = generateLogisticInput(1.0, 1.0, nPoints = 100, seed = seed).toDF() smallMultinomialDataset = { val nPoints = 100 @@ -61,7 +62,7 @@ class LogisticRegressionSuite val xVariance = Array(0.6856, 0.1899) val testData = generateMultinomialLogisticInput( - coefficients, xMean, xVariance, addIntercept = true, nPoints, 42) + coefficients, xMean, xVariance, addIntercept = true, nPoints, seed) val df = sc.parallelize(testData, 4).toDF() df.cache() @@ -76,9 +77,9 @@ class LogisticRegressionSuite val testData = generateMultinomialLogisticInput(coefficients, xMean, xVariance, - addIntercept = true, nPoints, 42) + addIntercept = true, nPoints, seed) - sc.parallelize(testData, 4).toDF() + sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed)) } multinomialDataset = { @@ -91,9 +92,9 @@ class LogisticRegressionSuite val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) val testData = generateMultinomialLogisticInput( - coefficients, xMean, xVariance, addIntercept = true, nPoints, 42) + coefficients, xMean, xVariance, addIntercept = true, nPoints, seed) - val df = sc.parallelize(testData, 4).toDF() + val df = sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed)) df.cache() df } @@ -104,11 +105,11 @@ class LogisticRegressionSuite * so we can validate the training accuracy compared with R's glmnet package. 
*/ ignore("export test data into CSV format") { - binaryDataset.rdd.map { case Row(label: Double, features: Vector) => - label + "," + features.toArray.mkString(",") + binaryDataset.rdd.map { case Row(label: Double, features: Vector, weight: Double) => + label + "," + weight + "," + features.toArray.mkString(",") }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/binaryDataset") - multinomialDataset.rdd.map { case Row(label: Double, features: Vector) => - label + "," + features.toArray.mkString(",") + multinomialDataset.rdd.map { case Row(label: Double, features: Vector, weight: Double) => + label + "," + weight + "," + features.toArray.mkString(",") }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDataset") } @@ -519,31 +520,35 @@ class LogisticRegressionSuite test("binary logistic regression with intercept without regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(true).setStandardization(true) + .setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(true).setStandardization(false) + .setWeightCol("weight") val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) /* - Using the following R code to load the data and train the model using glmnet package. - - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0)) - coefficients + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0, + lambda = 0)) + coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 2.7355261 + data.V3 -0.5734389 + data.V4 0.8911736 + data.V5 -0.3878645 + data.V6 -0.8060570 - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 2.8366423 - data.V2 -0.5895848 - data.V3 0.8931147 - data.V4 -0.3925051 - data.V5 -0.7996864 */ - val interceptR = 2.8366423 - val coefficientsR = Vectors.dense(-0.5895848, 0.8931147, -0.3925051, -0.7996864) + val coefficientsR = Vectors.dense(-0.5734389, 0.8911736, -0.3878645, -0.8060570) + val interceptR = 2.7355261 assert(model1.intercept ~== interceptR relTol 1E-3) assert(model1.coefficients ~= coefficientsR relTol 1E-3) @@ -555,413 +560,374 @@ class LogisticRegressionSuite test("binary logistic regression without intercept without regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(false).setStandardization(true) + .setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(false).setStandardization(false) + .setWeightCol("weight") val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) /* - Using the following R code to load the data and train the model using glmnet package. + Use the following R code to load the data and train the model using glmnet package. 
- library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = - coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0, intercept=FALSE)) - coefficients + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0, + lambda = 0, intercept=FALSE)) + coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + data.V3 -0.3448461 + data.V4 1.2776453 + data.V5 -0.3539178 + data.V6 -0.7469384 - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - data.V2 -0.3534996 - data.V3 1.2964482 - data.V4 -0.3571741 - data.V5 -0.7407946 */ - val interceptR = 0.0 - val coefficientsR = Vectors.dense(-0.3534996, 1.2964482, -0.3571741, -0.7407946) + val coefficientsR = Vectors.dense(-0.3448461, 1.2776453, -0.3539178, -0.7469384) - assert(model1.intercept ~== interceptR relTol 1E-3) + assert(model1.intercept ~== 0.0 relTol 1E-3) assert(model1.coefficients ~= coefficientsR relTol 1E-2) // Without regularization, with or without standardization should converge to the same solution. - assert(model2.intercept ~== interceptR relTol 1E-3) + assert(model2.intercept ~== 0.0 relTol 1E-3) assert(model2.coefficients ~= coefficientsR relTol 1E-2) } test("binary logistic regression with intercept with L1 regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(true) + .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(true).setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(false) + .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) /* - Using the following R code to load the data and train the model using glmnet package. + Use the following R code to load the data and train the model using glmnet package. - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12)) - coefficients + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 1, + lambda = 0.12, standardize=T)) + coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) -0.06775980 + data.V3 . + data.V4 . + data.V5 -0.03933146 + data.V6 -0.03047580 - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) -0.05627428 - data.V2 . - data.V3 . 
- data.V4 -0.04325749 - data.V5 -0.02481551 */ - val interceptR1 = -0.05627428 - val coefficientsR1 = Vectors.dense(0.0, 0.0, -0.04325749, -0.02481551) + val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.03933146, -0.03047580) + val interceptRStd = -0.06775980 - assert(model1.intercept ~== interceptR1 relTol 1E-2) - assert(model1.coefficients ~= coefficientsR1 absTol 2E-2) + assert(model1.intercept ~== interceptRStd relTol 1E-2) + assert(model1.coefficients ~= coefficientsRStd absTol 2E-2) /* - Using the following R code to load the data and train the model using glmnet package. + Use the following R code to load the data and train the model using glmnet package. - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12, - standardize=FALSE)) - coefficients + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 1, + lambda = 0.12, standardize=F)) + coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 0.3544768 + data.V3 . + data.V4 . + data.V5 -0.1626191 + data.V6 . - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 0.3722152 - data.V2 . - data.V3 . - data.V4 -0.1665453 - data.V5 . */ - val interceptR2 = 0.3722152 - val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.1665453, 0.0) + val coefficientsR = Vectors.dense(0.0, 0.0, -0.1626191, 0.0) + val interceptR = 0.3544768 - assert(model2.intercept ~== interceptR2 relTol 1E-2) - assert(model2.coefficients ~== coefficientsR2 absTol 1E-3) + assert(model2.intercept ~== interceptR relTol 1E-2) + assert(model2.coefficients ~== coefficientsR absTol 1E-3) // TODO: move this to a standalone test of compression after SPARK-17471 assert(model2.coefficients.isInstanceOf[SparseVector]) } test("binary logistic regression without intercept with L1 regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(true) + .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(true).setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(false) + .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) /* - Using the following R code to load the data and train the model using glmnet package. - - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12, - intercept=FALSE)) - coefficients - - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - data.V2 . - data.V3 . - data.V4 -0.05189203 - data.V5 -0.03891782 - */ - val interceptR1 = 0.0 - val coefficientsR1 = Vectors.dense(0.0, 0.0, -0.05189203, -0.03891782) - - assert(model1.intercept ~== interceptR1 relTol 1E-3) - assert(model1.coefficients ~= coefficientsR1 absTol 1E-3) + Use the following R code to load the data and train the model using glmnet package. 
- /* - Using the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficientsStd = coef(glmnet(features, label, weights=w, family="binomial", alpha = 1, + lambda = 0.12, intercept=F, standardize=T)) + coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 1, + lambda = 0.12, intercept=F, standardize=F)) + coefficientsStd + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + data.V3 . + data.V4 . + data.V5 -0.04967635 + data.V6 -0.04757757 - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12, - intercept=FALSE, standardize=FALSE)) - coefficients + coefficients + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + data.V3 . + data.V4 . + data.V5 -0.08433195 + data.V6 . - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - data.V2 . - data.V3 . - data.V4 -0.08420782 - data.V5 . */ - val interceptR2 = 0.0 - val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.08420782, 0.0) + val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.04967635, -0.04757757) - assert(model2.intercept ~== interceptR2 absTol 1E-3) - assert(model2.coefficients ~= coefficientsR2 absTol 1E-3) + val coefficientsR = Vectors.dense(0.0, 0.0, -0.08433195, 0.0) + + assert(model1.intercept ~== 0.0 absTol 1E-3) + assert(model1.coefficients ~= coefficientsRStd absTol 1E-3) + assert(model2.intercept ~== 0.0 absTol 1E-3) + assert(model2.coefficients ~= coefficientsR absTol 1E-3) } test("binary logistic regression with intercept with L2 regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(true) + .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(true).setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(false) + .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) /* - Using the following R code to load the data and train the model using glmnet package. - - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37)) - coefficients - - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 0.15021751 - data.V2 -0.07251837 - data.V3 0.10724191 - data.V4 -0.04865309 - data.V5 -0.10062872 - */ - val interceptR1 = 0.15021751 - val coefficientsR1 = Vectors.dense(-0.07251837, 0.10724191, -0.04865309, -0.10062872) - - assert(model1.intercept ~== interceptR1 relTol 1E-3) - assert(model1.coefficients ~= coefficientsR1 relTol 1E-3) + Use the following R code to load the data and train the model using glmnet package. - /* - Using the following R code to load the data and train the model using glmnet package. 
+ library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficientsStd = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0, + lambda = 1.37, standardize=T)) + coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0, + lambda = 1.37, standardize=F)) + coefficientsStd + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 0.12707703 + data.V3 -0.06980967 + data.V4 0.10803933 + data.V5 -0.04800404 + data.V6 -0.10165096 - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37, - standardize=FALSE)) - coefficients + coefficients + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 0.46613016 + data.V3 -0.04944529 + data.V4 0.02326772 + data.V5 -0.11362772 + data.V6 -0.06312848 - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 0.48657516 - data.V2 -0.05155371 - data.V3 0.02301057 - data.V4 -0.11482896 - data.V5 -0.06266838 */ - val interceptR2 = 0.48657516 - val coefficientsR2 = Vectors.dense(-0.05155371, 0.02301057, -0.11482896, -0.06266838) + val coefficientsRStd = Vectors.dense(-0.06980967, 0.10803933, -0.04800404, -0.10165096) + val interceptRStd = 0.12707703 + val coefficientsR = Vectors.dense(-0.04944529, 0.02326772, -0.11362772, -0.06312848) + val interceptR = 0.46613016 - assert(model2.intercept ~== interceptR2 relTol 1E-3) - assert(model2.coefficients ~= coefficientsR2 relTol 1E-3) + assert(model1.intercept ~== interceptRStd relTol 1E-3) + assert(model1.coefficients ~= coefficientsRStd relTol 1E-3) + assert(model2.intercept ~== interceptR relTol 1E-3) + assert(model2.coefficients ~= coefficientsR relTol 1E-3) } test("binary logistic regression without intercept with L2 regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(true) + .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(true).setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(false) + .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) /* - Using the following R code to load the data and train the model using glmnet package. + Use the following R code to load the data and train the model using glmnet package. - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37, - intercept=FALSE)) - coefficients + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficientsStd = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0, + lambda = 1.37, intercept=F, standardize=T)) + coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0, + lambda = 1.37, intercept=F, standardize=F)) + coefficientsStd + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . 
+ data.V3 -0.06000152 + data.V4 0.12598737 + data.V5 -0.04669009 + data.V6 -0.09941025 - 5 x 1 sparse Matrix of class "dgCMatrix" + coefficients + 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) . - data.V2 -0.06099165 - data.V3 0.12857058 - data.V4 -0.04708770 - data.V5 -0.09799775 - */ - val interceptR1 = 0.0 - val coefficientsR1 = Vectors.dense(-0.06099165, 0.12857058, -0.04708770, -0.09799775) - - assert(model1.intercept ~== interceptR1 absTol 1E-3) - assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) - - /* - Using the following R code to load the data and train the model using glmnet package. - - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37, - intercept=FALSE, standardize=FALSE)) - coefficients + (Intercept) . + data.V3 -0.005482255 + data.V4 0.048106338 + data.V5 -0.093411640 + data.V6 -0.054149798 - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - data.V2 -0.005679651 - data.V3 0.048967094 - data.V4 -0.093714016 - data.V5 -0.053314311 */ - val interceptR2 = 0.0 - val coefficientsR2 = Vectors.dense(-0.005679651, 0.048967094, -0.093714016, -0.053314311) + val coefficientsRStd = Vectors.dense(-0.06000152, 0.12598737, -0.04669009, -0.09941025) + val coefficientsR = Vectors.dense(-0.005482255, 0.048106338, -0.093411640, -0.054149798) - assert(model2.intercept ~== interceptR2 absTol 1E-3) - assert(model2.coefficients ~= coefficientsR2 relTol 1E-2) + assert(model1.intercept ~== 0.0 absTol 1E-3) + assert(model1.coefficients ~= coefficientsRStd relTol 1E-2) + assert(model2.intercept ~== 0.0 absTol 1E-3) + assert(model2.coefficients ~= coefficientsR relTol 1E-2) } test("binary logistic regression with intercept with ElasticNet regularization") { - val trainer1 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true) + val trainer1 = (new LogisticRegression).setFitIntercept(true).setMaxIter(200) + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true).setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false) + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) /* - Using the following R code to load the data and train the model using glmnet package. - - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21)) - coefficients - - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 0.57734851 - data.V2 -0.05310287 - data.V3 . - data.V4 -0.08849250 - data.V5 -0.15458796 - */ - val interceptR1 = 0.57734851 - val coefficientsR1 = Vectors.dense(-0.05310287, 0.0, -0.08849250, -0.15458796) - - assert(model1.intercept ~== interceptR1 relTol 6E-3) - assert(model1.coefficients ~== coefficientsR1 absTol 5E-3) + Use the following R code to load the data and train the model using glmnet package. - /* - Using the following R code to load the data and train the model using glmnet package. 
+ library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficientsStd = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0.38, + lambda = 0.21, standardize=T)) + coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0.38, + lambda = 0.21, standardize=F)) + coefficientsStd + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 0.49991996 + data.V3 -0.04131110 + data.V4 . + data.V5 -0.08585233 + data.V6 -0.15875400 - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21, - standardize=FALSE)) - coefficients + coefficients + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 0.5024256 + data.V3 . + data.V4 . + data.V5 -0.1846038 + data.V6 -0.0559614 - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 0.51555993 - data.V2 . - data.V3 . - data.V4 -0.18807395 - data.V5 -0.05350074 */ - val interceptR2 = 0.51555993 - val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.18807395, -0.05350074) - - assert(model2.intercept ~== interceptR2 relTol 6E-3) - assert(model2.coefficients ~= coefficientsR2 absTol 1E-3) + val coefficientsRStd = Vectors.dense(-0.04131110, 0.0, -0.08585233, -0.15875400) + val interceptRStd = 0.49991996 + val coefficientsR = Vectors.dense(0.0, 0.0, -0.1846038, -0.0559614) + val interceptR = 0.5024256 + + assert(model1.intercept ~== interceptRStd relTol 6E-3) + assert(model1.coefficients ~== coefficientsRStd absTol 5E-3) + assert(model2.intercept ~== interceptR relTol 6E-3) + assert(model2.coefficients ~= coefficientsR absTol 1E-3) } test("binary logistic regression without intercept with ElasticNet regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true) + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true).setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false) + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) /* - Using the following R code to load the data and train the model using glmnet package. - - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21, - intercept=FALSE)) - coefficients - - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - data.V2 -0.001005743 - data.V3 0.072577857 - data.V4 -0.081203769 - data.V5 -0.142534158 - */ - val interceptR1 = 0.0 - val coefficientsR1 = Vectors.dense(-0.001005743, 0.072577857, -0.081203769, -0.142534158) - - assert(model1.intercept ~== interceptR1 relTol 1E-3) - assert(model1.coefficients ~= coefficientsR1 absTol 1E-2) + Use the following R code to load the data and train the model using glmnet package. - /* - Using the following R code to load the data and train the model using glmnet package. 
+ library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficientsStd = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0.38, + lambda = 0.21, intercept=FALSE, standardize=T)) + coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0.38, + lambda = 0.21, intercept=FALSE, standardize=F)) + coefficientsStd + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + data.V3 . + data.V4 0.06859390 + data.V5 -0.07900058 + data.V6 -0.14684320 - library("glmnet") - data <- read.csv("path", header=FALSE) - label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21, - intercept=FALSE, standardize=FALSE)) - coefficients + coefficients + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + data.V3 . + data.V4 0.03060637 + data.V5 -0.11126742 + data.V6 . - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - data.V2 . - data.V3 0.03345223 - data.V4 -0.11304532 - data.V5 . */ - val interceptR2 = 0.0 - val coefficientsR2 = Vectors.dense(0.0, 0.03345223, -0.11304532, 0.0) + val coefficientsRStd = Vectors.dense(0.0, 0.06859390, -0.07900058, -0.14684320) + val coefficientsR = Vectors.dense(0.0, 0.03060637, -0.11126742, 0.0) - assert(model2.intercept ~== interceptR2 absTol 1E-3) - assert(model2.coefficients ~= coefficientsR2 absTol 1E-3) + assert(model1.intercept ~== 0.0 relTol 1E-3) + assert(model1.coefficients ~= coefficientsRStd absTol 1E-2) + assert(model2.intercept ~== 0.0 absTol 1E-3) + assert(model2.coefficients ~= coefficientsR absTol 1E-3) } test("binary logistic regression with intercept with strong L1 regularization") { - val trainer1 = (new LogisticRegression).setFitIntercept(true) + val trainer1 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(true) - val trainer2 = (new LogisticRegression).setFitIntercept(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(false) val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - val histogram = binaryDataset.rdd.map { case Row(label: Double, features: Vector) => label } + val histogram = binaryDataset.as[Instance].rdd.map { i => (i.label, i.weight)} .treeAggregate(new MultiClassSummarizer)( seqOp = (c, v) => (c, v) match { - case (classSummarizer: MultiClassSummarizer, label: Double) => classSummarizer.add(label) + case (classSummarizer: MultiClassSummarizer, (label: Double, weight: Double)) => + classSummarizer.add(label, weight) }, combOp = (c1, c2) => (c1, c2) match { case (classSummarizer1: MultiClassSummarizer, classSummarizer2: MultiClassSummarizer) => @@ -989,25 +955,26 @@ class LogisticRegressionSuite assert(model2.coefficients ~= coefficientsTheory absTol 1E-6) /* - TODO: why is this needed? The correctness of L1 regularization is already checked elsewhere Using the following R code to load the data and train the model using glmnet package. 
library("glmnet") data <- read.csv("path", header=FALSE) label = factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features,label, family="binomial", alpha = 1.0, lambda = 6.0)) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 1.0, + lambda = 6.0)) coefficients 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) -0.2480643 - data.V2 0.0000000 - data.V3 . - data.V4 . - data.V5 . + s0 + (Intercept) -0.2516986 + data.V3 0.0000000 + data.V4 . + data.V5 . + data.V6 . */ - val interceptR = -0.248065 + val interceptR = -0.2516986 val coefficientsR = Vectors.dense(0.0, 0.0, 0.0, 0.0) assert(model1.intercept ~== interceptR relTol 1E-5) @@ -1015,9 +982,9 @@ class LogisticRegressionSuite } test("multinomial logistic regression with intercept with strong L1 regularization") { - val trainer1 = (new LogisticRegression).setFitIntercept(true) + val trainer1 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(true) - val trainer2 = (new LogisticRegression).setFitIntercept(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(false) val sqlContext = multinomialDataset.sqlContext @@ -1025,16 +992,17 @@ class LogisticRegressionSuite val model1 = trainer1.fit(multinomialDataset) val model2 = trainer2.fit(multinomialDataset) - val histogram = multinomialDataset.as[LabeledPoint].rdd.map(_.label) + val histogram = multinomialDataset.as[Instance].rdd.map(i => (i.label, i.weight)) .treeAggregate(new MultiClassSummarizer)( seqOp = (c, v) => (c, v) match { - case (classSummarizer: MultiClassSummarizer, label: Double) => classSummarizer.add(label) + case (classSummarizer: MultiClassSummarizer, (label: Double, weight: Double)) => + classSummarizer.add(label, weight) }, combOp = (c1, c2) => (c1, c2) match { case (classSummarizer1: MultiClassSummarizer, classSummarizer2: MultiClassSummarizer) => classSummarizer1.merge(classSummarizer2) }).histogram - val numFeatures = multinomialDataset.as[LabeledPoint].first().features.size + val numFeatures = multinomialDataset.as[Instance].first().features.size val numClasses = histogram.length /* @@ -1068,52 +1036,58 @@ class LogisticRegressionSuite test("multinomial logistic regression with intercept without regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setMaxIter(100) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(multinomialDataset) val model2 = trainer2.fit(multinomialDataset) /* - Using the following R code to load the data and train the model using glmnet package. 
- > library("glmnet") - > data <- read.csv("path", header=FALSE) - > label = as.factor(data$V1) - > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - > coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0)) - > coefficients - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -2.24493379 - V2 0.25096771 - V3 -0.03915938 - V4 0.14766639 - V5 0.36810817 - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.3778931 - V2 -0.3327489 - V3 0.8893666 - V4 -0.2306948 - V5 -0.4442330 - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 1.86704066 - V2 0.08178121 - V3 -0.85020722 - V4 0.08302840 - V5 0.07612480 - */ + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficients = coef(glmnet(features, label, weights=w, family="multinomial", + alpha = 0, lambda = 0)) + coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -2.10320093 + data.V3 0.24337896 + data.V4 -0.05916156 + data.V5 0.14446790 + data.V6 0.35976165 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.3394473 + data.V3 -0.3443375 + data.V4 0.9181331 + data.V5 -0.2283959 + data.V6 -0.4388066 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 1.76375361 + data.V3 0.10095851 + data.V4 -0.85897154 + data.V5 0.08392798 + data.V6 0.07904499 + + + */ val coefficientsR = new DenseMatrix(3, 4, Array( - 0.2509677, -0.0391594, 0.1476664, 0.3681082, - -0.3327489, 0.8893666, -0.2306948, -0.4442330, - 0.0817812, -0.8502072, 0.0830284, 0.0761248), isTransposed = true) - val interceptsR = Vectors.dense(-2.2449338, 0.3778931, 1.8670407) + 0.24337896, -0.05916156, 0.14446790, 0.35976165, + -0.3443375, 0.9181331, -0.2283959, -0.4388066, + 0.10095851, -0.85897154, 0.08392798, 0.07904499), isTransposed = true) + val interceptsR = Vectors.dense(-2.10320093, 0.3394473, 1.76375361) assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05) assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) @@ -1128,52 +1102,57 @@ class LogisticRegressionSuite test("multinomial logistic regression without intercept without regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(multinomialDataset) val model2 = trainer2.fit(multinomialDataset) /* - Using the following R code to load the data and train the model using glmnet package. - library("glmnet") - data <- read.csv("path", header=FALSE) - label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0, - intercept=F)) - > coefficients - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 0.06992464 - V3 -0.36562784 - V4 0.12142680 - V5 0.32052211 - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 -0.3036269 - V3 0.9449630 - V4 -0.2271038 - V5 -0.4364839 - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . 
- V2 0.2337022 - V3 -0.5793351 - V4 0.1056770 - V5 0.1159618 - */ + Use the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0, + lambda = 0, intercept=F)) + coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + data.V3 0.07276291 + data.V4 -0.36325496 + data.V5 0.12015088 + data.V6 0.31397340 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + data.V3 -0.3180040 + data.V4 0.9679074 + data.V5 -0.2252219 + data.V6 -0.4319914 + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + data.V3 0.2452411 + data.V4 -0.6046524 + data.V5 0.1050710 + data.V6 0.1180180 + + + */ val coefficientsR = new DenseMatrix(3, 4, Array( - 0.0699246, -0.3656278, 0.1214268, 0.3205221, - -0.3036269, 0.9449630, -0.2271038, -0.4364839, - 0.2337022, -0.5793351, 0.1056770, 0.1159618), isTransposed = true) + 0.07276291, -0.36325496, 0.12015088, 0.31397340, + -0.3180040, 0.9679074, -0.2252219, -0.4319914, + 0.2452411, -0.6046524, 0.1050710, 0.1180180), isTransposed = true) assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05) assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) @@ -1190,92 +1169,95 @@ class LogisticRegressionSuite // use tighter constraints because OWL-QN solver takes longer to converge val trainer1 = (new LogisticRegression).setFitIntercept(true) .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) - .setMaxIter(300).setTol(1e-10) + .setMaxIter(300).setTol(1e-10).setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(true) .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) - .setMaxIter(300).setTol(1e-10) + .setMaxIter(300).setTol(1e-10).setWeightCol("weight") val model1 = trainer1.fit(multinomialDataset) val model2 = trainer2.fit(multinomialDataset) /* - Use the following R code to load the data and train the model using glmnet package. - library("glmnet") - data <- read.csv("path", header=FALSE) - label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1, - lambda = 0.05, standardization=T)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05, - standardization=F)) - > coefficientsStd - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.68988825 - V2 . - V3 . - V4 . - V5 0.09404023 - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.2303499 - V2 -0.1232443 - V3 0.3258380 - V4 -0.1564688 - V5 -0.2053965 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.9202381 - V2 . - V3 -0.4803856 - V4 . - V5 . - - > coefficients - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.44893320 - V2 . - V3 . - V4 0.01933812 - V5 0.03666044 - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.7376760 - V2 -0.0577182 - V3 . - V4 -0.2081718 - V5 -0.1304592 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.2887428 - V2 . - V3 . - V4 . - V5 . - */ + Use the following R code to load the data and train the model using glmnet package. 
- val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.09404023, - -0.1232443, 0.3258380, -0.1564688, -0.2053965, - 0.0, -0.4803856, 0.0, 0.0), isTransposed = true) - val interceptsRStd = Vectors.dense(-0.68988825, -0.2303499, 0.9202381) + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial", + alpha = 1, lambda = 0.05, standardize=T)) + coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 1, + lambda = 0.05, standardize=F)) + coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.62244703 + data.V3 . + data.V4 . + data.V5 . + data.V6 0.08419825 + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.2804845 + data.V3 -0.1336960 + data.V4 0.3717091 + data.V5 -0.1530363 + data.V6 -0.2035286 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.9029315 + data.V3 . + data.V4 -0.4629737 + data.V5 . + data.V6 . + + + coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.44215290 + data.V3 . + data.V4 . + data.V5 0.01767089 + data.V6 0.02542866 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.76308326 + data.V3 -0.06818576 + data.V4 . + data.V5 -0.20446351 + data.V6 -0.13017924 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.3209304 + data.V3 . + data.V4 . + data.V5 . + data.V6 . + + + */ + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.08419825, + -0.1336960, 0.3717091, -0.1530363, -0.2035286, + 0.0, -0.4629737, 0.0, 0.0), isTransposed = true) + val interceptsRStd = Vectors.dense(-0.62244703, -0.2804845, 0.9029315) val coefficientsR = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.01933812, 0.03666044, - -0.0577182, 0.0, -0.2081718, -0.1304592, + 0.0, 0.0, 0.01767089, 0.02542866, + -0.06818576, 0.0, -0.20446351, -0.13017924, 0.0, 0.0, 0.0, 0.0), isTransposed = true) - val interceptsR = Vectors.dense(-0.44893320, 0.7376760, -0.2887428) + val interceptsR = Vectors.dense(-0.44215290, 0.76308326, -0.3209304) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.02) assert(model1.interceptVector ~== interceptsRStd relTol 0.1) @@ -1287,87 +1269,91 @@ class LogisticRegressionSuite test("multinomial logistic regression without intercept with L1 regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true).setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(multinomialDataset) val model2 = trainer2.fit(multinomialDataset) /* Use the following R code to load the data and train the model using glmnet package. 
+ library("glmnet") data <- read.csv("path", header=FALSE) label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1, - lambda = 0.05, intercept=F, standardization=T)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05, - intercept=F, standardization=F)) - > coefficientsStd + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 1, + lambda = 0.05, intercept=F, standardize=T)) + coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 1, + lambda = 0.05, intercept=F, standardize=F)) + coefficientsStd $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 . - V4 . - V5 0.01525105 + s0 + . + data.V3 . + data.V4 . + data.V5 . + data.V6 0.01144225 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 -0.1502410 - V3 0.5134658 - V4 -0.1601146 - V5 -0.2500232 + s0 + . + data.V3 -0.1678787 + data.V4 0.5385351 + data.V5 -0.1573039 + data.V6 -0.2471624 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 0.003301875 - V3 . - V4 . - V5 . - - > coefficients + s0 + . + data.V3 . + data.V4 . + data.V5 . + data.V6 . + + + coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 . - V4 . - V5 . + s0 + . + data.V3 . + data.V4 . + data.V5 . + data.V6 . $`1` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 0.1943624 - V4 -0.1902577 - V5 -0.1028789 + s0 + . + data.V3 . + data.V4 0.1929409 + data.V5 -0.1889121 + data.V6 -0.1010413 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 . - V4 . - V5 . - */ + s0 + . + data.V3 . + data.V4 . + data.V5 . + data.V6 . + + */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.01525105, - -0.1502410, 0.5134658, -0.1601146, -0.2500232, - 0.003301875, 0.0, 0.0, 0.0), isTransposed = true) + 0.0, 0.0, 0.0, 0.01144225, + -0.1678787, 0.5385351, -0.1573039, -0.2471624, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) val coefficientsR = new DenseMatrix(3, 4, Array( 0.0, 0.0, 0.0, 0.0, - 0.0, 0.1943624, -0.1902577, -0.1028789, + 0.0, 0.1929409, -0.1889121, -0.1010413, 0.0, 0.0, 0.0, 0.0), isTransposed = true) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) @@ -1380,92 +1366,95 @@ class LogisticRegressionSuite test("multinomial logistic regression with intercept with L2 regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true).setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(multinomialDataset) val model2 = trainer2.fit(multinomialDataset) /* Use the following R code to load the data and train the model using glmnet package. 
+ library("glmnet") data <- read.csv("path", header=FALSE) label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0, - lambda = 0.1, intercept=T, standardization=T)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, - lambda = 0.1, intercept=T, standardization=F)) - > coefficientsStd + w = data$V2 + features = as.matrix(data.frame( data$V3, data$V4, data$V5, data$V6)) + coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial", + alpha = 0, lambda = 0.1, intercept=T, standardize=T)) + coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0, + lambda = 0.1, intercept=T, standardize=F)) + coefficientsStd $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -1.70040424 - V2 0.17576070 - V3 0.01527894 - V4 0.10216108 - V5 0.26099531 + s0 + -1.5898288335 + data.V3 0.1691226336 + data.V4 0.0002983651 + data.V5 0.1001732896 + data.V6 0.2554575585 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.2438590 - V2 -0.2238875 - V3 0.5967610 - V4 -0.1555496 - V5 -0.3010479 + s0 + 0.2125746 + data.V3 -0.2304586 + data.V4 0.6153492 + data.V5 -0.1537017 + data.V6 -0.2975443 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 1.45654525 - V2 0.04812679 - V3 -0.61203992 - V4 0.05338850 - V5 0.04005258 - - > coefficients + s0 + 1.37725427 + data.V3 0.06133600 + data.V4 -0.61564761 + data.V5 0.05352840 + data.V6 0.04208671 + + + coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -1.65488543 - V2 0.15715048 - V3 0.01992903 - V4 0.12428858 - V5 0.22130317 + s0 + -1.5681088 + data.V3 0.1508182 + data.V4 0.0121955 + data.V5 0.1217930 + data.V6 0.2162850 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 1.1297533 - V2 -0.1974768 - V3 0.2776373 - V4 -0.1869445 - V5 -0.2510320 + s0 + 1.1217130 + data.V3 -0.2028984 + data.V4 0.2862431 + data.V5 -0.1843559 + data.V6 -0.2481218 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.52513212 - V2 0.04032627 - V3 -0.29756637 - V4 0.06265594 - V5 0.02972883 - */ + s0 + 0.44639579 + data.V3 0.05208012 + data.V4 -0.29843864 + data.V5 0.06256289 + data.V6 0.03183676 - val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.17576070, 0.01527894, 0.10216108, 0.26099531, - -0.2238875, 0.5967610, -0.1555496, -0.3010479, - 0.04812679, -0.61203992, 0.05338850, 0.04005258), isTransposed = true) - val interceptsRStd = Vectors.dense(-1.70040424, 0.2438590, 1.45654525) + */ + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.1691226336, 0.0002983651, 0.1001732896, 0.2554575585, + -0.2304586, 0.6153492, -0.1537017, -0.2975443, + 0.06133600, -0.61564761, 0.05352840, 0.04208671), isTransposed = true) + val interceptsRStd = Vectors.dense(-1.5898288335, 0.2125746, 1.37725427) val coefficientsR = new DenseMatrix(3, 4, Array( - 0.15715048, 0.01992903, 0.12428858, 0.22130317, - -0.1974768, 0.2776373, -0.1869445, -0.2510320, - 0.04032627, -0.29756637, 0.06265594, 0.02972883), isTransposed = true) - val interceptsR = Vectors.dense(-1.65488543, 1.1297533, 0.52513212) + 0.1508182, 0.0121955, 0.1217930, 0.2162850, + -0.2028984, 0.2862431, -0.1843559, -0.2481218, + 0.05208012, -0.29843864, 0.06256289, 0.03183676), isTransposed = true) + val interceptsR = Vectors.dense(-1.5681088, 1.1217130, 0.44639579) - assert(model1.coefficientMatrix ~== coefficientsRStd relTol 0.05) + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.001) assert(model1.interceptVector ~== 
interceptsRStd relTol 0.05) assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) @@ -1475,86 +1464,92 @@ class LogisticRegressionSuite test("multinomial logistic regression without intercept with L2 regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true).setWeightCol("weight") val trainer2 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(multinomialDataset) val model2 = trainer2.fit(multinomialDataset) /* Use the following R code to load the data and train the model using glmnet package. + library("glmnet") data <- read.csv("path", header=FALSE) label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0, - lambda = 0.1, intercept=F, standardization=T)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, - lambda = 0.1, intercept=F, standardization=F)) - > coefficientsStd + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0, + lambda = 0.1, intercept=F, standardize=T)) + coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0, + lambda = 0.1, intercept=F, standardize=F)) + coefficientsStd $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 0.03904171 - V3 -0.23354322 - V4 0.08288096 - V5 0.22706393 + s0 + . + data.V3 0.04048126 + data.V4 -0.23075758 + data.V5 0.08228864 + data.V6 0.22277648 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 -0.2061848 - V3 0.6341398 - V4 -0.1530059 - V5 -0.2958455 + s0 + . + data.V3 -0.2149745 + data.V4 0.6478666 + data.V5 -0.1515158 + data.V6 -0.2930498 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 0.16714312 - V3 -0.40059658 - V4 0.07012496 - V5 0.06878158 - > coefficients + s0 + . + data.V3 0.17449321 + data.V4 -0.41710901 + data.V5 0.06922716 + data.V6 0.07027332 + + + coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 -0.005704542 - V3 -0.144466409 - V4 0.092080736 - V5 0.182927657 + s0 + . + data.V3 -0.003949652 + data.V4 -0.142982415 + data.V5 0.091439598 + data.V6 0.179286241 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 -0.08469036 - V3 0.38996748 - V4 -0.16468436 - V5 -0.22522976 + s0 + . + data.V3 -0.09071124 + data.V4 0.39752531 + data.V5 -0.16233832 + data.V6 -0.22206059 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 0.09039490 - V3 -0.24550107 - V4 0.07260362 - V5 0.04230210 + s0 + . 
+ data.V3 0.09466090 + data.V4 -0.25454290 + data.V5 0.07089872 + data.V6 0.04277435 + + */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.03904171, -0.23354322, 0.08288096, 0.2270639, - -0.2061848, 0.6341398, -0.1530059, -0.2958455, - 0.16714312, -0.40059658, 0.07012496, 0.06878158), isTransposed = true) + 0.04048126, -0.23075758, 0.08228864, 0.22277648, + -0.2149745, 0.6478666, -0.1515158, -0.2930498, + 0.17449321, -0.41710901, 0.06922716, 0.07027332), isTransposed = true) val coefficientsR = new DenseMatrix(3, 4, Array( - -0.005704542, -0.144466409, 0.092080736, 0.182927657, - -0.08469036, 0.38996748, -0.16468436, -0.22522976, - 0.0903949, -0.24550107, 0.07260362, 0.0423021), isTransposed = true) + -0.003949652, -0.142982415, 0.091439598, 0.179286241, + -0.09071124, 0.39752531, -0.16233832, -0.22206059, + 0.09466090, -0.25454290, 0.07089872, 0.04277435), isTransposed = true) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) @@ -1565,10 +1560,10 @@ class LogisticRegressionSuite } test("multinomial logistic regression with intercept with elasticnet regularization") { - val trainer1 = (new LogisticRegression).setFitIntercept(true) + val trainer1 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) .setMaxIter(300).setTol(1e-10) - val trainer2 = (new LogisticRegression).setFitIntercept(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) .setMaxIter(300).setTol(1e-10) @@ -1576,82 +1571,85 @@ class LogisticRegressionSuite val model2 = trainer2.fit(multinomialDataset) /* Use the following R code to load the data and train the model using glmnet package. + library("glmnet") data <- read.csv("path", header=FALSE) label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5, - lambda = 0.1, intercept=T, standardization=T)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5, - lambda = 0.1, intercept=T, standardization=F)) - > coefficientsStd + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=T, standardize=T)) + coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=T, standardize=F)) + coefficientsStd $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.5521819483 - V2 0.0003092611 - V3 . - V4 . - V5 0.0913818490 + s0 + -0.50133383 + data.V3 . + data.V4 . + data.V5 . + data.V6 0.08351653 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.27531989 - V2 -0.09790029 - V3 0.28502034 - V4 -0.12416487 - V5 -0.16513373 + s0 + -0.3151913 + data.V3 -0.1058702 + data.V4 0.3183251 + data.V5 -0.1212969 + data.V6 -0.1629778 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.8275018 - V2 . - V3 -0.4044859 - V4 . - V5 . - - > coefficients + s0 + 0.8165252 + data.V3 . + data.V4 -0.3943069 + data.V5 . + data.V6 . + + + coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.39876213 - V2 . - V3 . - V4 0.02547520 - V5 0.03893991 + s0 + -0.38857157 + data.V3 . + data.V4 . 
+ data.V5 0.02384198 + data.V6 0.03127749 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.61089869 - V2 -0.04224269 - V3 . - V4 -0.18923970 - V5 -0.09104249 + s0 + 0.62492165 + data.V3 -0.04949061 + data.V4 . + data.V5 -0.18584462 + data.V6 -0.08952455 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.2121366 - V2 . - V3 . - V4 . - V5 . - */ + s0 + -0.2363501 + data.V3 . + data.V4 . + data.V5 . + data.V6 . - val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0003092611, 0.0, 0.0, 0.091381849, - -0.09790029, 0.28502034, -0.12416487, -0.16513373, - 0.0, -0.4044859, 0.0, 0.0), isTransposed = true) - val interceptsRStd = Vectors.dense(-0.5521819483, -0.27531989, 0.8275018) + */ + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.08351653, + -0.1058702, 0.3183251, -0.1212969, -0.1629778, + 0.0, -0.3943069, 0.0, 0.0), isTransposed = true) + val interceptsRStd = Vectors.dense(-0.50133383, -0.3151913, 0.8165252) val coefficientsR = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0254752, 0.03893991, - -0.04224269, 0.0, -0.1892397, -0.09104249, + 0.0, 0.0, 0.02384198, 0.03127749, + -0.04949061, 0.0, -0.18584462, -0.08952455, 0.0, 0.0, 0.0, 0.0), isTransposed = true) - val interceptsR = Vectors.dense(-0.39876213, 0.61089869, -0.2121366) + val interceptsR = Vectors.dense(-0.38857157, 0.62492165, -0.2363501) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) assert(model1.interceptVector ~== interceptsRStd absTol 0.01) @@ -1662,10 +1660,10 @@ class LogisticRegressionSuite } test("multinomial logistic regression without intercept with elasticnet regularization") { - val trainer1 = (new LogisticRegression).setFitIntercept(false) + val trainer1 = (new LogisticRegression).setFitIntercept(false).setWeightCol("weight") .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) .setMaxIter(300).setTol(1e-10) - val trainer2 = (new LogisticRegression).setFitIntercept(false) + val trainer2 = (new LogisticRegression).setFitIntercept(false).setWeightCol("weight") .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) .setMaxIter(300).setTol(1e-10) @@ -1673,78 +1671,83 @@ class LogisticRegressionSuite val model2 = trainer2.fit(multinomialDataset) /* Use the following R code to load the data and train the model using glmnet package. + library("glmnet") data <- read.csv("path", header=FALSE) label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5, - lambda = 0.1, intercept=F, standardization=T)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5, - lambda = 0.1, intercept=F, standardization=F)) - > coefficientsStd + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6)) + coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=F, standardize=T)) + coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=F, standardize=F)) + coefficientsStd $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 . - V4 . - V5 0.03543706 + s0 + . + data.V3 . + data.V4 . + data.V5 . + data.V6 0.03238285 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 -0.1187387 - V3 0.4025482 - V4 -0.1270969 - V5 -0.1918386 + s0 + . 
+ data.V3 -0.1328284 + data.V4 0.4219321 + data.V5 -0.1247544 + data.V6 -0.1893318 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 0.00774365 - V3 . - V4 . - V5 . - - > coefficients + s0 + . + data.V3 0.004572312 + data.V4 . + data.V5 . + data.V6 . + + + coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 . - V4 . - V5 . + s0 + . + data.V3 . + data.V4 . + data.V5 . + data.V6 . $`1` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 0.14666497 - V4 -0.16570638 - V5 -0.05982875 + s0 + . + data.V3 . + data.V4 0.14571623 + data.V5 -0.16456351 + data.V6 -0.05866264 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 . - V4 . - V5 . + s0 + . + data.V3 . + data.V4 . + data.V5 . + data.V6 . + + */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.03543706, - -0.1187387, 0.4025482, -0.1270969, -0.1918386, - 0.0, 0.0, 0.0, 0.00774365), isTransposed = true) + 0.0, 0.0, 0.0, 0.03238285, + -0.1328284, 0.4219321, -0.1247544, -0.1893318, + 0.004572312, 0.0, 0.0, 0.0), isTransposed = true) val coefficientsR = new DenseMatrix(3, 4, Array( 0.0, 0.0, 0.0, 0.0, - 0.0, 0.14666497, -0.16570638, -0.05982875, + 0.0, 0.14571623, -0.16456351, -0.05866264, 0.0, 0.0, 0.0, 0.0), isTransposed = true) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) From 7ab86244e30ca81eb4fa40ea77b4c2b8881cbab2 Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Fri, 14 Oct 2016 13:22:59 -0700 Subject: [PATCH 020/162] [SPARK-17620][SQL] Determine Serde by hive.default.fileformat when Creating Hive Serde Tables ## What changes were proposed in this pull request? Make sure the hive.default.fileformat is used to when creating the storage format metadata. Output ``` SQL scala> spark.sql("SET hive.default.fileformat=orc") res1: org.apache.spark.sql.DataFrame = [key: string, value: string] scala> spark.sql("CREATE TABLE tmp_default(id INT)") res2: org.apache.spark.sql.DataFrame = [] ``` Before ```SQL scala> spark.sql("DESC FORMATTED tmp_default").collect.foreach(println) .. [# Storage Information,,] [SerDe Library:,org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe,] [InputFormat:,org.apache.hadoop.hive.ql.io.orc.OrcInputFormat,] [OutputFormat:,org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat,] [Compressed:,No,] [Storage Desc Parameters:,,] [ serialization.format,1,] ``` After ```SQL scala> spark.sql("DESC FORMATTED tmp_default").collect.foreach(println) .. [# Storage Information,,] [SerDe Library:,org.apache.hadoop.hive.ql.io.orc.OrcSerde,] [InputFormat:,org.apache.hadoop.hive.ql.io.orc.OrcInputFormat,] [OutputFormat:,org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat,] [Compressed:,No,] [Storage Desc Parameters:,,] [ serialization.format,1,] ``` ## How was this patch tested? Added new tests to HiveDDLCommandSuite Author: Dilip Biswal Closes #15190 from dilipbiswal/orc. 
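
For reference, a minimal sketch of the file-format-to-storage mapping this change relies on. This is a hypothetical illustration, not the actual lookup code in `SparkSqlParser` or Spark's internal `HiveSerDe` helper; the object, case class, and method names below are invented for the example, while the Hive class-name strings are the ones asserted in the new `HiveDDLCommandSuite` tests and shown in the `DESC FORMATTED` output above.

```scala
// Hypothetical sketch: default (inputFormat, outputFormat, serde) per hive.default.fileformat.
// Only the class-name strings come from this patch; the surrounding code is illustrative.
object DefaultHiveFormats {
  final case class HiveStorage(inputFormat: String, outputFormat: String, serde: String)

  val byName: Map[String, HiveStorage] = Map(
    "orc" -> HiveStorage(
      "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
      "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
      "org.apache.hadoop.hive.ql.io.orc.OrcSerde"),
    "parquet" -> HiveStorage(
      "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
      "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
      "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"),
    "textfile" -> HiveStorage(
      "org.apache.hadoop.mapred.TextInputFormat",
      "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
      "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"))

  // With hive.default.fileformat=orc, the serde resolves to OrcSerde,
  // matching the "After" DESC FORMATTED output quoted above.
  def serdeFor(fileFormat: String): Option[String] =
    byName.get(fileFormat.toLowerCase).map(_.serde)
}
```
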
--- .../spark/sql/execution/SparkSqlParser.scala | 4 +- .../spark/sql/hive/HiveDDLCommandSuite.scala | 26 ++++++++++++- .../sql/hive/execution/SQLQuerySuite.scala | 39 +++++++++++++++++-- 3 files changed, 60 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index be2eddbb0e423..8c68d1e3a2379 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -1010,9 +1010,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { .orElse(Some("org.apache.hadoop.mapred.TextInputFormat")), outputFormat = defaultHiveSerde.flatMap(_.outputFormat) .orElse(Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")), - // Note: Keep this unspecified because we use the presence of the serde to decide - // whether to convert a table created by CTAS to a datasource table. - serde = None, + serde = defaultHiveSerde.flatMap(_.serde), compressed = false, properties = Map()) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala index 9ce3338647398..81337493c7f28 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala @@ -30,10 +30,12 @@ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{Generate, ScriptTransformation} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.CreateTable -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.StructType -class HiveDDLCommandSuite extends PlanTest { +class HiveDDLCommandSuite extends PlanTest with SQLTestUtils with TestHiveSingleton { val parser = TestHive.sessionState.sqlParser private def extractTableDesc(sql: String): (CatalogTable, Boolean) = { @@ -556,4 +558,24 @@ class HiveDDLCommandSuite extends PlanTest { assert(partition2.get.apply("c") == "1" && partition2.get.apply("d") == "2") } + test("Test the default fileformat for Hive-serde tables") { + withSQLConf("hive.default.fileformat" -> "orc") { + val (desc, exists) = extractTableDesc("CREATE TABLE IF NOT EXISTS fileformat_test (id int)") + assert(exists) + assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")) + assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")) + assert(desc.storage.serde == Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) + } + + withSQLConf("hive.default.fileformat" -> "parquet") { + val (desc, exists) = extractTableDesc("CREATE TABLE IF NOT EXISTS fileformat_test (id int)") + assert(exists) + val input = desc.storage.inputFormat + val output = desc.storage.outputFormat + val serde = desc.storage.serde + assert(input == Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")) + assert(output == Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")) + assert(serde == Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")) + } + } } diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 6f2a16662bf10..5798f47228216 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -492,7 +492,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { def checkRelation( tableName: String, - isDataSourceParquet: Boolean, + isDataSourceTable: Boolean, format: String, userSpecifiedLocation: Option[String] = None): Unit = { val relation = EliminateSubqueryAliases( @@ -501,7 +501,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { sessionState.catalog.getTableMetadata(TableIdentifier(tableName)) relation match { case LogicalRelation(r: HadoopFsRelation, _, _) => - if (!isDataSourceParquet) { + if (!isDataSourceTable) { fail( s"${classOf[MetastoreRelation].getCanonicalName} is expected, but found " + s"${HadoopFsRelation.getClass.getCanonicalName}.") @@ -514,7 +514,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { assert(catalogTable.provider.get === format) case r: MetastoreRelation => - if (isDataSourceParquet) { + if (isDataSourceTable) { fail( s"${HadoopFsRelation.getClass.getCanonicalName} is expected, but found " + s"${classOf[MetastoreRelation].getCanonicalName}.") @@ -524,8 +524,15 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { assert(r.catalogTable.storage.locationUri.get === location) case None => // OK. } - // Also make sure that the format is the desired format. + // Also make sure that the format and serde are as desired. assert(catalogTable.storage.inputFormat.get.toLowerCase.contains(format)) + assert(catalogTable.storage.outputFormat.get.toLowerCase.contains(format)) + val serde = catalogTable.storage.serde.get + format match { + case "sequence" | "text" => assert(serde.contains("LazySimpleSerDe")) + case "rcfile" => assert(serde.contains("LazyBinaryColumnarSerDe")) + case _ => assert(serde.toLowerCase.contains(format)) + } } // When a user-specified location is defined, the table type needs to be EXTERNAL. @@ -587,6 +594,30 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } + test("CTAS with default fileformat") { + val table = "ctas1" + val ctas = s"CREATE TABLE IF NOT EXISTS $table SELECT key k, value FROM src" + withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") { + withSQLConf("hive.default.fileformat" -> "textfile") { + withTable(table) { + sql(ctas) + // We should use parquet here as that is the default datasource fileformat. The default + // datasource file format is controlled by `spark.sql.sources.default` configuration. + // This testcase verifies that setting `hive.default.fileformat` has no impact on + // the target table's fileformat in case of CTAS. 
+ assert(sessionState.conf.defaultDataSourceName === "parquet") + checkRelation(table, isDataSourceTable = true, "parquet") + } + } + withSQLConf("spark.sql.sources.default" -> "orc") { + withTable(table) { + sql(ctas) + checkRelation(table, isDataSourceTable = true, "orc") + } + } + } + } + test("CTAS without serde with location") { withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") { withTempDir { dir => From 522dd0d0e5af83e45a3c3526c191aa4b8bcaeeeb Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Fri, 14 Oct 2016 14:09:35 -0700 Subject: [PATCH 021/162] Revert "[SPARK-17620][SQL] Determine Serde by hive.default.fileformat when Creating Hive Serde Tables" This reverts commit 7ab86244e30ca81eb4fa40ea77b4c2b8881cbab2. --- .../spark/sql/execution/SparkSqlParser.scala | 4 +- .../spark/sql/hive/HiveDDLCommandSuite.scala | 26 +------------ .../sql/hive/execution/SQLQuerySuite.scala | 39 ++----------------- 3 files changed, 9 insertions(+), 60 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 8c68d1e3a2379..be2eddbb0e423 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -1010,7 +1010,9 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { .orElse(Some("org.apache.hadoop.mapred.TextInputFormat")), outputFormat = defaultHiveSerde.flatMap(_.outputFormat) .orElse(Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")), - serde = defaultHiveSerde.flatMap(_.serde), + // Note: Keep this unspecified because we use the presence of the serde to decide + // whether to convert a table created by CTAS to a datasource table. 
+ serde = None, compressed = false, properties = Map()) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala index 81337493c7f28..9ce3338647398 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala @@ -30,12 +30,10 @@ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{Generate, ScriptTransformation} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.CreateTable -import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton} -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.types.StructType -class HiveDDLCommandSuite extends PlanTest with SQLTestUtils with TestHiveSingleton { +class HiveDDLCommandSuite extends PlanTest { val parser = TestHive.sessionState.sqlParser private def extractTableDesc(sql: String): (CatalogTable, Boolean) = { @@ -558,24 +556,4 @@ class HiveDDLCommandSuite extends PlanTest with SQLTestUtils with TestHiveSingle assert(partition2.get.apply("c") == "1" && partition2.get.apply("d") == "2") } - test("Test the default fileformat for Hive-serde tables") { - withSQLConf("hive.default.fileformat" -> "orc") { - val (desc, exists) = extractTableDesc("CREATE TABLE IF NOT EXISTS fileformat_test (id int)") - assert(exists) - assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")) - assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")) - assert(desc.storage.serde == Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) - } - - withSQLConf("hive.default.fileformat" -> "parquet") { - val (desc, exists) = extractTableDesc("CREATE TABLE IF NOT EXISTS fileformat_test (id int)") - assert(exists) - val input = desc.storage.inputFormat - val output = desc.storage.outputFormat - val serde = desc.storage.serde - assert(input == Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")) - assert(output == Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")) - assert(serde == Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")) - } - } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 5798f47228216..6f2a16662bf10 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -492,7 +492,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { def checkRelation( tableName: String, - isDataSourceTable: Boolean, + isDataSourceParquet: Boolean, format: String, userSpecifiedLocation: Option[String] = None): Unit = { val relation = EliminateSubqueryAliases( @@ -501,7 +501,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { sessionState.catalog.getTableMetadata(TableIdentifier(tableName)) relation match { case LogicalRelation(r: HadoopFsRelation, _, _) => - if (!isDataSourceTable) { + if (!isDataSourceParquet) { fail( s"${classOf[MetastoreRelation].getCanonicalName} is expected, but found " + 
s"${HadoopFsRelation.getClass.getCanonicalName}.") @@ -514,7 +514,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { assert(catalogTable.provider.get === format) case r: MetastoreRelation => - if (isDataSourceTable) { + if (isDataSourceParquet) { fail( s"${HadoopFsRelation.getClass.getCanonicalName} is expected, but found " + s"${classOf[MetastoreRelation].getCanonicalName}.") @@ -524,15 +524,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { assert(r.catalogTable.storage.locationUri.get === location) case None => // OK. } - // Also make sure that the format and serde are as desired. + // Also make sure that the format is the desired format. assert(catalogTable.storage.inputFormat.get.toLowerCase.contains(format)) - assert(catalogTable.storage.outputFormat.get.toLowerCase.contains(format)) - val serde = catalogTable.storage.serde.get - format match { - case "sequence" | "text" => assert(serde.contains("LazySimpleSerDe")) - case "rcfile" => assert(serde.contains("LazyBinaryColumnarSerDe")) - case _ => assert(serde.toLowerCase.contains(format)) - } } // When a user-specified location is defined, the table type needs to be EXTERNAL. @@ -594,30 +587,6 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } - test("CTAS with default fileformat") { - val table = "ctas1" - val ctas = s"CREATE TABLE IF NOT EXISTS $table SELECT key k, value FROM src" - withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") { - withSQLConf("hive.default.fileformat" -> "textfile") { - withTable(table) { - sql(ctas) - // We should use parquet here as that is the default datasource fileformat. The default - // datasource file format is controlled by `spark.sql.sources.default` configuration. - // This testcase verifies that setting `hive.default.fileformat` has no impact on - // the target table's fileformat in case of CTAS. - assert(sessionState.conf.defaultDataSourceName === "parquet") - checkRelation(table, isDataSourceTable = true, "parquet") - } - } - withSQLConf("spark.sql.sources.default" -> "orc") { - withTable(table) { - sql(ctas) - checkRelation(table, isDataSourceTable = true, "orc") - } - } - } - } - test("CTAS without serde with location") { withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") { withTempDir { dir => From da9aeb0fde589f7c21c2f4a32036a68c0353965d Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 14 Oct 2016 14:45:20 -0700 Subject: [PATCH 022/162] [SPARK-17863][SQL] should not add column into Distinct ## What changes were proposed in this pull request? We are trying to resolve the attribute in sort by pulling up some column for grandchild into child, but that's wrong when the child is Distinct, because the added column will change the behavior of Distinct, we should not do that. ## How was this patch tested? Added regression test. Author: Davies Liu Closes #15489 from davies/order_distinct. 
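A minimal PySpark reproduction, mirroring the regression test added in the diff below (an active SparkSession named `spark` is assumed; this is illustrative, not part of the patch):

```python
from pyspark.sql.utils import AnalysisException

# Ordering by columns that are in the DISTINCT output is fine.
spark.sql("""
    SELECT DISTINCT struct.a, struct.b
    FROM (SELECT named_struct('a', 1, 'b', 2, 'c', 3) AS struct
          UNION ALL
          SELECT named_struct('a', 1, 'b', 2, 'c', 4) AS struct) tmp
    ORDER BY a, b
""").show()  # one row: (1, 2)

# Ordering by struct.a would require pulling `struct` up into the Distinct,
# which changes its semantics; after this patch it fails analysis instead.
try:
    spark.sql("""
        SELECT DISTINCT struct.a, struct.b
        FROM (SELECT named_struct('a', 1, 'b', 2, 'c', 3) AS struct
              UNION ALL
              SELECT named_struct('a', 1, 'b', 2, 'c', 4) AS struct) tmp
        ORDER BY struct.a, struct.b
    """).collect()
except AnalysisException as e:
    print(e)  # cannot resolve '`struct.a`' given input columns: [a, b]
```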
--- .../sql/catalyst/analysis/Analyzer.scala | 2 ++ .../org/apache/spark/sql/SQLQuerySuite.scala | 24 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 536d38777f89d..f8f4799322b3b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -838,6 +838,8 @@ class Analyzer( // attributes that its child might have or could have. val missing = missingAttrs -- g.child.outputSet g.copy(join = true, child = addMissingAttr(g.child, missing)) + case d: Distinct => + throw new AnalysisException(s"Can't add $missingAttrs to $d") case u: UnaryNode => u.withNewChildren(addMissingAttr(u.child, missingAttrs) :: Nil) case other => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 0ee8c959eeb4d..60978efddd7f8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1106,6 +1106,30 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { ) } + test("SPARK-17863: SELECT distinct does not work correctly if order by missing attribute") { + checkAnswer( + sql("""select distinct struct.a, struct.b + |from ( + | select named_struct('a', 1, 'b', 2, 'c', 3) as struct + | union all + | select named_struct('a', 1, 'b', 2, 'c', 4) as struct) tmp + |order by a, b + |""".stripMargin), + Row(1, 2) :: Nil) + + val error = intercept[AnalysisException] { + sql("""select distinct struct.a, struct.b + |from ( + | select named_struct('a', 1, 'b', 2, 'c', 3) as struct + | union all + | select named_struct('a', 1, 'b', 2, 'c', 4) as struct) tmp + |order by struct.a, struct.b + |""".stripMargin) + } + assert(error.message contains "cannot resolve '`struct.a`' given input columns: [a, b]") + + } + test("cast boolean to string") { // TODO Ensure true/false string letter casing is consistent with Hive in all cases. checkAnswer( From 5aeb7384c7aa5f487f031f9ae07d3f1653399d14 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Fri, 14 Oct 2016 15:07:32 -0700 Subject: [PATCH 023/162] [SPARK-16063][SQL] Add storageLevel to Dataset [SPARK-11905](https://issues.apache.org/jira/browse/SPARK-11905) added support for `persist`/`cache` for `Dataset`. However, there is no user-facing API to check if a `Dataset` is cached and if so what the storage level is. This PR adds `getStorageLevel` to `Dataset`, analogous to `RDD.getStorageLevel`. Updated `DatasetCacheSuite`. Author: Nick Pentreath Closes #13780 from MLnick/ds-storagelevel. Signed-off-by: Michael Armbrust --- python/pyspark/sql/dataframe.py | 36 +++++++++++++++---- .../scala/org/apache/spark/sql/Dataset.scala | 12 +++++++ .../apache/spark/sql/DatasetCacheSuite.scala | 36 +++++++++++++------ 3 files changed, 68 insertions(+), 16 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index ce277eb204d13..7606ac08bae67 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -407,24 +407,48 @@ def foreachPartition(self, f): @since(1.3) def cache(self): - """ Persists with the default storage level (C{MEMORY_ONLY}). + """Persists the :class:`DataFrame` with the default storage level (C{MEMORY_AND_DISK}). + + .. 
note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0. """ self.is_cached = True self._jdf.cache() return self @since(1.3) - def persist(self, storageLevel=StorageLevel.MEMORY_ONLY): - """Sets the storage level to persist its values across operations - after the first time it is computed. This can only be used to assign - a new storage level if the RDD does not have a storage level set yet. - If no storage level is specified defaults to (C{MEMORY_ONLY}). + def persist(self, storageLevel=StorageLevel.MEMORY_AND_DISK): + """Sets the storage level to persist the contents of the :class:`DataFrame` across + operations after the first time it is computed. This can only be used to assign + a new storage level if the :class:`DataFrame` does not have a storage level set yet. + If no storage level is specified defaults to (C{MEMORY_AND_DISK}). + + .. note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0. """ self.is_cached = True javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel) self._jdf.persist(javaStorageLevel) return self + @property + @since(2.1) + def storageLevel(self): + """Get the :class:`DataFrame`'s current storage level. + + >>> df.storageLevel + StorageLevel(False, False, False, False, 1) + >>> df.cache().storageLevel + StorageLevel(True, True, False, True, 1) + >>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel + StorageLevel(True, False, False, False, 2) + """ + java_storage_level = self._jdf.storageLevel() + storage_level = StorageLevel(java_storage_level.useDisk(), + java_storage_level.useMemory(), + java_storage_level.useOffHeap(), + java_storage_level.deserialized(), + java_storage_level.replication()) + return storage_level + @since(1.3) def unpersist(self, blocking=False): """Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index e59a483075c94..70c9cf5ae2440 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -2401,6 +2401,18 @@ class Dataset[T] private[sql]( this } + /** + * Get the Dataset's current storage level, or StorageLevel.NONE if not persisted. + * + * @group basic + * @since 2.1.0 + */ + def storageLevel: StorageLevel = { + sparkSession.sharedState.cacheManager.lookupCachedData(this).map { cachedData => + cachedData.cachedRepresentation.storageLevel + }.getOrElse(StorageLevel.NONE) + } + /** * Mark the Dataset as non-persistent, and remove all blocks for it from memory and disk. 
* diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala index 8d5e9645df894..e0561ee2797a5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala @@ -19,11 +19,32 @@ package org.apache.spark.sql import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.storage.StorageLevel class DatasetCacheSuite extends QueryTest with SharedSQLContext { import testImplicits._ + test("get storage level") { + val ds1 = Seq("1", "2").toDS().as("a") + val ds2 = Seq(2, 3).toDS().as("b") + + // default storage level + ds1.persist() + ds2.cache() + assert(ds1.storageLevel == StorageLevel.MEMORY_AND_DISK) + assert(ds2.storageLevel == StorageLevel.MEMORY_AND_DISK) + // unpersist + ds1.unpersist() + assert(ds1.storageLevel == StorageLevel.NONE) + // non-default storage level + ds1.persist(StorageLevel.MEMORY_ONLY_2) + assert(ds1.storageLevel == StorageLevel.MEMORY_ONLY_2) + // joined Dataset should not be persisted + val joined = ds1.joinWith(ds2, $"a.value" === $"b.value") + assert(joined.storageLevel == StorageLevel.NONE) + } + test("persist and unpersist") { val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS().select(expr("_2 + 1").as[Int]) val cached = ds.cache() @@ -37,8 +58,7 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext { 2, 3, 4) // Drop the cache. cached.unpersist() - assert(spark.sharedState.cacheManager.lookupCachedData(cached).isEmpty, - "The Dataset should not be cached.") + assert(cached.storageLevel == StorageLevel.NONE, "The Dataset should not be cached.") } test("persist and then rebind right encoder when join 2 datasets") { @@ -55,11 +75,9 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext { assertCached(joined, 2) ds1.unpersist() - assert(spark.sharedState.cacheManager.lookupCachedData(ds1).isEmpty, - "The Dataset ds1 should not be cached.") + assert(ds1.storageLevel == StorageLevel.NONE, "The Dataset ds1 should not be cached.") ds2.unpersist() - assert(spark.sharedState.cacheManager.lookupCachedData(ds2).isEmpty, - "The Dataset ds2 should not be cached.") + assert(ds2.storageLevel == StorageLevel.NONE, "The Dataset ds2 should not be cached.") } test("persist and then groupBy columns asKey, map") { @@ -74,10 +92,8 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext { assertCached(agged.filter(_._1 == "b")) ds.unpersist() - assert(spark.sharedState.cacheManager.lookupCachedData(ds).isEmpty, - "The Dataset ds should not be cached.") + assert(ds.storageLevel == StorageLevel.NONE, "The Dataset ds should not be cached.") agged.unpersist() - assert(spark.sharedState.cacheManager.lookupCachedData(agged).isEmpty, - "The Dataset agged should not be cached.") + assert(agged.storageLevel == StorageLevel.NONE, "The Dataset agged should not be cached.") } } From f00df40cfefef0f3fc73f16ada1006e4dcfa5a39 Mon Sep 17 00:00:00 2001 From: Jeff Zhang Date: Fri, 14 Oct 2016 15:50:35 -0700 Subject: [PATCH 024/162] [SPARK-11775][PYSPARK][SQL] Allow PySpark to register Java UDF Currently pyspark can only call the builtin java UDF, but can not call custom java UDF. It would be better to allow that. 2 benefits: * Leverage the power of rich third party java library * Improve the performance. Because if we use python UDF, python daemons will be started on worker which will affect the performance. 
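A sketch of the intended usage from Python, mirroring the doctest added in this patch. It assumes an existing `sqlContext` and that the example Java class `test.org.apache.spark.sql.JavaStringLength` (added under the test sources below) is on the classpath.

```python
from pyspark.sql.types import IntegerType

# Return type given explicitly...
sqlContext.registerJavaFunction(
    "javaStringLength", "test.org.apache.spark.sql.JavaStringLength", IntegerType())

# ...or omitted, in which case it is inferred by reflection on the UDF interface.
sqlContext.registerJavaFunction(
    "javaStringLength2", "test.org.apache.spark.sql.JavaStringLength")

sqlContext.sql("SELECT javaStringLength('test')").collect()  # [Row(UDF(test)=4)]
```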
Author: Jeff Zhang Closes #9766 from zjffdu/SPARK-11775. --- python/pyspark/sql/context.py | 28 ++++++- .../sql/catalyst/JavaTypeInference.scala | 2 +- .../apache/spark/sql/UDFRegistration.scala | 75 ++++++++++++++++++- .../apache/spark/sql/JavaStringLength.java | 30 ++++++++ .../org/apache/spark/sql/JavaUDFSuite.java | 21 ++++++ 5 files changed, 152 insertions(+), 4 deletions(-) create mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/JavaStringLength.java diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 8264dcf8a97d2..de4c335ad2752 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -28,7 +28,7 @@ from pyspark.sql.dataframe import DataFrame from pyspark.sql.readwriter import DataFrameReader from pyspark.sql.streaming import DataStreamReader -from pyspark.sql.types import Row, StringType +from pyspark.sql.types import IntegerType, Row, StringType from pyspark.sql.utils import install_exception_handler __all__ = ["SQLContext", "HiveContext", "UDFRegistration"] @@ -202,6 +202,32 @@ def registerFunction(self, name, f, returnType=StringType()): """ self.sparkSession.catalog.registerFunction(name, f, returnType) + @ignore_unicode_prefix + @since(2.1) + def registerJavaFunction(self, name, javaClassName, returnType=None): + """Register a java UDF so it can be used in SQL statements. + + In addition to a name and the function itself, the return type can be optionally specified. + When the return type is not specified we would infer it via reflection. + :param name: name of the UDF + :param javaClassName: fully qualified name of java class + :param returnType: a :class:`pyspark.sql.types.DataType` object + + >>> sqlContext.registerJavaFunction("javaStringLength", + ... "test.org.apache.spark.sql.JavaStringLength", IntegerType()) + >>> sqlContext.sql("SELECT javaStringLength('test')").collect() + [Row(UDF(test)=4)] + >>> sqlContext.registerJavaFunction("javaStringLength2", + ... 
"test.org.apache.spark.sql.JavaStringLength") + >>> sqlContext.sql("SELECT javaStringLength2('test')").collect() + [Row(UDF(test)=4)] + + """ + jdt = None + if returnType is not None: + jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json()) + self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt) + # TODO(andrew): delete this once we refactor things to take in SparkSession def _inferSchema(self, rdd, samplingRatio=None): """ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala index e6f61b00ebd70..04f0cfce883f2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala @@ -59,7 +59,7 @@ object JavaTypeInference { * @param typeToken Java type * @return (SQL data type, nullable) */ - private def inferDataType(typeToken: TypeToken[_]): (DataType, Boolean) = { + private[sql] def inferDataType(typeToken: TypeToken[_]): (DataType, Boolean) = { typeToken.getRawType match { case c: Class[_] if c.isAnnotationPresent(classOf[SQLUserDefinedType]) => (c.getAnnotation(classOf[SQLUserDefinedType]).udt().newInstance(), true) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index 617a14793697b..0444ad10d34fb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -17,19 +17,25 @@ package org.apache.spark.sql +import java.io.IOException +import java.lang.reflect.{ParameterizedType, Type} + import scala.reflect.runtime.universe.TypeTag import scala.util.Try +import com.google.common.reflect.TypeToken + import org.apache.spark.annotation.InterfaceStability import org.apache.spark.internal.Logging import org.apache.spark.sql.api.java._ +import org.apache.spark.sql.catalyst.{JavaTypeInference, ScalaReflection} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry -import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF} import org.apache.spark.sql.execution.aggregate.ScalaUDAF import org.apache.spark.sql.execution.python.UserDefinedPythonFunction import org.apache.spark.sql.expressions.{UserDefinedAggregateFunction, UserDefinedFunction} -import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.types.{DataType, DataTypes} +import org.apache.spark.util.Utils /** * Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this. @@ -413,6 +419,71 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends ////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////// + /** + * Register a Java UDF class using reflection, for use from pyspark + * + * @param name udf name + * @param className fully qualified class name of udf + * @param returnDataType return type of udf. If it is null, spark would try to infer + * via reflection. 
+ */ + private[sql] def registerJava(name: String, className: String, returnDataType: DataType): Unit = { + + try { + val clazz = Utils.classForName(className) + val udfInterfaces = clazz.getGenericInterfaces + .filter(_.isInstanceOf[ParameterizedType]) + .map(_.asInstanceOf[ParameterizedType]) + .filter(e => e.getRawType.isInstanceOf[Class[_]] && e.getRawType.asInstanceOf[Class[_]].getCanonicalName.startsWith("org.apache.spark.sql.api.java.UDF")) + if (udfInterfaces.length == 0) { + throw new IOException(s"UDF class ${className} doesn't implement any UDF interface") + } else if (udfInterfaces.length > 1) { + throw new IOException(s"It is invalid to implement multiple UDF interfaces, UDF class ${className}") + } else { + try { + val udf = clazz.newInstance() + val udfReturnType = udfInterfaces(0).getActualTypeArguments.last + var returnType = returnDataType + if (returnType == null) { + returnType = JavaTypeInference.inferDataType(TypeToken.of(udfReturnType))._1 + } + + udfInterfaces(0).getActualTypeArguments.length match { + case 2 => register(name, udf.asInstanceOf[UDF1[_, _]], returnType) + case 3 => register(name, udf.asInstanceOf[UDF2[_, _, _]], returnType) + case 4 => register(name, udf.asInstanceOf[UDF3[_, _, _, _]], returnType) + case 5 => register(name, udf.asInstanceOf[UDF4[_, _, _, _, _]], returnType) + case 6 => register(name, udf.asInstanceOf[UDF5[_, _, _, _, _, _]], returnType) + case 7 => register(name, udf.asInstanceOf[UDF6[_, _, _, _, _, _, _]], returnType) + case 8 => register(name, udf.asInstanceOf[UDF7[_, _, _, _, _, _, _, _]], returnType) + case 9 => register(name, udf.asInstanceOf[UDF8[_, _, _, _, _, _, _, _, _]], returnType) + case 10 => register(name, udf.asInstanceOf[UDF9[_, _, _, _, _, _, _, _, _, _]], returnType) + case 11 => register(name, udf.asInstanceOf[UDF10[_, _, _, _, _, _, _, _, _, _, _]], returnType) + case 12 => register(name, udf.asInstanceOf[UDF11[_, _, _, _, _, _, _, _, _, _, _, _]], returnType) + case 13 => register(name, udf.asInstanceOf[UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) + case 14 => register(name, udf.asInstanceOf[UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) + case 15 => register(name, udf.asInstanceOf[UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) + case 16 => register(name, udf.asInstanceOf[UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) + case 17 => register(name, udf.asInstanceOf[UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) + case 18 => register(name, udf.asInstanceOf[UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) + case 19 => register(name, udf.asInstanceOf[UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) + case 20 => register(name, udf.asInstanceOf[UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) + case 21 => register(name, udf.asInstanceOf[UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) + case 22 => register(name, udf.asInstanceOf[UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) + case 23 => register(name, udf.asInstanceOf[UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) + case n => logError(s"UDF class with ${n} type arguments is not supported ") + } + } catch { + case e @ (_: InstantiationException | _: IllegalArgumentException) => + logError(s"Can not instantiate class ${className}, please make sure it has public non argument constructor") 
+ } + } + } catch { + case e: ClassNotFoundException => logError(s"Can not load class ${className}, please make sure it is on the classpath") + } + + } + /** * Register a user-defined function with 1 arguments. * @since 1.3.0 diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaStringLength.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaStringLength.java new file mode 100644 index 0000000000000..b90224f2ae397 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaStringLength.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package test.org.apache.spark.sql; + +import org.apache.spark.sql.api.java.UDF1; + +/** + * It is used for register Java UDF from PySpark + */ +public class JavaStringLength implements UDF1 { + @Override + public Integer call(String str) throws Exception { + return new Integer(str.length()); + } +} diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java index 2274912521a56..8bf3278c43880 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java @@ -87,4 +87,25 @@ public Integer call(String str1, String str2) { Row result = spark.sql("SELECT stringLengthTest('test', 'test2')").head(); Assert.assertEquals(9, result.getInt(0)); } + + public static class StringLengthTest implements UDF2 { + @Override + public Integer call(String str1, String str2) throws Exception { + return new Integer(str1.length() + str2.length()); + } + } + + @SuppressWarnings("unchecked") + @Test + public void udf3Test() { + spark.udf().registerJava("stringLengthTest", StringLengthTest.class.getName(), + DataTypes.IntegerType); + Row result = spark.sql("SELECT stringLengthTest('test', 'test2')").head(); + Assert.assertEquals(9, result.getInt(0)); + + // returnType is not provided + spark.udf().registerJava("stringLengthTest2", StringLengthTest.class.getName(), null); + result = spark.sql("SELECT stringLengthTest('test', 'test2')").head(); + Assert.assertEquals(9, result.getInt(0)); + } } From 72adfbf94ab6a6ce2a5f3111140274476150f201 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Fri, 14 Oct 2016 16:13:42 -0700 Subject: [PATCH 025/162] [SPARK-17900][SQL] Graduate a list of Spark SQL APIs to stable ## What changes were proposed in this pull request? This patch graduates a list of Spark SQL APIs and mark them stable. 
The following are marked stable: Dataset/DataFrame - functions, since 1.3 - ColumnName, since 1.3 - DataFrameNaFunctions, since 1.3.1 - DataFrameStatFunctions, since 1.4 - UserDefinedFunction, since 1.3 - UserDefinedAggregateFunction, since 1.5 - Window and WindowSpec, since 1.4 Data sources: - DataSourceRegister, since 1.5 - RelationProvider, since 1.3 - SchemaRelationProvider, since 1.3 - CreatableRelationProvider, since 1.3 - BaseRelation, since 1.3 - TableScan, since 1.3 - PrunedScan, since 1.3 - PrunedFilteredScan, since 1.3 - InsertableRelation, since 1.3 The following are kept experimental / evolving: Data sources: - CatalystScan (tied to internal logical plans so it is not stable by definition) Structured streaming: - all classes (introduced new in 2.0 and will likely change) Dataset typed operations (introduced in 1.6 and 2.0 and might change, although probability is low) - all typed methods on Dataset - KeyValueGroupedDataset - o.a.s.sql.expressions.javalang.typed - o.a.s.sql.expressions.scalalang.typed - methods that return typed Dataset in SparkSession We should discuss more whether we want to mark Dataset typed operations stable in 2.1. ## How was this patch tested? N/A - just annotation changes. Author: Reynold Xin Closes #15469 from rxin/SPARK-17900. --- .../scala/org/apache/spark/sql/Column.scala | 6 ++-- .../spark/sql/DataFrameNaFunctions.scala | 6 ++-- .../spark/sql/DataFrameStatFunctions.scala | 6 ++-- .../sql/expressions/UserDefinedFunction.scala | 10 ++++-- .../apache/spark/sql/expressions/Window.scala | 10 ++---- .../spark/sql/expressions/WindowSpec.scala | 6 ++-- .../apache/spark/sql/expressions/udaf.scala | 30 ++++++++++++---- .../org/apache/spark/sql/functions.scala | 4 +-- .../apache/spark/sql/sources/interfaces.scala | 35 +++++-------------- 9 files changed, 51 insertions(+), 62 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index d22bb17934ce7..05e867bf5be96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import scala.language.implicitConversions -import org.apache.spark.annotation.{Experimental, InterfaceStability} +import org.apache.spark.annotation.InterfaceStability import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder} @@ -1181,13 +1181,11 @@ class Column(protected[sql] val expr: Expression) extends Logging { /** - * :: Experimental :: * A convenient class used for constructing schema. 
* * @since 1.3.0 */ -@Experimental -@InterfaceStability.Evolving +@InterfaceStability.Stable class ColumnName(name: String) extends Column(name) { /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala index 65a9c008f9650..0d43f09bc54cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala @@ -21,20 +21,18 @@ import java.{lang => jl} import scala.collection.JavaConverters._ -import org.apache.spark.annotation.{Experimental, InterfaceStability} +import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ /** - * :: Experimental :: * Functionality for working with missing data in [[DataFrame]]s. * * @since 1.3.1 */ -@Experimental -@InterfaceStability.Evolving +@InterfaceStability.Stable final class DataFrameNaFunctions private[sql](df: DataFrame) { /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index a212bb6205328..b5bbcee37150f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -21,20 +21,18 @@ import java.{lang => jl, util => ju} import scala.collection.JavaConverters._ -import org.apache.spark.annotation.{Experimental, InterfaceStability} +import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.stat._ import org.apache.spark.sql.types._ import org.apache.spark.util.sketch.{BloomFilter, CountMinSketch} /** - * :: Experimental :: * Statistic functions for [[DataFrame]]s. * * @since 1.4.0 */ -@Experimental -@InterfaceStability.Evolving +@InterfaceStability.Stable final class DataFrameStatFunctions private[sql](df: DataFrame) { /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index 2e0e937e4aff7..28598af781653 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.expressions -import org.apache.spark.annotation.{Experimental, InterfaceStability} +import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.expressions.ScalaUDF import org.apache.spark.sql.Column import org.apache.spark.sql.functions @@ -39,13 +39,17 @@ import org.apache.spark.sql.types.DataType * * @since 1.3.0 */ -@Experimental -@InterfaceStability.Evolving +@InterfaceStability.Stable case class UserDefinedFunction protected[sql] ( f: AnyRef, dataType: DataType, inputTypes: Option[Seq[DataType]]) { + /** + * Returns an expression that invokes the UDF, using the given arguments. 
+ * + * @since 1.3.0 + */ def apply(exprs: Column*): Column = { Column(ScalaUDF(f, dataType, exprs.map(_.expr), inputTypes.getOrElse(Nil))) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala index 07ef60183f6fb..0b26d863cac5d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala @@ -17,12 +17,11 @@ package org.apache.spark.sql.expressions -import org.apache.spark.annotation.{Experimental, InterfaceStability} +import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.expressions._ /** - * :: Experimental :: * Utility functions for defining window in DataFrames. * * {{{ @@ -36,8 +35,7 @@ import org.apache.spark.sql.catalyst.expressions._ * * @since 1.4.0 */ -@Experimental -@InterfaceStability.Evolving +@InterfaceStability.Stable object Window { /** @@ -164,7 +162,6 @@ object Window { } /** - * :: Experimental :: * Utility functions for defining window in DataFrames. * * {{{ @@ -177,6 +174,5 @@ object Window { * * @since 1.4.0 */ -@Experimental -@InterfaceStability.Evolving +@InterfaceStability.Stable class Window private() // So we can see Window in JavaDoc. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala index 18778c8d1c294..1e85b6e7881ad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala @@ -17,20 +17,18 @@ package org.apache.spark.sql.expressions -import org.apache.spark.annotation.{Experimental, InterfaceStability} +import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.expressions._ /** - * :: Experimental :: * A window specification that defines the partitioning, ordering, and frame boundaries. * * Use the static methods in [[Window]] to create a [[WindowSpec]]. * * @since 1.4.0 */ -@Experimental -@InterfaceStability.Evolving +@InterfaceStability.Stable class WindowSpec private[sql]( partitionSpec: Seq[Expression], orderSpec: Seq[SortOrder], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala index ef7c09c72b82d..bc9788d81fe6a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala @@ -17,20 +17,18 @@ package org.apache.spark.sql.expressions -import org.apache.spark.annotation.{Experimental, InterfaceStability} +import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} import org.apache.spark.sql.execution.aggregate.ScalaUDAF import org.apache.spark.sql.types._ /** - * :: Experimental :: * The base class for implementing user-defined aggregate functions (UDAF). 
* * @since 1.5.0 */ -@Experimental -@InterfaceStability.Evolving +@InterfaceStability.Stable abstract class UserDefinedAggregateFunction extends Serializable { /** @@ -46,6 +44,8 @@ abstract class UserDefinedAggregateFunction extends Serializable { * * The name of a field of this [[StructType]] is only used to identify the corresponding * input argument. Users can choose names to identify the input arguments. + * + * @since 1.5.0 */ def inputSchema: StructType @@ -63,17 +63,23 @@ abstract class UserDefinedAggregateFunction extends Serializable { * * The name of a field of this [[StructType]] is only used to identify the corresponding * buffer value. Users can choose names to identify the input arguments. + * + * @since 1.5.0 */ def bufferSchema: StructType /** * The [[DataType]] of the returned value of this [[UserDefinedAggregateFunction]]. + * + * @since 1.5.0 */ def dataType: DataType /** * Returns true iff this function is deterministic, i.e. given the same input, * always return the same output. + * + * @since 1.5.0 */ def deterministic: Boolean @@ -83,6 +89,8 @@ abstract class UserDefinedAggregateFunction extends Serializable { * The contract should be that applying the merge function on two initial buffers should just * return the initial buffer itself, i.e. * `merge(initialBuffer, initialBuffer)` should equal `initialBuffer`. + * + * @since 1.5.0 */ def initialize(buffer: MutableAggregationBuffer): Unit @@ -90,6 +98,8 @@ abstract class UserDefinedAggregateFunction extends Serializable { * Updates the given aggregation buffer `buffer` with new input data from `input`. * * This is called once per input row. + * + * @since 1.5.0 */ def update(buffer: MutableAggregationBuffer, input: Row): Unit @@ -97,17 +107,23 @@ abstract class UserDefinedAggregateFunction extends Serializable { * Merges two aggregation buffers and stores the updated buffer values back to `buffer1`. * * This is called when we merge two partially aggregated data together. + * + * @since 1.5.0 */ def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit /** * Calculates the final result of this [[UserDefinedAggregateFunction]] based on the given * aggregation buffer. + * + * @since 1.5.0 */ def evaluate(buffer: Row): Any /** * Creates a [[Column]] for this UDAF using given [[Column]]s as input arguments. + * + * @since 1.5.0 */ @scala.annotation.varargs def apply(exprs: Column*): Column = { @@ -122,6 +138,8 @@ abstract class UserDefinedAggregateFunction extends Serializable { /** * Creates a [[Column]] for this UDAF using the distinct values of the given * [[Column]]s as input arguments. + * + * @since 1.5.0 */ @scala.annotation.varargs def distinct(exprs: Column*): Column = { @@ -135,15 +153,13 @@ abstract class UserDefinedAggregateFunction extends Serializable { } /** - * :: Experimental :: * A [[Row]] representing a mutable aggregation buffer. * * This is not meant to be extended outside of Spark. * * @since 1.5.0 */ -@Experimental -@InterfaceStability.Evolving +@InterfaceStability.Stable abstract class MutableAggregationBuffer extends Row { /** Update the ith value of this buffer. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index de4943152720c..5f1efd22d8204 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -37,7 +37,6 @@ import org.apache.spark.util.Utils /** - * :: Experimental :: * Functions available for DataFrame operations. * * @groupname udf_funcs UDF functions @@ -53,8 +52,7 @@ import org.apache.spark.util.Utils * @groupname Ungrouped Support functions for DataFrames * @since 1.3.0 */ -@Experimental -@InterfaceStability.Evolving +@InterfaceStability.Stable // scalastyle:off object functions { // scalastyle:on diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index 3172d5ded9504..15a48072525b2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -27,7 +27,6 @@ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType /** - * ::DeveloperApi:: * Data sources should implement this trait so that they can register an alias to their data source. * This allows users to give the data source alias as the format type over the fully qualified * class name. @@ -36,8 +35,7 @@ import org.apache.spark.sql.types.StructType * * @since 1.5.0 */ -@DeveloperApi -@InterfaceStability.Evolving +@InterfaceStability.Stable trait DataSourceRegister { /** @@ -54,7 +52,6 @@ trait DataSourceRegister { } /** - * ::DeveloperApi:: * Implemented by objects that produce relations for a specific kind of data source. When * Spark SQL is given a DDL operation with a USING clause specified (to specify the implemented * RelationProvider), this interface is used to pass in the parameters specified by a user. @@ -68,8 +65,7 @@ trait DataSourceRegister { * * @since 1.3.0 */ -@DeveloperApi -@InterfaceStability.Evolving +@InterfaceStability.Stable trait RelationProvider { /** * Returns a new base relation with the given parameters. @@ -80,7 +76,6 @@ trait RelationProvider { } /** - * ::DeveloperApi:: * Implemented by objects that produce relations for a specific kind of data source * with a given schema. When Spark SQL is given a DDL operation with a USING clause specified ( * to specify the implemented SchemaRelationProvider) and a user defined schema, this interface @@ -100,8 +95,7 @@ trait RelationProvider { * * @since 1.3.0 */ -@DeveloperApi -@InterfaceStability.Evolving +@InterfaceStability.Stable trait SchemaRelationProvider { /** * Returns a new base relation with the given parameters and user defined schema. @@ -164,8 +158,7 @@ trait StreamSinkProvider { /** * @since 1.3.0 */ -@DeveloperApi -@InterfaceStability.Evolving +@InterfaceStability.Stable trait CreatableRelationProvider { /** * Save the DataFrame to the destination and return a relation with the given parameters based on @@ -189,7 +182,6 @@ trait CreatableRelationProvider { } /** - * ::DeveloperApi:: * Represents a collection of tuples with a known schema. Classes that extend BaseRelation must * be able to produce the schema of their data in the form of a [[StructType]]. 
Concrete * implementation should inherit from one of the descendant `Scan` classes, which define various @@ -201,8 +193,7 @@ trait CreatableRelationProvider { * * @since 1.3.0 */ -@DeveloperApi -@InterfaceStability.Evolving +@InterfaceStability.Stable abstract class BaseRelation { def sqlContext: SQLContext def schema: StructType @@ -248,32 +239,27 @@ abstract class BaseRelation { } /** - * ::DeveloperApi:: * A BaseRelation that can produce all of its tuples as an RDD of Row objects. * * @since 1.3.0 */ -@DeveloperApi -@InterfaceStability.Evolving +@InterfaceStability.Stable trait TableScan { def buildScan(): RDD[Row] } /** - * ::DeveloperApi:: * A BaseRelation that can eliminate unneeded columns before producing an RDD * containing all of its tuples as Row objects. * * @since 1.3.0 */ -@DeveloperApi -@InterfaceStability.Evolving +@InterfaceStability.Stable trait PrunedScan { def buildScan(requiredColumns: Array[String]): RDD[Row] } /** - * ::DeveloperApi:: * A BaseRelation that can eliminate unneeded columns and filter using selected * predicates before producing an RDD containing all matching tuples as Row objects. * @@ -286,14 +272,12 @@ trait PrunedScan { * * @since 1.3.0 */ -@DeveloperApi -@InterfaceStability.Evolving +@InterfaceStability.Stable trait PrunedFilteredScan { def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] } /** - * ::DeveloperApi:: * A BaseRelation that can be used to insert data into it through the insert method. * If overwrite in insert method is true, the old data in the relation should be overwritten with * the new data. If overwrite in insert method is false, the new data should be appended. @@ -310,8 +294,7 @@ trait PrunedFilteredScan { * * @since 1.3.0 */ -@DeveloperApi -@InterfaceStability.Evolving +@InterfaceStability.Stable trait InsertableRelation { def insert(data: DataFrame, overwrite: Boolean): Unit } From 2d96d35dc0fed6df249606d9ce9272c0f0109fa2 Mon Sep 17 00:00:00 2001 From: Srinath Shankar Date: Fri, 14 Oct 2016 18:24:47 -0700 Subject: [PATCH 026/162] [SPARK-17946][PYSPARK] Python crossJoin API similar to Scala ## What changes were proposed in this pull request? Add a crossJoin function to the DataFrame API similar to that in Scala. Joins with no condition (cartesian products) must be specified with the crossJoin API ## How was this patch tested? Added python tests to ensure that an AnalysisException if a cartesian product is specified without crossJoin(), and that cartesian products can execute if specified via crossJoin() (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. Author: Srinath Shankar Closes #15493 from srinathshankar/crosspython. 
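A short usage sketch, mirroring the docstring and tests added in the diff below (an active SparkSession named `spark` is assumed; the data is made up):

```python
df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
df2 = spark.createDataFrame([("Tom", 80), ("Bob", 85)], ["name", "height"])

# Cartesian products must now be requested explicitly:
df.crossJoin(df2.select("height")).select("age", "name", "height").show()

# A condition-less join is no longer an implicit cartesian product; with the
# default settings df.join(df2).collect() raises AnalysisException unless
# crossJoin() is used (or spark.sql.crossJoin.enabled is set to true).
```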
--- python/pyspark/sql/dataframe.py | 26 +++++++++++++++---- python/pyspark/sql/tests.py | 15 ++++++++++- .../scala/org/apache/spark/sql/Dataset.scala | 2 +- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 7606ac08bae67..29710acf54c4f 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -650,6 +650,25 @@ def alias(self, alias): assert isinstance(alias, basestring), "alias should be a string" return DataFrame(getattr(self._jdf, "as")(alias), self.sql_ctx) + @ignore_unicode_prefix + @since(2.1) + def crossJoin(self, other): + """Returns the cartesian product with another :class:`DataFrame`. + + :param other: Right side of the cartesian product. + + >>> df.select("age", "name").collect() + [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + >>> df2.select("name", "height").collect() + [Row(name=u'Tom', height=80), Row(name=u'Bob', height=85)] + >>> df.crossJoin(df2.select("height")).select("age", "name", "height").collect() + [Row(age=2, name=u'Alice', height=80), Row(age=2, name=u'Alice', height=85), + Row(age=5, name=u'Bob', height=80), Row(age=5, name=u'Bob', height=85)] + """ + + jdf = self._jdf.crossJoin(other._jdf) + return DataFrame(jdf, self.sql_ctx) + @ignore_unicode_prefix @since(1.3) def join(self, other, on=None, how=None): @@ -690,14 +709,11 @@ def join(self, other, on=None, how=None): on = self._jseq(on) else: assert isinstance(on[0], Column), "on should be Column or list of Column" - if len(on) > 1: - on = reduce(lambda x, y: x.__and__(y), on) - else: - on = on[0] + on = reduce(lambda x, y: x.__and__(y), on) on = on._jc if on is None and how is None: - jdf = self._jdf.crossJoin(other._jdf) + jdf = self._jdf.join(other._jdf) else: if how is None: how = "inner" diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 51d5e7ab0568e..3d46b852c52e1 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1466,7 +1466,7 @@ def test_functions_broadcast(self): self.assertEqual(1, plan1.toString().count("BroadcastHashJoin")) # no join key -- should not be a broadcast join - plan2 = df1.join(broadcast(df2))._jdf.queryExecution().executedPlan() + plan2 = df1.crossJoin(broadcast(df2))._jdf.queryExecution().executedPlan() self.assertEqual(0, plan2.toString().count("BroadcastHashJoin")) # planner should not crash without a join @@ -1514,6 +1514,19 @@ def test_invalid_join_method(self): df2 = self.spark.createDataFrame([("Alice", 80), ("Bob", 90)], ["name", "height"]) self.assertRaises(IllegalArgumentException, lambda: df1.join(df2, how="invalid-join-type")) + # Cartesian products require cross join syntax + def test_require_cross(self): + from pyspark.sql.functions import broadcast + + df1 = self.spark.createDataFrame([(1, "1")], ("key", "value")) + df2 = self.spark.createDataFrame([(1, "1")], ("key", "value")) + + # joins without conditions require cross join syntax + self.assertRaises(AnalysisException, lambda: df1.join(df2).collect()) + + # works with crossJoin + self.assertEqual(1, df1.crossJoin(df2).count()) + def test_conf(self): spark = self.spark spark.conf.set("bogo", "sipeo") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 70c9cf5ae2440..7ae3275245c5d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -774,7 +774,7 @@ class Dataset[T] 
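To make the intended effect concrete, a hypothetical example of the scenario this flow optimizes (the table, partition column, and values are made up; a Hive-enabled SparkSession named `spark` is assumed):

```python
# A partitioned Hive-serde table with, say, one partition per day.
spark.sql("""
    CREATE TABLE events (value STRING)
    PARTITIONED BY (ds STRING)
    STORED AS PARQUET
""")
# ... many partitions added over time, e.g.
#     ALTER TABLE events ADD PARTITION (ds='2016-10-01') ...

# With this change, planning the query below asks the metastore only for the
# partitions satisfying ds = '2016-10-01' (via listPartitionsByFilter), rather
# than loading metadata and listing files for every partition of `events`.
spark.sql("SELECT count(*) FROM events WHERE ds = '2016-10-01'").show()
```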
private[sql]( * @param right Right side of the join operation. * * @group untypedrel - * @since 2.0.0 + * @since 2.1.0 */ def crossJoin(right: Dataset[_]): DataFrame = withPlan { Join(logicalPlan, right.logicalPlan, joinType = Cross, None) From 6ce1b675ee9fc9a6034439c3ca00441f9f172f84 Mon Sep 17 00:00:00 2001 From: Michael Allman Date: Fri, 14 Oct 2016 18:26:18 -0700 Subject: [PATCH 027/162] [SPARK-16980][SQL] Load only catalog table partition metadata required to answer a query (This PR addresses https://issues.apache.org/jira/browse/SPARK-16980.) ## What changes were proposed in this pull request? In a new Spark session, when a partitioned Hive table is converted to use Spark's `HadoopFsRelation` in `HiveMetastoreCatalog`, metadata for every partition of that table are retrieved from the metastore and loaded into driver memory. In addition, every partition's metadata files are read from the filesystem to perform schema inference. If a user queries such a table with predicates which prune that table's partitions, we would like to be able to answer that query without consulting partition metadata which are not involved in the query. When querying a table with a large number of partitions for some data from a small number of partitions (maybe even a single partition), the current conversion strategy is highly inefficient. I suspect this scenario is not uncommon in the wild. In addition to being inefficient in running time, the current strategy is inefficient in its use of driver memory. When the sum of the number of partitions of all tables loaded in a driver reaches a certain level (somewhere in the tens of thousands), their cached data exhaust all driver heap memory in the default configuration. I suspect this scenario is less common (in that not too many deployments work with tables with tens of thousands of partitions), however this does illustrate how large the memory footprint of this metadata can be. With tables with hundreds or thousands of partitions, I would expect the `HiveMetastoreCatalog` table cache to represent a significant portion of the driver's heap space. This PR proposes an alternative approach. Basically, it makes four changes: 1. It adds a new method, `listPartitionsByFilter` to the Catalyst `ExternalCatalog` trait which returns the partition metadata for a given sequence of partition pruning predicates. 1. It refactors the `FileCatalog` type hierarchy to include a new `TableFileCatalog` to efficiently return files only for partitions matching a sequence of partition pruning predicates. 1. It removes partition loading and caching from `HiveMetastoreCatalog`. 1. It adds a new Catalyst optimizer rule, `PruneFileSourcePartitions`, which applies a plan's partition-pruning predicates to prune out unnecessary partition files from a `HadoopFsRelation`'s underlying file catalog. The net effect is that when a query over a partitioned Hive table is planned, the analyzer retrieves the table metadata from `HiveMetastoreCatalog`. As part of this operation, the `HiveMetastoreCatalog` builds a `HadoopFsRelation` with a `TableFileCatalog`. It does not load any partition metadata or scan any files. The optimizer prunes-away unnecessary table partitions by sending the partition-pruning predicates to the relation's `TableFileCatalog `. The `TableFileCatalog` in turn calls the `listPartitionsByFilter` method on its external catalog. This queries the Hive metastore, passing along those filters. 
As a bonus, performing partition pruning during optimization leads to a more accurate relation size estimate. This, along with c481bdf, can lead to automatic, safe application of the broadcast optimization in a join where it might previously have been omitted. ## Open Issues 1. This PR omits partition metadata caching. I can add this once the overall strategy for the cold path is established, perhaps in a future PR. 1. This PR removes and omits partitioned Hive table schema reconciliation. As a result, it fails to find Parquet schema columns with upper case letters because of the Hive metastore's case-insensitivity. This issue may be fixed by #14750, but that PR appears to have stalled. ericl has contributed to this PR a workaround for Parquet wherein schema reconciliation occurs at query execution time instead of planning. Whether ORC requires a similar patch is an open issue. 1. This PR omits an implementation of `listPartitionsByFilter` for the `InMemoryCatalog`. 1. This PR breaks parquet log output redirection during query execution. I can work around this by running `Class.forName("org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$")` first thing in a Spark shell session, but I haven't figured out how to fix this properly. ## How was this patch tested? The current Spark unit tests were run, and some ad-hoc tests were performed to validate that only the necessary partition metadata is loaded. Author: Michael Allman Author: Eric Liang Author: Eric Liang Closes #14690 from mallman/spark-16980-lazy_partition_fetching. --- .../spark/metrics/source/StaticSources.scala | 34 ++- .../catalyst/catalog/ExternalCatalog.scala | 5 +- .../catalyst/catalog/InMemoryCatalog.scala | 4 +- .../sql/catalyst/catalog/interface.scala | 15 +- .../scala/org/apache/spark/sql/Dataset.scala | 4 +- .../spark/sql/execution/CacheManager.scala | 2 +- .../sql/execution/DataSourceScanExec.scala | 28 ++- .../spark/sql/execution/SparkOptimizer.scala | 2 + .../command/createDataSourceTables.scala | 2 +- .../execution/datasources/DataSource.scala | 4 +- .../datasources/DataSourceStrategy.scala | 8 +- .../execution/datasources/FileFormat.scala | 46 +++- .../datasources/HadoopFsRelation.scala | 16 +- .../datasources/ListingFileCatalog.scala | 197 +-------------- .../datasources/LogicalRelation.scala | 2 +- .../PartitioningAwareFileCatalog.scala | 24 +- .../PruneFileSourcePartitions.scala | 72 ++++++ .../datasources/SessionFileCatalog.scala | 225 ++++++++++++++++++ .../datasources/TableFileCatalog.scala | 113 +++++++++ .../parquet/ParquetReadSupport.scala | 6 +- .../streaming/MetadataLogFileCatalog.scala | 2 +- .../apache/spark/sql/internal/SQLConf.scala | 9 + .../datasources/FileCatalogSuite.scala | 5 +- .../datasources/FileSourceStrategySuite.scala | 2 +- ...te.scala => SessionFileCatalogSuite.scala} | 16 +- .../ParquetPartitionDiscoverySuite.scala | 6 +- .../parquet/ParquetSchemaSuite.scala | 28 +++ .../spark/sql/hive/HiveExternalCatalog.scala | 37 ++- .../spark/sql/hive/HiveMetastoreCatalog.scala | 126 ++++------ .../spark/sql/hive/client/HiveClient.scala | 15 +- .../sql/hive/client/HiveClientImpl.scala | 19 +- .../spark/sql/hive/orc/OrcFileFormat.scala | 12 +- .../spark/sql/hive/HiveDataFrameSuite.scala | 109 ++++++++- .../sql/hive/HiveMetadataCacheSuite.scala | 41 ++++ .../spark/sql/hive/client/VersionsSuite.scala | 4 +- .../spark/sql/hive/orc/OrcQuerySuite.scala | 22 ++ .../apache/spark/sql/hive/parquetSuites.scala | 20 +- 37 files changed, 914 insertions(+), 368 deletions(-) create mode 100644 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala rename sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/{ListingFileCatalogSuite.scala => SessionFileCatalogSuite.scala} (66%) diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala index 6bba259acc391..cf92a10deabd5 100644 --- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala +++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala @@ -26,7 +26,7 @@ private[spark] object StaticSources { * The set of all static sources. These sources may be reported to from any class, including * static classes, without requiring reference to a SparkEnv. */ - val allSources = Seq(CodegenMetrics) + val allSources = Seq(CodegenMetrics, HiveCatalogMetrics) } /** @@ -60,3 +60,35 @@ object CodegenMetrics extends Source { val METRIC_GENERATED_METHOD_BYTECODE_SIZE = metricRegistry.histogram(MetricRegistry.name("generatedMethodSize")) } + +/** + * :: Experimental :: + * Metrics for access to the hive external catalog. + */ +@Experimental +object HiveCatalogMetrics extends Source { + override val sourceName: String = "HiveExternalCatalog" + override val metricRegistry: MetricRegistry = new MetricRegistry() + + /** + * Tracks the total number of partition metadata entries fetched via the client api. + */ + val METRIC_PARTITIONS_FETCHED = metricRegistry.counter(MetricRegistry.name("partitionsFetched")) + + /** + * Tracks the total number of files discovered off of the filesystem by ListingFileCatalog. + */ + val METRIC_FILES_DISCOVERED = metricRegistry.counter(MetricRegistry.name("filesDiscovered")) + + /** + * Resets the values of all metrics to zero. This is useful in tests. + */ + def reset(): Unit = { + METRIC_PARTITIONS_FETCHED.dec(METRIC_PARTITIONS_FETCHED.getCount()) + METRIC_FILES_DISCOVERED.dec(METRIC_FILES_DISCOVERED.getCount()) + } + + // clients can use these to avoid classloader issues with the codahale classes + def incrementFetchedPartitions(n: Int): Unit = METRIC_PARTITIONS_FETCHED.inc(n) + def incrementFilesDiscovered(n: Int): Unit = METRIC_FILES_DISCOVERED.inc(n) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 348d3d0be2152..a5e02523d2889 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -198,11 +198,12 @@ abstract class ExternalCatalog { partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] /** - * List the metadata of selected partitions according to the given partition predicates. + * List the metadata of partitions that belong to the specified table, assuming it exists, that + * satisfy the given partition-pruning predicate expressions. 
* * @param db database name * @param table table name - * @param predicates partition predicated + * @param predicates partition-pruning predicates */ def listPartitionsByFilter( db: String, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 49280f82e20be..f95c9f8cfa2d4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -482,7 +482,9 @@ class InMemoryCatalog( db: String, table: String, predicates: Seq[Expression]): Seq[CatalogTablePartition] = { - throw new UnsupportedOperationException("listPartitionsByFilter is not implemented.") + // TODO: Provide an implementation + throw new UnsupportedOperationException( + "listPartitionsByFilter is not implemented for InMemoryCatalog") } // -------------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 51326ca25e9cc..1a57a7707caa1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -20,11 +20,11 @@ package org.apache.spark.sql.catalyst.catalog import java.util.Date import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} -import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIdentifier} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.catalyst.util.quoteIdentifier -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{StructField, StructType} /** @@ -97,6 +97,15 @@ case class CatalogTablePartition( output.filter(_.nonEmpty).mkString("CatalogPartition(\n\t", "\n\t", ")") } + + /** + * Given the partition schema, returns a row with that schema holding the partition values. 
+ */ + def toRow(partitionSchema: StructType): InternalRow = { + InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) => + Cast(Literal(spec(name)), dataType).eval() + }) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 7ae3275245c5d..7dccbbd3f0a5b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.usePrettyExpression import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution} import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand, GlobalTempView, LocalTempView} -import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.json.JacksonGenerator import org.apache.spark.sql.execution.python.EvaluatePython import org.apache.spark.sql.streaming.{DataStreamWriter, StreamingQuery} @@ -2614,7 +2614,7 @@ class Dataset[T] private[sql]( * @since 2.0.0 */ def inputFiles: Array[String] = { - val files: Seq[String] = logicalPlan.collect { + val files: Seq[String] = queryExecution.optimizedPlan.collect { case LogicalRelation(fsBasedRelation: FileRelation, _, _) => fsBasedRelation.inputFiles case fr: FileRelation => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index 83b7c779ab818..92fd366e101fd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -185,7 +185,7 @@ class CacheManager extends Logging { plan match { case lr: LogicalRelation => lr.relation match { case hr: HadoopFsRelation => - val invalidate = hr.location.paths + val invalidate = hr.location.rootPaths .map(_.makeQualified(fs.getUri, fs.getWorkingDirectory)) .contains(qualifiedPath) if (invalidate) hr.location.refresh() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 6cdba406937de..623d2be55dcec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -225,13 +225,27 @@ case class FileSourceScanExec( } // These metadata values make scan plans uniquely identifiable for equality checking. 
- override val metadata: Map[String, String] = Map( - "Format" -> relation.fileFormat.toString, - "ReadSchema" -> outputSchema.catalogString, - "Batched" -> supportsBatch.toString, - "PartitionFilters" -> partitionFilters.mkString("[", ", ", "]"), - "PushedFilters" -> dataFilters.mkString("[", ", ", "]"), - "InputPaths" -> relation.location.paths.mkString(", ")) + override val metadata: Map[String, String] = { + def seqToString(seq: Seq[Any]) = seq.mkString("[", ", ", "]") + val location = relation.location + val locationDesc = + location.getClass.getSimpleName + seqToString(location.rootPaths) + val metadata = + Map( + "Format" -> relation.fileFormat.toString, + "ReadSchema" -> outputSchema.catalogString, + "Batched" -> supportsBatch.toString, + "PartitionFilters" -> seqToString(partitionFilters), + "PushedFilters" -> seqToString(dataFilters), + "Location" -> locationDesc) + val withOptPartitionCount = + relation.partitionSchemaOption.map { _ => + metadata + ("PartitionCount" -> selectedPartitions.size.toString) + } getOrElse { + metadata + } + withOptPartitionCount + } private lazy val inputRDD: RDD[InternalRow] = { val readFile: (PartitionedFile) => Iterator[InternalRow] = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 8b762b5d6c5f2..981728331d361 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.ExperimentalMethods import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.catalyst.optimizer.Optimizer +import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate import org.apache.spark.sql.internal.SQLConf @@ -32,5 +33,6 @@ class SparkOptimizer( override def batches: Seq[Batch] = super.batches :+ Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog, conf)) :+ Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+ + Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions) :+ Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index a04a13e698c43..a8c75a7f29cef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -67,7 +67,7 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo dataSource match { case fs: HadoopFsRelation => - if (table.tableType == CatalogTableType.EXTERNAL && fs.location.paths.isEmpty) { + if (table.tableType == CatalogTableType.EXTERNAL && fs.location.rootPaths.isEmpty) { throw new AnalysisException( "Cannot create a file-based external data source table without path") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index e75e7d2770b4e..92b1fff7d8127 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -471,9 +471,7 @@ case class DataSource( val existingPartitionColumns = Try { resolveRelation() .asInstanceOf[HadoopFsRelation] - .location - .partitionSpec() - .partitionColumns + .partitionSchema .fieldNames .toSeq }.getOrElse(Seq.empty[String]) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 6f9ed50a02b09..7d0abe86a44df 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -163,14 +163,14 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { if query.resolved && t.schema.asNullable == query.schema.asNullable => // Sanity checks - if (t.location.paths.size != 1) { + if (t.location.rootPaths.size != 1) { throw new AnalysisException( "Can only write data to relations with a single path.") } - val outputPath = t.location.paths.head + val outputPath = t.location.rootPaths.head val inputPaths = query.collect { - case LogicalRelation(r: HadoopFsRelation, _, _) => r.location.paths + case LogicalRelation(r: HadoopFsRelation, _, _) => r.location.rootPaths }.flatten val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append @@ -184,7 +184,7 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { query.resolve(t.partitionSchema, t.sparkSession.sessionState.analyzer.resolver), t.bucketSpec, t.fileFormat, - () => t.refresh(), + () => t.location.refresh(), t.options, query, mode) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala index bde2d2b89d56f..e7239ef91b326 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjectio import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType + /** * Used to read and write data stored in files to/from the [[InternalRow]] format. */ @@ -182,16 +183,17 @@ abstract class TextBasedFileFormat extends FileFormat { case class Partition(values: InternalRow, files: Seq[FileStatus]) /** - * An interface for objects capable of enumerating the files that comprise a relation as well - * as the partitioning characteristics of those files. + * An interface for objects capable of enumerating the root paths of a relation as well as the + * partitions of a relation subject to some pruning expressions. */ -trait FileCatalog { - - /** Returns the list of input paths from which the catalog will get files. */ - def paths: Seq[Path] +trait BasicFileCatalog { - /** Returns the specification of the partitions inferred from the data. */ - def partitionSpec(): PartitionSpec + /** + * Returns the list of root input paths from which the catalog will get files. There may be a + * single root path from which partitions are discovered, or individual partitions may be + * specified by each path. 
+ */ + def rootPaths: Seq[Path] /** * Returns all valid files grouped into partitions when the data is partitioned. If the data is @@ -204,9 +206,33 @@ trait FileCatalog { */ def listFiles(filters: Seq[Expression]): Seq[Partition] + /** Returns the list of files that will be read when scanning this relation. */ + def inputFiles: Array[String] + + /** Refresh any cached file listings */ + def refresh(): Unit + + /** Sum of table file sizes, in bytes */ + def sizeInBytes: Long +} + +/** + * A [[BasicFileCatalog]] which can enumerate all of the files comprising a relation and, from + * those, infer the relation's partition specification. + */ +// TODO: Consider a more descriptive, appropriate name which suggests this is a file catalog for +// which it is safe to list all of its files? +trait FileCatalog extends BasicFileCatalog { + + /** Returns the specification of the partitions inferred from the data. */ + def partitionSpec(): PartitionSpec + /** Returns all the valid files. */ def allFiles(): Seq[FileStatus] - /** Refresh the file listing */ - def refresh(): Unit + /** Returns the list of files that will be read when scanning this relation. */ + override def inputFiles: Array[String] = + allFiles().map(_.getPath.toUri.toString).toArray + + override def sizeInBytes: Long = allFiles().map(_.getLen).sum } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala index c7ebe0b76a150..db889edf032d6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala @@ -28,8 +28,8 @@ import org.apache.spark.sql.types.StructType * Acts as a container for all of the metadata required to read from a datasource. All discovery, * resolution and merging logic for schemas and partitions has been removed. * - * @param location A [[FileCatalog]] that can enumerate the locations of all the files that comprise - * this relation. + * @param location A [[BasicFileCatalog]] that can enumerate the locations of all the files that + * comprise this relation. * @param partitionSchema The schema of the columns (if any) that are used to partition the relation * @param dataSchema The schema of any remaining columns. Note that if any partition columns are * present in the actual data files as well, they are preserved. @@ -38,7 +38,7 @@ import org.apache.spark.sql.types.StructType * @param options Configuration used when reading / writing data. */ case class HadoopFsRelation( - location: FileCatalog, + location: BasicFileCatalog, partitionSchema: StructType, dataSchema: StructType, bucketSpec: Option[BucketSpec], @@ -58,10 +58,6 @@ case class HadoopFsRelation( def partitionSchemaOption: Option[StructType] = if (partitionSchema.isEmpty) None else Some(partitionSchema) - def partitionSpec: PartitionSpec = location.partitionSpec() - - def refresh(): Unit = location.refresh() - override def toString: String = { fileFormat match { case source: DataSourceRegister => source.shortName() @@ -69,9 +65,7 @@ case class HadoopFsRelation( } } - /** Returns the list of files that will be read when scanning this relation. 
*/ - override def inputFiles: Array[String] = - location.allFiles().map(_.getPath.toUri.toString).toArray + override def sizeInBytes: Long = location.sizeInBytes - override def sizeInBytes: Long = location.allFiles().map(_.getLen).sum + override def inputFiles: Array[String] = location.inputFiles } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala index a68ae523e0faa..6d10501b7265d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala @@ -17,32 +17,26 @@ package org.apache.spark.sql.execution.datasources -import java.io.FileNotFoundException - import scala.collection.mutable -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ -import org.apache.hadoop.mapred.{FileInputFormat, JobConf} -import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.StructType -import org.apache.spark.util.SerializableConfiguration /** * A [[FileCatalog]] that generates the list of files to process by recursively listing all the * files present in `paths`. * + * @param rootPaths the list of root table paths to scan * @param parameters as set of options to control discovery - * @param paths a list of paths to scan * @param partitionSchema an optional partition schema that will be use to provide types for the * discovered partitions */ class ListingFileCatalog( sparkSession: SparkSession, - override val paths: Seq[Path], + override val rootPaths: Seq[Path], parameters: Map[String, String], partitionSchema: Option[StructType]) extends PartitioningAwareFileCatalog(sparkSession, parameters, partitionSchema) { @@ -70,198 +64,17 @@ class ListingFileCatalog( } override def refresh(): Unit = { - val files = listLeafFiles(paths) + val files = listLeafFiles(rootPaths) cachedLeafFiles = new mutable.LinkedHashMap[Path, FileStatus]() ++= files.map(f => f.getPath -> f) cachedLeafDirToChildrenFiles = files.toArray.groupBy(_.getPath.getParent) cachedPartitionSpec = null } - /** - * List leaf files of given paths. This method will submit a Spark job to do parallel - * listing whenever there is a path having more files than the parallel partition discovery - * discovery threshold. - * - * This is publicly visible for testing. - */ - def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { - val files = - if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { - ListingFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession) - } else { - ListingFileCatalog.listLeafFilesInSerial(paths, hadoopConf) - } - - mutable.LinkedHashSet(files: _*) - } - override def equals(other: Any): Boolean = other match { - case hdfs: ListingFileCatalog => paths.toSet == hdfs.paths.toSet + case hdfs: ListingFileCatalog => rootPaths.toSet == hdfs.rootPaths.toSet case _ => false } - override def hashCode(): Int = paths.toSet.hashCode() -} - - -object ListingFileCatalog extends Logging { - - /** A serializable variant of HDFS's BlockLocation. */ - private case class SerializableBlockLocation( - names: Array[String], - hosts: Array[String], - offset: Long, - length: Long) - - /** A serializable variant of HDFS's FileStatus. 
*/ - private case class SerializableFileStatus( - path: String, - length: Long, - isDir: Boolean, - blockReplication: Short, - blockSize: Long, - modificationTime: Long, - accessTime: Long, - blockLocations: Array[SerializableBlockLocation]) - - /** - * List a collection of path recursively. - */ - private def listLeafFilesInSerial( - paths: Seq[Path], - hadoopConf: Configuration): Seq[FileStatus] = { - // Dummy jobconf to get to the pathFilter defined in configuration - val jobConf = new JobConf(hadoopConf, this.getClass) - val filter = FileInputFormat.getInputPathFilter(jobConf) - - paths.flatMap { path => - val fs = path.getFileSystem(hadoopConf) - listLeafFiles0(fs, path, filter) - } - } - - /** - * List a collection of path recursively in parallel (using Spark executors). - * Each task launched will use [[listLeafFilesInSerial]] to list. - */ - private def listLeafFilesInParallel( - paths: Seq[Path], - hadoopConf: Configuration, - sparkSession: SparkSession): Seq[FileStatus] = { - assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) - logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") - - val sparkContext = sparkSession.sparkContext - val serializableConfiguration = new SerializableConfiguration(hadoopConf) - val serializedPaths = paths.map(_.toString) - - // Set the number of parallelism to prevent following file listing from generating many tasks - // in case of large #defaultParallelism. - val numParallelism = Math.min(paths.size, 10000) - - val statuses = sparkContext - .parallelize(serializedPaths, numParallelism) - .mapPartitions { paths => - val hadoopConf = serializableConfiguration.value - listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator - }.map { status => - // Turn FileStatus into SerializableFileStatus so we can send it back to the driver - val blockLocations = status match { - case f: LocatedFileStatus => - f.getBlockLocations.map { loc => - SerializableBlockLocation( - loc.getNames, - loc.getHosts, - loc.getOffset, - loc.getLength) - } - - case _ => - Array.empty[SerializableBlockLocation] - } - - SerializableFileStatus( - status.getPath.toString, - status.getLen, - status.isDirectory, - status.getReplication, - status.getBlockSize, - status.getModificationTime, - status.getAccessTime, - blockLocations) - }.collect() - - // Turn SerializableFileStatus back to Status - statuses.map { f => - val blockLocations = f.blockLocations.map { loc => - new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) - } - new LocatedFileStatus( - new FileStatus( - f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)), - blockLocations) - } - } - - /** - * List a single path, provided as a FileStatus, in serial. - */ - private def listLeafFiles0( - fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = { - logTrace(s"Listing $path") - val name = path.getName.toLowerCase - if (shouldFilterOut(name)) { - Seq.empty[FileStatus] - } else { - // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist - // Note that statuses only include FileStatus for the files and dirs directly under path, - // and does not include anything else recursively. - val statuses = try fs.listStatus(path) catch { - case _: FileNotFoundException => - logWarning(s"The directory $path was not found. 
Was it deleted very recently?") - Array.empty[FileStatus] - } - - val allLeafStatuses = { - val (dirs, files) = statuses.partition(_.isDirectory) - val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter)) - if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats - } - - allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map { - case f: LocatedFileStatus => - f - - // NOTE: - // - // - Although S3/S3A/S3N file system can be quite slow for remote file metadata - // operations, calling `getFileBlockLocations` does no harm here since these file system - // implementations don't actually issue RPC for this method. - // - // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not - // be a big deal since we always use to `listLeafFilesInParallel` when the number of - // paths exceeds threshold. - case f => - // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), - // which is very slow on some file system (RawLocalFileSystem, which is launch a - // subprocess and parse the stdout). - val locations = fs.getFileBlockLocations(f, 0, f.getLen) - val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, - f.getModificationTime, 0, null, null, null, null, f.getPath, locations) - if (f.isSymlink) { - lfs.setSymlink(f.getSymlink) - } - lfs - } - } - } - - /** Checks if we should filter out this path name. */ - def shouldFilterOut(pathName: String): Boolean = { - // We filter everything that starts with _ and ., except _common_metadata and _metadata - // because Parquet needs to find those metadata files from leaf files returned by this method. - // We should refactor this logic to not mix metadata files with data files. - ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && - !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") - } + override def hashCode(): Int = rootPaths.toSet.hashCode() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala index d9562fd32e87d..7c28d48f26416 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala @@ -94,7 +94,7 @@ case class LogicalRelation( } override def refresh(): Unit = relation match { - case fs: HadoopFsRelation => fs.refresh() + case fs: HadoopFsRelation => fs.location.refresh() case _ => // Do nothing. 
} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala index 702ba97222e34..b2508115c282f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala @@ -21,7 +21,6 @@ import scala.collection.mutable import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions._ @@ -40,9 +39,10 @@ abstract class PartitioningAwareFileCatalog( sparkSession: SparkSession, parameters: Map[String, String], partitionSchema: Option[StructType]) - extends FileCatalog with Logging { + extends SessionFileCatalog(sparkSession) with FileCatalog { + import PartitioningAwareFileCatalog.BASE_PATH_PARAM - protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters) + override protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters) protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus] @@ -72,8 +72,8 @@ abstract class PartitioningAwareFileCatalog( override def allFiles(): Seq[FileStatus] = { if (partitionSpec().partitionColumns.isEmpty) { - // For each of the input paths, get the list of files inside them - paths.flatMap { path => + // For each of the root input paths, get the list of files inside them + rootPaths.flatMap { path => // Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel). val fs = path.getFileSystem(hadoopConf) val qualifiedPathPre = fs.makeQualified(path) @@ -105,8 +105,6 @@ abstract class PartitioningAwareFileCatalog( protected def inferPartitioning(): PartitionSpec = { // We use leaf dirs containing data files to discover the schema. val leafDirs = leafDirToChildrenFiles.filter { case (_, files) => - // SPARK-15895: Metadata files (e.g. Parquet summary files) and temporary files should not be - // counted as data files, so that they shouldn't participate partition discovery. files.exists(f => isDataPath(f.getPath)) }.keys.toSeq partitionSchema match { @@ -194,24 +192,30 @@ abstract class PartitioningAwareFileCatalog( * and the returned DataFrame will have the column of `something`. */ private def basePaths: Set[Path] = { - parameters.get("basePath").map(new Path(_)) match { + parameters.get(BASE_PATH_PARAM).map(new Path(_)) match { case Some(userDefinedBasePath) => val fs = userDefinedBasePath.getFileSystem(hadoopConf) if (!fs.isDirectory(userDefinedBasePath)) { - throw new IllegalArgumentException("Option 'basePath' must be a directory") + throw new IllegalArgumentException(s"Option '$BASE_PATH_PARAM' must be a directory") } Set(fs.makeQualified(userDefinedBasePath)) case None => - paths.map { path => + rootPaths.map { path => // Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel). val qualifiedPath = path.getFileSystem(hadoopConf).makeQualified(path) if (leafFiles.contains(qualifiedPath)) qualifiedPath.getParent else qualifiedPath }.toSet } } + // SPARK-15895: Metadata files (e.g. Parquet summary files) and temporary files should not be + // counted as data files, so that they shouldn't participate partition discovery. 
private def isDataPath(path: Path): Boolean = { val name = path.getName !((name.startsWith("_") && !name.contains("=")) || name.startsWith(".")) } } + +object PartitioningAwareFileCatalog { + val BASE_PATH_PARAM = "basePath" +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala new file mode 100644 index 0000000000000..29121a47d92d1 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.planning.PhysicalOperation +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project} +import org.apache.spark.sql.catalyst.rules.Rule + +private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown { + case op @ PhysicalOperation(projects, filters, + logicalRelation @ + LogicalRelation(fsRelation @ + HadoopFsRelation( + tableFileCatalog: TableFileCatalog, + partitionSchema, + _, + _, + _, + _), + _, + _)) + if filters.nonEmpty && fsRelation.partitionSchemaOption.isDefined => + // The attribute name of predicate could be different than the one in schema in case of + // case insensitive, we should change them to match the one in schema, so we donot need to + // worry about case sensitivity anymore. 
+ val normalizedFilters = filters.map { e => + e transform { + case a: AttributeReference => + a.withName(logicalRelation.output.find(_.semanticEquals(a)).get.name) + } + } + + val sparkSession = fsRelation.sparkSession + val partitionColumns = + logicalRelation.resolve( + partitionSchema, sparkSession.sessionState.analyzer.resolver) + val partitionSet = AttributeSet(partitionColumns) + val partitionKeyFilters = + ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionSet))) + + if (partitionKeyFilters.nonEmpty) { + val prunedFileCatalog = tableFileCatalog.filterPartitions(partitionKeyFilters.toSeq) + val prunedFsRelation = + fsRelation.copy(location = prunedFileCatalog)(sparkSession) + val prunedLogicalRelation = logicalRelation.copy(relation = prunedFsRelation) + + // Keep partition-pruning predicates so that they are visible in physical planning + val filterExpression = filters.reduceLeft(And) + val filter = Filter(filterExpression, prunedLogicalRelation) + Project(projects, filter) + } else { + op + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala new file mode 100644 index 0000000000000..4807a92c2e6b8 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala @@ -0,0 +1,225 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.io.FileNotFoundException + +import scala.collection.mutable + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs._ +import org.apache.hadoop.mapred.{FileInputFormat, JobConf} + +import org.apache.spark.internal.Logging +import org.apache.spark.metrics.source.HiveCatalogMetrics +import org.apache.spark.sql.SparkSession +import org.apache.spark.util.SerializableConfiguration + + +/** + * A base class for [[BasicFileCatalog]]s that need a [[SparkSession]] and the ability to find leaf + * files in a list of HDFS paths. + * + * @param sparkSession a [[SparkSession]] + * @param ignoreFileNotFound (see [[ListingFileCatalog]]) + */ +abstract class SessionFileCatalog(sparkSession: SparkSession) + extends BasicFileCatalog with Logging { + protected val hadoopConf: Configuration + + /** + * List leaf files of given paths. This method will submit a Spark job to do parallel + * listing whenever there is a path having more files than the parallel partition discovery + * discovery threshold. + * + * This is publicly visible for testing. 
+ */ + def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + val files = + if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { + SessionFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession) + } else { + SessionFileCatalog.listLeafFilesInSerial(paths, hadoopConf) + } + + HiveCatalogMetrics.incrementFilesDiscovered(files.size) + mutable.LinkedHashSet(files: _*) + } +} + +object SessionFileCatalog extends Logging { + + /** A serializable variant of HDFS's BlockLocation. */ + private case class SerializableBlockLocation( + names: Array[String], + hosts: Array[String], + offset: Long, + length: Long) + + /** A serializable variant of HDFS's FileStatus. */ + private case class SerializableFileStatus( + path: String, + length: Long, + isDir: Boolean, + blockReplication: Short, + blockSize: Long, + modificationTime: Long, + accessTime: Long, + blockLocations: Array[SerializableBlockLocation]) + + /** + * List a collection of path recursively. + */ + private def listLeafFilesInSerial( + paths: Seq[Path], + hadoopConf: Configuration): Seq[FileStatus] = { + // Dummy jobconf to get to the pathFilter defined in configuration + val jobConf = new JobConf(hadoopConf, this.getClass) + val filter = FileInputFormat.getInputPathFilter(jobConf) + + paths.flatMap { path => + val fs = path.getFileSystem(hadoopConf) + listLeafFiles0(fs, path, filter) + } + } + + /** + * List a collection of path recursively in parallel (using Spark executors). + * Each task launched will use [[listLeafFilesInSerial]] to list. + */ + private def listLeafFilesInParallel( + paths: Seq[Path], + hadoopConf: Configuration, + sparkSession: SparkSession): Seq[FileStatus] = { + assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) + logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") + + val sparkContext = sparkSession.sparkContext + val serializableConfiguration = new SerializableConfiguration(hadoopConf) + val serializedPaths = paths.map(_.toString) + + // Set the number of parallelism to prevent following file listing from generating many tasks + // in case of large #defaultParallelism. + val numParallelism = Math.min(paths.size, 10000) + + val statuses = sparkContext + .parallelize(serializedPaths, numParallelism) + .mapPartitions { paths => + val hadoopConf = serializableConfiguration.value + listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator + }.map { status => + // Turn FileStatus into SerializableFileStatus so we can send it back to the driver + val blockLocations = status match { + case f: LocatedFileStatus => + f.getBlockLocations.map { loc => + SerializableBlockLocation( + loc.getNames, + loc.getHosts, + loc.getOffset, + loc.getLength) + } + + case _ => + Array.empty[SerializableBlockLocation] + } + + SerializableFileStatus( + status.getPath.toString, + status.getLen, + status.isDirectory, + status.getReplication, + status.getBlockSize, + status.getModificationTime, + status.getAccessTime, + blockLocations) + }.collect() + + // Turn SerializableFileStatus back to Status + statuses.map { f => + val blockLocations = f.blockLocations.map { loc => + new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) + } + new LocatedFileStatus( + new FileStatus( + f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)), + blockLocations) + } + } + + /** + * List a single path, provided as a FileStatus, in serial. 
+ */ + private def listLeafFiles0( + fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = { + logTrace(s"Listing $path") + val name = path.getName.toLowerCase + if (shouldFilterOut(name)) { + Seq.empty[FileStatus] + } else { + // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist + // Note that statuses only include FileStatus for the files and dirs directly under path, + // and does not include anything else recursively. + val statuses = try fs.listStatus(path) catch { + case _: FileNotFoundException => + logWarning(s"The directory $path was not found. Was it deleted very recently?") + Array.empty[FileStatus] + } + + val allLeafStatuses = { + val (dirs, files) = statuses.partition(_.isDirectory) + val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter)) + if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats + } + + allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map { + case f: LocatedFileStatus => + f + + // NOTE: + // + // - Although S3/S3A/S3N file system can be quite slow for remote file metadata + // operations, calling `getFileBlockLocations` does no harm here since these file system + // implementations don't actually issue RPC for this method. + // + // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not + // be a big deal since we always use to `listLeafFilesInParallel` when the number of + // paths exceeds threshold. + case f => + // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), + // which is very slow on some file system (RawLocalFileSystem, which is launch a + // subprocess and parse the stdout). + val locations = fs.getFileBlockLocations(f, 0, f.getLen) + val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, + f.getModificationTime, 0, null, null, null, null, f.getPath, locations) + if (f.isSymlink) { + lfs.setSymlink(f.getSymlink) + } + lfs + } + } + } + + /** Checks if we should filter out this path name. */ + def shouldFilterOut(pathName: String): Boolean = { + // We filter everything that starts with _ and ., except _common_metadata and _metadata + // because Parquet needs to find those metadata files from leaf files returned by this method. + // We should refactor this logic to not mix metadata files with data files. + ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && + !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala new file mode 100644 index 0000000000000..a5c41b244589b --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.hadoop.fs.Path + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types.StructType + + +/** + * A [[BasicFileCatalog]] for a metastore catalog table. + * + * @param sparkSession a [[SparkSession]] + * @param db the table's database name + * @param table the table's (unqualified) name + * @param partitionSchema the schema of a partitioned table's partition columns + * @param sizeInBytes the table's data size in bytes + */ +class TableFileCatalog( + sparkSession: SparkSession, + db: String, + table: String, + partitionSchema: Option[StructType], + override val sizeInBytes: Long) + extends SessionFileCatalog(sparkSession) { + + override protected val hadoopConf = sparkSession.sessionState.newHadoopConf + + private val externalCatalog = sparkSession.sharedState.externalCatalog + + private val catalogTable = externalCatalog.getTable(db, table) + + private val baseLocation = catalogTable.storage.locationUri + + override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq + + override def listFiles(filters: Seq[Expression]): Seq[Partition] = { + filterPartitions(filters).listFiles(Nil) + } + + override def refresh(): Unit = {} + + /** + * Returns a [[ListingFileCatalog]] for this table restricted to the subset of partitions + * specified by the given partition-pruning filters. + * + * @param filters partition-pruning filters + */ + def filterPartitions(filters: Seq[Expression]): ListingFileCatalog = { + if (filters.isEmpty) { + cachedAllPartitions + } else { + filterPartitions0(filters) + } + } + + private def filterPartitions0(filters: Seq[Expression]): ListingFileCatalog = { + val parameters = baseLocation + .map(loc => Map(PartitioningAwareFileCatalog.BASE_PATH_PARAM -> loc)) + .getOrElse(Map.empty) + partitionSchema match { + case Some(schema) => + val selectedPartitions = externalCatalog.listPartitionsByFilter(db, table, filters) + val partitions = selectedPartitions.map { p => + PartitionDirectory(p.toRow(schema), p.storage.locationUri.get) + } + val partitionSpec = PartitionSpec(schema, partitions) + new PrunedTableFileCatalog( + sparkSession, new Path(baseLocation.get), partitionSpec) + case None => + new ListingFileCatalog(sparkSession, rootPaths, parameters, None) + } + } + + // Not used in the hot path of queries when metastore partition pruning is enabled + lazy val cachedAllPartitions: ListingFileCatalog = filterPartitions0(Nil) + + override def inputFiles: Array[String] = cachedAllPartitions.inputFiles +} + +/** + * An override of the standard HDFS listing based catalog, that overrides the partition spec with + * the information from the metastore. 
+ * + * @param tableBasePath The default base path of the Hive metastore table + * @param partitionSpec The partition specifications from Hive metastore + */ +private class PrunedTableFileCatalog( + sparkSession: SparkSession, + tableBasePath: Path, + override val partitionSpec: PartitionSpec) + extends ListingFileCatalog( + sparkSession, + partitionSpec.partitions.map(_.path), + Map.empty, + Some(partitionSpec.partitionColumns)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index f1a35dd8a6200..4dea8cf29ec58 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -269,11 +269,15 @@ private[parquet] object ParquetReadSupport { */ private def clipParquetGroupFields( parquetRecord: GroupType, structType: StructType): Seq[Type] = { - val parquetFieldMap = parquetRecord.getFields.asScala.map(f => f.getName -> f).toMap + val parquetFieldMap = parquetRecord.getFields.asScala + .map(f => f.getName -> f).toMap + val caseInsensitiveParquetFieldMap = parquetRecord.getFields.asScala + .map(f => f.getName.toLowerCase -> f).toMap val toParquet = new ParquetSchemaConverter(writeLegacyParquetFormat = false) structType.map { f => parquetFieldMap .get(f.name) + .orElse(caseInsensitiveParquetFieldMap.get(f.name.toLowerCase)) .map(clipParquetType(_, f.dataType)) .getOrElse(toParquet.convertField(f)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala index a32c4671e3475..82b67cb1ca6ee 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala @@ -47,7 +47,7 @@ class MetadataLogFileCatalog(sparkSession: SparkSession, path: Path) allFilesFromLog.toArray.groupBy(_.getPath.getParent) } - override def paths: Seq[Path] = path :: Nil + override def rootPaths: Seq[Path] = path :: Nil override def refresh(): Unit = { } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index c8447651dd672..e73d0187b584b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -269,6 +269,13 @@ object SQLConf { .booleanConf .createWithDefault(false) + val HIVE_FILESOURCE_PARTITION_PRUNING = + SQLConfigBuilder("spark.sql.hive.filesourcePartitionPruning") + .doc("When true, enable metastore partition pruning for file source tables as well. " + + "This is currently implemented for converted Hive tables only.") + .booleanConf + .createWithDefault(true) + val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly") .doc("When true, enable the metadata-only query optimization that use the table's metadata " + "to produce the partition columns instead of table scans. 
It applies when all the columns " + @@ -676,6 +683,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING) + def filesourcePartitionPruning: Boolean = getConf(HIVE_FILESOURCE_PARTITION_PRUNING) + def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT) def optimizerMetadataOnly: Boolean = getConf(OPTIMIZER_METADATA_ONLY) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala index fa3abd0098f5b..2695974b84b00 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala @@ -77,13 +77,14 @@ class FileCatalogSuite extends SharedSQLContext { val catalog1 = new ListingFileCatalog( spark, Seq(new Path(deletedFolder.getCanonicalPath)), Map.empty, None) // doesn't throw an exception - assert(catalog1.listLeafFiles(catalog1.paths).isEmpty) + assert(catalog1.listLeafFiles(catalog1.rootPaths).isEmpty) } } test("SPARK-17613 - PartitioningAwareFileCatalog: base path w/o '/' at end") { class MockCatalog( - override val paths: Seq[Path]) extends PartitioningAwareFileCatalog(spark, Map.empty, None) { + override val rootPaths: Seq[Path]) + extends PartitioningAwareFileCatalog(spark, Map.empty, None) { override def refresh(): Unit = {} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala index c5deb31fec183..c32254d9dfde2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala @@ -395,7 +395,7 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi val fileCatalog = new ListingFileCatalog( sparkSession = spark, - paths = Seq(new Path(tempDir)), + rootPaths = Seq(new Path(tempDir)), parameters = Map.empty[String, String], partitionSchema = None) // This should not fail. 
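A usage note on the `spark.sql.hive.filesourcePartitionPruning` flag added to `SQLConf` above: it defaults to true, and because it is a regular (non-static) SQL conf it can be toggled per session to compare the lazy and eager behaviors. The snippet below is an illustrative sketch, not part of the patch; `spark` is assumed to be an existing Hive-enabled `SparkSession`.

```scala
// Illustrative only: disable the new lazy behavior to fall back to eager
// partition metadata loading, then re-enable it.
spark.conf.set("spark.sql.hive.filesourcePartitionPruning", "false")
// ... run a partition-filtered query and compare planning time / catalog metrics ...
spark.conf.set("spark.sql.hive.filesourcePartitionPruning", "true")
```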
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala similarity index 66% rename from sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala index f15730aeb11f2..df509583377ae 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala @@ -19,16 +19,16 @@ package org.apache.spark.sql.execution.datasources import org.apache.spark.SparkFunSuite -class ListingFileCatalogSuite extends SparkFunSuite { +class SessionFileCatalogSuite extends SparkFunSuite { test("file filtering") { - assert(!ListingFileCatalog.shouldFilterOut("abcd")) - assert(ListingFileCatalog.shouldFilterOut(".ab")) - assert(ListingFileCatalog.shouldFilterOut("_cd")) + assert(!SessionFileCatalog.shouldFilterOut("abcd")) + assert(SessionFileCatalog.shouldFilterOut(".ab")) + assert(SessionFileCatalog.shouldFilterOut("_cd")) - assert(!ListingFileCatalog.shouldFilterOut("_metadata")) - assert(!ListingFileCatalog.shouldFilterOut("_common_metadata")) - assert(ListingFileCatalog.shouldFilterOut("_ab_metadata")) - assert(ListingFileCatalog.shouldFilterOut("_cd_common_metadata")) + assert(!SessionFileCatalog.shouldFilterOut("_metadata")) + assert(!SessionFileCatalog.shouldFilterOut("_common_metadata")) + assert(SessionFileCatalog.shouldFilterOut("_ab_metadata")) + assert(SessionFileCatalog.shouldFilterOut("_cd_common_metadata")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 8d18be9300f7e..43357c97c395a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -30,7 +30,7 @@ import org.apache.parquet.hadoop.ParquetOutputFormat import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Literal -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PartitionDirectory => Partition, PartitioningUtils, PartitionSpec} +import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation, PartitionDirectory => Partition, PartitioningUtils, PartitionSpec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -626,8 +626,8 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath) val queryExecution = spark.read.parquet(dir.getCanonicalPath).queryExecution queryExecution.analyzed.collectFirst { - case LogicalRelation(relation: HadoopFsRelation, _, _) => - assert(relation.partitionSpec === PartitionSpec.emptySpec) + case LogicalRelation(HadoopFsRelation(location: FileCatalog, _, _, _, _, _), _, _) => + assert(location.partitionSpec === PartitionSpec.emptySpec) }.getOrElse { fail(s"Expecting a 
ParquetRelation2, but got:\n$queryExecution") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 8a980a7eb538f..c3d202ced24c8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -1080,6 +1080,34 @@ class ParquetSchemaSuite extends ParquetSchemaTest { } } + testSchemaClipping( + "falls back to case insensitive resolution", + + parquetSchema = + """message root { + | required group A { + | optional int32 B; + | } + | optional int32 c; + |} + """.stripMargin, + + catalystSchema = { + val nestedType = new StructType().add("b", IntegerType, nullable = true) + new StructType() + .add("a", nestedType, nullable = true) + .add("c", IntegerType, nullable = true) + }, + + expectedSchema = + """message root { + | required group A { + | optional int32 B; + | } + | optional int32 c; + |} + """.stripMargin) + testSchemaClipping( "simple nested struct", diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index b5d93c3d7c804..ff59b54f53909 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -29,17 +29,17 @@ import org.apache.thrift.TException import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException import org.apache.spark.sql.catalyst.catalog._ -import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Statistics} import org.apache.spark.sql.execution.command.{ColumnStatStruct, DDLUtils} import org.apache.spark.sql.execution.datasources.CaseInsensitiveMap import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.sql.internal.StaticSQLConf._ -import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.sql.types.{DataType, StructField, StructType} /** @@ -650,8 +650,35 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat override def listPartitionsByFilter( db: String, table: String, - predicates: Seq[Expression]): Seq[CatalogTablePartition] = { - client.getPartitionsByFilter(db, table, predicates) + predicates: Seq[Expression]): Seq[CatalogTablePartition] = withClient { + val catalogTable = client.getTable(db, table) + val partitionColumnNames = catalogTable.partitionColumnNames.toSet + val nonPartitionPruningPredicates = predicates.filterNot { + _.references.map(_.name).toSet.subsetOf(partitionColumnNames) + } + + if (nonPartitionPruningPredicates.nonEmpty) { + sys.error("Expected only partition pruning predicates: " + + predicates.reduceLeft(And)) + } + + val partitionSchema = catalogTable.partitionSchema + + if (predicates.nonEmpty) { + val clientPrunedPartitions = + client.getPartitionsByFilter(catalogTable, predicates) + val boundPredicate = + 
InterpretedPredicate.create(predicates.reduce(And).transform { + case att: AttributeReference => + val index = partitionSchema.indexWhere(_.name == att.name) + BoundReference(index, partitionSchema(index).dataType, nullable = true) + }) + clientPrunedPartitions.filter { case p: CatalogTablePartition => + boundPredicate(p.toRow(partitionSchema)) + } + } else { + client.getPartitions(catalogTable) + } } // -------------------------------------------------------------------------- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index c44f0adda44c0..4a2aaa7d4f6ca 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -135,12 +135,12 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log private def getCached( tableIdentifier: QualifiedTableName, - pathsInMetastore: Seq[String], + pathsInMetastore: Seq[Path], metastoreRelation: MetastoreRelation, schemaInMetastore: StructType, expectedFileFormat: Class[_ <: FileFormat], expectedBucketSpec: Option[BucketSpec], - partitionSpecInMetastore: Option[PartitionSpec]): Option[LogicalRelation] = { + partitionSchema: Option[StructType]): Option[LogicalRelation] = { cachedDataSourceTables.getIfPresent(tableIdentifier) match { case null => None // Cache miss @@ -152,12 +152,10 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log // If we have the same paths, same schema, and same partition spec, // we will use the cached relation. val useCached = - relation.location.paths.map(_.toString).toSet == pathsInMetastore.toSet && + relation.location.rootPaths.toSet == pathsInMetastore.toSet && logical.schema.sameType(schemaInMetastore) && relation.bucketSpec == expectedBucketSpec && - relation.partitionSpec == partitionSpecInMetastore.getOrElse { - PartitionSpec(StructType(Nil), Array.empty[PartitionDirectory]) - } + relation.partitionSchema == partitionSchema.getOrElse(StructType(Nil)) if (useCached) { Some(logical) @@ -196,61 +194,59 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName) val bucketSpec = None // We don't support hive bucketed tables, only ones we write out. + val lazyPruningEnabled = sparkSession.sqlContext.conf.filesourcePartitionPruning val result = if (metastoreRelation.hiveQlTable.isPartitioned) { val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys) - val partitionColumnDataTypes = partitionSchema.map(_.dataType) - // We're converting the entire table into HadoopFsRelation, so predicates to Hive metastore - // are empty. - val partitions = metastoreRelation.getHiveQlPartitions().map { p => - val location = p.getLocation - val values = InternalRow.fromSeq(p.getValues.asScala.zip(partitionColumnDataTypes).map { - case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) - }) - PartitionDirectory(values, location) - } - val partitionSpec = PartitionSpec(partitionSchema, partitions) - val partitionPaths = partitions.map(_.path.toString) - - // By convention (for example, see MetaStorePartitionedTableFileCatalog), the definition of a - // partitioned table's paths depends on whether that table has any actual partitions. 
- // Partitioned tables without partitions use the location of the table's base path. - // Partitioned tables with partitions use the locations of those partitions' data locations, - // _omitting_ the table's base path. - val paths = if (partitionPaths.isEmpty) { - Seq(metastoreRelation.hiveQlTable.getDataLocation.toString) + + val rootPaths: Seq[Path] = if (lazyPruningEnabled) { + Seq(metastoreRelation.hiveQlTable.getDataLocation) } else { - partitionPaths + // By convention (for example, see TableFileCatalog), the definition of a + // partitioned table's paths depends on whether that table has any actual partitions. + // Partitioned tables without partitions use the location of the table's base path. + // Partitioned tables with partitions use the locations of those partitions' data + // locations,_omitting_ the table's base path. + val paths = metastoreRelation.getHiveQlPartitions().map { p => + new Path(p.getLocation) + } + if (paths.isEmpty) { + Seq(metastoreRelation.hiveQlTable.getDataLocation) + } else { + paths + } } val cached = getCached( tableIdentifier, - paths, + rootPaths, metastoreRelation, metastoreSchema, fileFormatClass, bucketSpec, - Some(partitionSpec)) - - val hadoopFsRelation = cached.getOrElse { - val fileCatalog = new MetaStorePartitionedTableFileCatalog( - sparkSession, - new Path(metastoreRelation.catalogTable.storage.locationUri.get), - partitionSpec) - - val inferredSchema = if (fileType.equals("parquet")) { - val inferredSchema = - defaultSource.inferSchema(sparkSession, options, fileCatalog.allFiles()) - inferredSchema.map { inferred => - ParquetFileFormat.mergeMetastoreParquetSchema(metastoreSchema, inferred) - }.getOrElse(metastoreSchema) - } else { - defaultSource.inferSchema(sparkSession, options, fileCatalog.allFiles()).get + Some(partitionSchema)) + + val logicalRelation = cached.getOrElse { + val db = metastoreRelation.databaseName + val table = metastoreRelation.tableName + val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong + val fileCatalog = { + val catalog = new TableFileCatalog( + sparkSession, db, table, Some(partitionSchema), sizeInBytes) + if (lazyPruningEnabled) { + catalog + } else { + catalog.cachedAllPartitions + } } + val partitionSchemaColumnNames = partitionSchema.map(_.name.toLowerCase).toSet + val dataSchema = + StructType(metastoreSchema + .filterNot(field => partitionSchemaColumnNames.contains(field.name.toLowerCase))) val relation = HadoopFsRelation( location = fileCatalog, partitionSchema = partitionSchema, - dataSchema = inferredSchema, + dataSchema = dataSchema, bucketSpec = bucketSpec, fileFormat = defaultSource, options = options)(sparkSession = sparkSession) @@ -260,12 +256,12 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log created } - hadoopFsRelation + logicalRelation } else { - val paths = Seq(metastoreRelation.hiveQlTable.getDataLocation.toString) + val rootPath = metastoreRelation.hiveQlTable.getDataLocation val cached = getCached(tableIdentifier, - paths, + Seq(rootPath), metastoreRelation, metastoreSchema, fileFormatClass, @@ -276,14 +272,13 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log LogicalRelation( DataSource( sparkSession = sparkSession, - paths = paths, + paths = rootPath.toString :: Nil, userSpecifiedSchema = Some(metastoreRelation.schema), bucketSpec = bucketSpec, options = options, className = fileType).resolveRelation(), catalogTable = Some(metastoreRelation.catalogTable)) - cachedDataSourceTables.put(tableIdentifier, 
created) created } @@ -371,34 +366,3 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log } } } - -/** - * An override of the standard HDFS listing based catalog, that overrides the partition spec with - * the information from the metastore. - * - * @param tableBasePath The default base path of the Hive metastore table - * @param partitionSpec The partition specifications from Hive metastore - */ -private[hive] class MetaStorePartitionedTableFileCatalog( - sparkSession: SparkSession, - tableBasePath: Path, - override val partitionSpec: PartitionSpec) - extends ListingFileCatalog( - sparkSession, - MetaStorePartitionedTableFileCatalog.getPaths(tableBasePath, partitionSpec), - Map.empty, - Some(partitionSpec.partitionColumns)) { -} - -private[hive] object MetaStorePartitionedTableFileCatalog { - /** Get the list of paths to list files in the for a metastore table */ - def getPaths(tableBasePath: Path, partitionSpec: PartitionSpec): Seq[Path] = { - // If there are no partitions currently specified then use base path, - // otherwise use the paths corresponding to the partitions. - if (partitionSpec.partitions.isEmpty) { - Seq(tableBasePath) - } else { - partitionSpec.partitions.map(_.path) - } - } -} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala index 9ee3d629c9977..569a9c11398ea 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala @@ -172,15 +172,24 @@ private[hive] trait HiveClient { * Returns the partitions for the given table that match the supplied partition spec. * If no partition spec is specified, all partitions are returned. */ - def getPartitions( + final def getPartitions( db: String, table: String, + partialSpec: Option[TablePartitionSpec]): Seq[CatalogTablePartition] = { + getPartitions(getTable(db, table), partialSpec) + } + + /** + * Returns the partitions for the given table that match the supplied partition spec. + * If no partition spec is specified, all partitions are returned. + */ + def getPartitions( + catalogTable: CatalogTable, partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] /** Returns partitions filtered by predicates for the given table. */ def getPartitionsByFilter( - db: String, - table: String, + catalogTable: CatalogTable, predicates: Seq[Expression]): Seq[CatalogTablePartition] /** Loads a static partition into an existing table. */ diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 5c8f7ff1af9fa..e745a8c5b3589 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -37,6 +37,7 @@ import org.apache.hadoop.security.UserGroupInformation import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.Logging +import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchPartitionException} @@ -525,22 +526,24 @@ private[hive] class HiveClientImpl( * If no partition spec is specified, all partitions are returned. 
*/ override def getPartitions( - db: String, - table: String, + table: CatalogTable, spec: Option[TablePartitionSpec]): Seq[CatalogTablePartition] = withHiveState { - val hiveTable = toHiveTable(getTable(db, table)) - spec match { + val hiveTable = toHiveTable(table) + val parts = spec match { case None => shim.getAllPartitions(client, hiveTable).map(fromHivePartition) case Some(s) => client.getPartitions(hiveTable, s.asJava).asScala.map(fromHivePartition) } + HiveCatalogMetrics.incrementFetchedPartitions(parts.length) + parts } override def getPartitionsByFilter( - db: String, - table: String, + table: CatalogTable, predicates: Seq[Expression]): Seq[CatalogTablePartition] = withHiveState { - val hiveTable = toHiveTable(getTable(db, table)) - shim.getPartitionsByFilter(client, hiveTable, predicates).map(fromHivePartition) + val hiveTable = toHiveTable(table) + val parts = shim.getPartitionsByFilter(client, hiveTable, predicates).map(fromHivePartition) + HiveCatalogMetrics.incrementFetchedPartitions(parts.length) + parts } override def listTables(dbName: String): Seq[String] = withHiveState { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index e94f49ea81177..1af3280e18a89 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -313,7 +313,17 @@ private[orc] object OrcRelation extends HiveInspectors { def setRequiredColumns( conf: Configuration, physicalSchema: StructType, requestedSchema: StructType): Unit = { - val ids = requestedSchema.map(a => physicalSchema.fieldIndex(a.name): Integer) + val caseInsensitiveFieldMap: Map[String, Int] = physicalSchema.fieldNames + .zipWithIndex + .map(f => (f._1.toLowerCase, f._2)) + .toMap + val ids = requestedSchema.map { a => + val exactMatch: Option[Int] = physicalSchema.getFieldIndex(a.name) + val res = exactMatch.getOrElse( + caseInsensitiveFieldMap.getOrElse(a.name, + throw new IllegalArgumentException(s"""Field "$a.name" does not exist."""))) + res: Integer + } val (sortedIDs, sortedNames) = ids.zip(requestedSchema.fieldNames).sorted.unzip HiveShim.appendReadColumns(conf, sortedIDs, sortedNames) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala index 96e9054cd4876..f65e74de87a57 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala @@ -17,10 +17,14 @@ package org.apache.spark.sql.hive +import java.io.File + +import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.QueryTest -class HiveDataFrameSuite extends QueryTest with TestHiveSingleton { +class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { test("table name with schema") { // regression test for SPARK-11778 spark.sql("create schema usrdb") @@ -34,4 +38,107 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton { val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client assert(hiveClient.getConf("hive.in.test", "") == "true") } + + private def setupPartitionedTable(tableName: String, dir: File): Unit = { + 
spark.range(5).selectExpr("id", "id as partCol1", "id as partCol2").write + .partitionBy("partCol1", "partCol2") + .mode("overwrite") + .parquet(dir.getAbsolutePath) + + spark.sql(s""" + |create external table $tableName (id long) + |partitioned by (partCol1 int, partCol2 int) + |stored as parquet + |location "${dir.getAbsolutePath}"""".stripMargin) + spark.sql(s"msck repair table $tableName") + } + + test("partitioned pruned table reports only selected files") { + assert(spark.sqlContext.getConf(HiveUtils.CONVERT_METASTORE_PARQUET.key) == "true") + withTable("test") { + withTempDir { dir => + setupPartitionedTable("test", dir) + val df = spark.sql("select * from test") + assert(df.count() == 5) + assert(df.inputFiles.length == 5) // unpruned + + val df2 = spark.sql("select * from test where partCol1 = 3 or partCol2 = 4") + assert(df2.count() == 2) + assert(df2.inputFiles.length == 2) // pruned, so we have less files + + val df3 = spark.sql("select * from test where PARTCOL1 = 3 or partcol2 = 4") + assert(df3.count() == 2) + assert(df3.inputFiles.length == 2) + + val df4 = spark.sql("select * from test where partCol1 = 999") + assert(df4.count() == 0) + assert(df4.inputFiles.length == 0) + } + } + } + + test("lazy partition pruning reads only necessary partition data") { + withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "true") { + withTable("test") { + withTempDir { dir => + setupPartitionedTable("test", dir) + HiveCatalogMetrics.reset() + spark.sql("select * from test where partCol1 = 999").count() + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0) + + HiveCatalogMetrics.reset() + spark.sql("select * from test where partCol1 < 2").count() + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 2) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2) + + HiveCatalogMetrics.reset() + spark.sql("select * from test where partCol1 < 3").count() + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 3) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 3) + + // should read all + HiveCatalogMetrics.reset() + spark.sql("select * from test").count() + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) + + // read all should be cached + HiveCatalogMetrics.reset() + spark.sql("select * from test").count() + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0) + } + } + } + } + + test("all partitions read and cached when filesource partition pruning is off") { + withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "false") { + withTable("test") { + withTempDir { dir => + setupPartitionedTable("test", dir) + + // We actually query the partitions from hive each time the table is resolved in this + // mode. This is kind of terrible, but is needed to preserve the legacy behavior + // of doing plan cache validation based on the entire partition set. 
+ HiveCatalogMetrics.reset() + spark.sql("select * from test where partCol1 = 999").count() + // 5 from table resolution, another 5 from ListingFileCatalog + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 10) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) + + HiveCatalogMetrics.reset() + spark.sql("select * from test where partCol1 < 2").count() + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0) + + HiveCatalogMetrics.reset() + spark.sql("select * from test").count() + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0) + } + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala index 3414f5e0409a1..7af81a3a90504 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala @@ -59,4 +59,45 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi } } } + + def testCaching(pruningEnabled: Boolean): Unit = { + test(s"partitioned table is cached when partition pruning is $pruningEnabled") { + withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> pruningEnabled.toString) { + withTable("test") { + withTempDir { dir => + spark.range(5).selectExpr("id", "id as f1", "id as f2").write + .partitionBy("f1", "f2") + .mode("overwrite") + .parquet(dir.getAbsolutePath) + + spark.sql(s""" + |create external table test (id long) + |partitioned by (f1 int, f2 int) + |stored as parquet + |location "${dir.getAbsolutePath}"""".stripMargin) + spark.sql("msck repair table test") + + val df = spark.sql("select * from test") + assert(sql("select * from test").count() == 5) + + // Delete a file, then assert that we tried to read it. This means the table was cached. + val p = new Path(spark.table("test").inputFiles.head) + assert(p.getFileSystem(hiveContext.sessionState.newHadoopConf()).delete(p, true)) + val e = intercept[SparkException] { + sql("select * from test").count() + } + assert(e.getMessage.contains("FileNotFoundException")) + + // Test refreshing the cache. 
+ spark.catalog.refreshTable("test") + assert(sql("select * from test").count() == 4) + } + } + } + } + } + + for (pruningEnabled <- Seq(true, false)) { + testCaching(pruningEnabled) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index c158bf1ab09cb..9a10957c8efa5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -295,12 +295,12 @@ class VersionsSuite extends SparkFunSuite with Logging { } test(s"$version: getPartitions(catalogTable)") { - assert(2 == client.getPartitions("default", "src_part").size) + assert(2 == client.getPartitions(client.getTable("default", "src_part")).size) } test(s"$version: getPartitionsByFilter") { // Only one partition [1, 1] for key2 == 1 - val result = client.getPartitionsByFilter("default", "src_part", + val result = client.getPartitionsByFilter(client.getTable("default", "src_part"), Seq(EqualTo(AttributeReference("key2", IntegerType)(), Literal(1)))) // Hive 0.12 doesn't support getPartitionsByFilter, it ignores the filter condition. diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala index b2ee49c441ef2..ecb5972984523 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala @@ -474,6 +474,28 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } } + test("converted ORC table supports resolving mixed case field") { + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { + withTable("dummy_orc") { + withTempPath { dir => + val df = spark.range(5).selectExpr("id", "id as valueField", "id as partitionValue") + df.write + .partitionBy("partitionValue") + .mode("overwrite") + .orc(dir.getAbsolutePath) + + spark.sql(s""" + |create external table dummy_orc (id long, valueField long) + |partitioned by (partitionValue int) + |stored as orc + |location "${dir.getAbsolutePath}"""".stripMargin) + spark.sql(s"msck repair table dummy_orc") + checkAnswer(spark.sql("select * from dummy_orc"), df) + } + } + } + } + test("SPARK-14962 Produce correct results on array type with isnotnull") { withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") { val data = (0 until 10).map(i => Tuple1(Array(i))) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala index 2f6d9fb96b825..9fc62a389db4d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala @@ -175,7 +175,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { (1 to 10).map(i => Tuple1(Seq(new Integer(i), null))).toDF("a") .createOrReplaceTempView("jt_array") - setConf(HiveUtils.CONVERT_METASTORE_PARQUET, true) + assert(spark.sqlContext.getConf(HiveUtils.CONVERT_METASTORE_PARQUET.key) == "true") } override def afterAll(): Unit = { @@ -187,7 +187,6 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { "jt", "jt_array", "test_parquet") - setConf(HiveUtils.CONVERT_METASTORE_PARQUET, false) } test(s"conversion is working") { @@ -586,6 +585,23 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest 
{ checkAnswer( sql("SELECT * FROM test_added_partitions"), Seq(("foo", 0), ("bar", 0), ("baz", 1)).toDF("a", "b")) + + // Check it with pruning predicates + checkAnswer( + sql("SELECT * FROM test_added_partitions where b = 0"), + Seq(("foo", 0), ("bar", 0)).toDF("a", "b")) + checkAnswer( + sql("SELECT * FROM test_added_partitions where b = 1"), + Seq(("baz", 1)).toDF("a", "b")) + checkAnswer( + sql("SELECT * FROM test_added_partitions where b = 2"), + Seq[(String, Int)]().toDF("a", "b")) + + // Also verify the inputFiles implementation + assert(sql("select * from test_added_partitions").inputFiles.length == 2) + assert(sql("select * from test_added_partitions where b = 0").inputFiles.length == 1) + assert(sql("select * from test_added_partitions where b = 1").inputFiles.length == 1) + assert(sql("select * from test_added_partitions where b = 2").inputFiles.length == 0) } } } From 36d81c2c68ef4114592b069287743eb5cb078318 Mon Sep 17 00:00:00 2001 From: Jun Kim Date: Sat, 15 Oct 2016 00:36:55 -0700 Subject: [PATCH 028/162] [SPARK-17953][DOCUMENTATION] Fix typo in SparkSession scaladoc ## What changes were proposed in this pull request? ### Before: ```scala SparkSession.builder() .master("local") .appName("Word Count") .config("spark.some.config.option", "some-value"). .getOrCreate() ``` ### After: ```scala SparkSession.builder() .master("local") .appName("Word Count") .config("spark.some.config.option", "some-value") .getOrCreate() ``` There was one unexpected dot! Author: Jun Kim Closes #15498 from tae-jun/SPARK-17953. --- sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 137c426b4b88d..baae55013787d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -64,7 +64,7 @@ import org.apache.spark.util.Utils * SparkSession.builder() * .master("local") * .appName("Word Count") - * .config("spark.some.config.option", "some-value"). + * .config("spark.some.config.option", "some-value") * .getOrCreate() * }}} */ From ed1463341455830b8867b721a1b34f291139baf3 Mon Sep 17 00:00:00 2001 From: Zhan Zhang Date: Sat, 15 Oct 2016 18:45:04 -0700 Subject: [PATCH 029/162] [SPARK-17637][SCHEDULER] Packed scheduling for Spark tasks across executors ## What changes were proposed in this pull request? Restructure the code and implement two new task assigner. PackedAssigner: try to allocate tasks to the executors with least available cores, so that spark can release reserved executors when dynamic allocation is enabled. BalancedAssigner: try to allocate tasks to the executors with more available cores in order to balance the workload across all executors. By default, the original round robin assigner is used. We test a pipeline, and new PackedAssigner save around 45% regarding the reserved cpu and memory with dynamic allocation enabled. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Both unit test in TaskSchedulerImplSuite and manual tests in production pipeline. Author: Zhan Zhang Closes #15218 from zhzhan/packed-scheduler. 
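As a quick, hedged illustration of how an application would opt into one of the new assigners via the `spark.task.assigner` key added here — the key and assigner class names are taken from the `docs/configuration.md` change below, while the master and app name are made up for the example; per the `TaskSchedulerImpl` change, the scheduler falls back to the default round-robin assigner if the configured class cannot be constructed:

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Sketch only: request the packing strategy introduced by this patch, which
// concentrates tasks on executors with the fewest free cores so that idle
// executors can be released when dynamic allocation is enabled.
val conf = new SparkConf()
  .setMaster("local[4]")                   // illustrative master
  .setAppName("packed-assigner-example")   // illustrative app name
  .set("spark.task.assigner", "org.apache.spark.scheduler.PackedAssigner")
val sc = new SparkContext(conf)
```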
--- .../apache/spark/scheduler/TaskAssigner.scala | 154 ++++++++++++++++++ .../spark/scheduler/TaskSchedulerImpl.scala | 53 +++--- .../scheduler/TaskSchedulerImplSuite.scala | 67 ++++++++ docs/configuration.md | 11 ++ 4 files changed, 266 insertions(+), 19 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/scheduler/TaskAssigner.scala diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskAssigner.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskAssigner.scala new file mode 100644 index 0000000000000..62df9657a6ac6 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskAssigner.scala @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.scheduler + +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.PriorityQueue +import scala.util.Random + +import org.apache.spark.SparkConf + +case class OfferState(workOffer: WorkerOffer, var cores: Int) { + // Build a list of tasks to assign to each worker. + val tasks = new ArrayBuffer[TaskDescription](cores) +} + +abstract class TaskAssigner(conf: SparkConf) { + var offer: Seq[OfferState] = _ + val CPUS_PER_TASK = conf.getInt("spark.task.cpus", 1) + + // The final assigned offer returned to TaskScheduler. + def tasks(): Seq[ArrayBuffer[TaskDescription]] = offer.map(_.tasks) + + // construct the assigner by the workoffer. + def construct(workOffer: Seq[WorkerOffer]): Unit = { + offer = workOffer.map(o => OfferState(o, o.cores)) + } + + // Invoked in each round of Taskset assignment to initialize the internal structure. + def init(): Unit + + // Indicating whether there is offer available to be used by one round of Taskset assignment. + def hasNext(): Boolean + + // Next available offer returned to one round of Taskset assignment. + def getNext(): OfferState + + // Called by the TaskScheduler to indicate whether the current offer is accepted + // In order to decide whether the current is valid for the next offering. + def taskAssigned(assigned: Boolean): Unit + + // Release internally maintained resources. Subclass is responsible to + // release its own private resources. 
+ def reset: Unit = { + offer = null + } +} + +class RoundRobinAssigner(conf: SparkConf) extends TaskAssigner(conf) { + var i = 0 + override def construct(workOffer: Seq[WorkerOffer]): Unit = { + offer = Random.shuffle(workOffer.map(o => OfferState(o, o.cores))) + } + override def init(): Unit = { + i = 0 + } + override def hasNext: Boolean = { + i < offer.size + } + override def getNext(): OfferState = { + offer(i) + } + override def taskAssigned(assigned: Boolean): Unit = { + i += 1 + } + override def reset: Unit = { + super.reset + i = 0 + } +} + +class BalancedAssigner(conf: SparkConf) extends TaskAssigner(conf) { + var maxHeap: PriorityQueue[OfferState] = _ + var current: OfferState = _ + + override def construct(workOffer: Seq[WorkerOffer]): Unit = { + offer = Random.shuffle(workOffer.map(o => OfferState(o, o.cores))) + } + implicit val ord: Ordering[OfferState] = new Ordering[OfferState] { + def compare(x: OfferState, y: OfferState): Int = { + return Ordering[Int].compare(x.cores, y.cores) + } + } + def init(): Unit = { + maxHeap = new PriorityQueue[OfferState]() + offer.filter(_.cores >= CPUS_PER_TASK).foreach(maxHeap.enqueue(_)) + } + override def hasNext: Boolean = { + maxHeap.size > 0 + } + override def getNext(): OfferState = { + current = maxHeap.dequeue() + current + } + + override def taskAssigned(assigned: Boolean): Unit = { + if (current.cores >= CPUS_PER_TASK && assigned) { + maxHeap.enqueue(current) + } + } + override def reset: Unit = { + super.reset + maxHeap = null + current = null + } +} + +class PackedAssigner(conf: SparkConf) extends TaskAssigner(conf) { + + var sorted: Seq[OfferState] = _ + var i = 0 + var current: OfferState = _ + + override def init(): Unit = { + i = 0 + sorted = offer.filter(_.cores >= CPUS_PER_TASK).sortBy(_.cores) + } + + override def hasNext: Boolean = { + i < sorted.size + } + + override def getNext(): OfferState = { + current = sorted(i) + current + } + + def taskAssigned(assigned: Boolean): Unit = { + if (current.cores < CPUS_PER_TASK || !assigned) { + i += 1 + } + } + + override def reset: Unit = { + super.reset + sorted = null + current = null + i = 0 + } +} diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 3e3f1ad031e66..fb732ea8e5a3b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -22,9 +22,7 @@ import java.util.{Timer, TimerTask} import java.util.concurrent.TimeUnit import java.util.concurrent.atomic.AtomicLong -import scala.collection.Set import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} -import scala.util.Random import org.apache.spark._ import org.apache.spark.TaskState.TaskState @@ -61,6 +59,21 @@ private[spark] class TaskSchedulerImpl( val conf = sc.conf + val DEFAULT_TASK_ASSIGNER = classOf[RoundRobinAssigner].getName + lazy val taskAssigner: TaskAssigner = { + val className = conf.get("spark.task.assigner", DEFAULT_TASK_ASSIGNER) + try { + logInfo(s"""constructing assigner as $className""") + val ctor = Utils.classForName(className).getConstructor(classOf[SparkConf]) + ctor.newInstance(conf).asInstanceOf[TaskAssigner] + } catch { + case _: Throwable => + logWarning( + s"""$className cannot be constructed fallback to default + | $DEFAULT_TASK_ASSIGNER""".stripMargin) + new RoundRobinAssigner(conf) + } + } // How often to check for speculative tasks val SPECULATION_INTERVAL_MS = 
conf.getTimeAsMs("spark.speculation.interval", "100ms") @@ -250,24 +263,26 @@ private[spark] class TaskSchedulerImpl( private def resourceOfferSingleTaskSet( taskSet: TaskSetManager, maxLocality: TaskLocality, - shuffledOffers: Seq[WorkerOffer], - availableCpus: Array[Int], - tasks: IndexedSeq[ArrayBuffer[TaskDescription]]) : Boolean = { + taskAssigner: TaskAssigner) : Boolean = { var launchedTask = false - for (i <- 0 until shuffledOffers.size) { - val execId = shuffledOffers(i).executorId - val host = shuffledOffers(i).host - if (availableCpus(i) >= CPUS_PER_TASK) { + taskAssigner.init() + while(taskAssigner.hasNext()) { + var assigned = false + val current = taskAssigner.getNext() + val execId = current.workOffer.executorId + val host = current.workOffer.host + if (current.cores >= CPUS_PER_TASK) { try { for (task <- taskSet.resourceOffer(execId, host, maxLocality)) { - tasks(i) += task + current.tasks += task val tid = task.taskId taskIdToTaskSetManager(tid) = taskSet taskIdToExecutorId(tid) = execId executorIdToTaskCount(execId) += 1 - availableCpus(i) -= CPUS_PER_TASK - assert(availableCpus(i) >= 0) + current.cores = current.cores - CPUS_PER_TASK + assert(current.cores >= 0) launchedTask = true + assigned = true } } catch { case e: TaskNotSerializableException => @@ -277,8 +292,10 @@ private[spark] class TaskSchedulerImpl( return launchedTask } } + taskAssigner.taskAssigned(assigned) } return launchedTask + } /** @@ -305,12 +322,8 @@ private[spark] class TaskSchedulerImpl( hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host } } + taskAssigner.construct(offers) - // Randomly shuffle offers to avoid always placing tasks on the same set of workers. - val shuffledOffers = Random.shuffle(offers) - // Build a list of tasks to assign to each worker. 
- val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores)) - val availableCpus = shuffledOffers.map(o => o.cores).toArray val sortedTaskSets = rootPool.getSortedTaskSetQueue for (taskSet <- sortedTaskSets) { logDebug("parentName: %s, name: %s, runningTasks: %s".format( @@ -329,7 +342,7 @@ private[spark] class TaskSchedulerImpl( for (currentMaxLocality <- taskSet.myLocalityLevels) { do { launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet( - taskSet, currentMaxLocality, shuffledOffers, availableCpus, tasks) + taskSet, currentMaxLocality, taskAssigner) launchedAnyTask |= launchedTaskAtCurrentMaxLocality } while (launchedTaskAtCurrentMaxLocality) } @@ -337,10 +350,12 @@ private[spark] class TaskSchedulerImpl( taskSet.abortIfCompletelyBlacklisted(hostToExecutors) } } - + val tasks = taskAssigner.tasks + taskAssigner.reset if (tasks.size > 0) { hasLaunchedTask = true } + return tasks } diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index f5f1947661d9a..2584f85bc553a 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -109,6 +109,72 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B assert(!failedTaskSet) } + test("Scheduler balance the assignment to the worker with more free cores") { + val taskScheduler = setupScheduler(("spark.task.assigner", classOf[BalancedAssigner].getName)) + val workerOffers = IndexedSeq(new WorkerOffer("executor0", "host0", 2), + new WorkerOffer("executor1", "host1", 4)) + val selectedExecutorIds = { + val taskSet = FakeTask.createTaskSet(2) + taskScheduler.submitTasks(taskSet) + val taskDescriptions = taskScheduler.resourceOffers(workerOffers).flatten + assert(2 === taskDescriptions.length) + taskDescriptions.map(_.executorId) + } + val count = selectedExecutorIds.count(_ == workerOffers(1).executorId) + assert(count == 2) + assert(!failedTaskSet) + } + + test("Scheduler balance the assignment across workers with same free cores") { + val taskScheduler = setupScheduler(("spark.task.assigner", classOf[BalancedAssigner].getName)) + val workerOffers = IndexedSeq(new WorkerOffer("executor0", "host0", 2), + new WorkerOffer("executor1", "host1", 2)) + val selectedExecutorIds = { + val taskSet = FakeTask.createTaskSet(2) + taskScheduler.submitTasks(taskSet) + val taskDescriptions = taskScheduler.resourceOffers(workerOffers).flatten + assert(2 === taskDescriptions.length) + taskDescriptions.map(_.executorId) + } + val count = selectedExecutorIds.count(_ == workerOffers(1).executorId) + assert(count == 1) + assert(!failedTaskSet) + } + + test("Scheduler packs the assignment to workers with less free cores") { + val taskScheduler = setupScheduler(("spark.task.assigner", classOf[PackedAssigner].getName)) + val workerOffers = IndexedSeq(new WorkerOffer("executor0", "host0", 2), + new WorkerOffer("executor1", "host1", 4)) + val selectedExecutorIds = { + val taskSet = FakeTask.createTaskSet(2) + taskScheduler.submitTasks(taskSet) + val taskDescriptions = taskScheduler.resourceOffers(workerOffers).flatten + assert(2 === taskDescriptions.length) + taskDescriptions.map(_.executorId) + } + val count = selectedExecutorIds.count(_ == workerOffers(0).executorId) + assert(count == 2) + assert(!failedTaskSet) + } + + test("Scheduler keeps packing the assignment to the same worker") { + val 
taskScheduler = setupScheduler(("spark.task.assigner", classOf[PackedAssigner].getName)) + val workerOffers = IndexedSeq(new WorkerOffer("executor0", "host0", 4), + new WorkerOffer("executor1", "host1", 4)) + val selectedExecutorIds = { + val taskSet = FakeTask.createTaskSet(4) + taskScheduler.submitTasks(taskSet) + val taskDescriptions = taskScheduler.resourceOffers(workerOffers).flatten + assert(4 === taskDescriptions.length) + taskDescriptions.map(_.executorId) + } + + val count = selectedExecutorIds.count(_ == workerOffers(0).executorId) + assert(count == 4) + assert(!failedTaskSet) + } + + test("Scheduler correctly accounts for multiple CPUs per task") { val taskCpus = 2 val taskScheduler = setupScheduler("spark.task.cpus" -> taskCpus.toString) @@ -408,4 +474,5 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B assert(thirdTaskDescs.size === 0) assert(taskScheduler.getExecutorsAliveOnHost("host1") === Some(Set("executor1", "executor3"))) } + } diff --git a/docs/configuration.md b/docs/configuration.md index 373e22d71a872..6f3fbeb76cc24 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1334,6 +1334,17 @@ Apart from these, the following properties are also available, and may be useful Should be greater than or equal to 1. Number of allowed retries = this value - 1. + + spark.task.assigner + org.apache.spark.scheduler.RoundRobinAssigner + + The strategy of how to allocate tasks among workers with free cores. + By default, round robin with randomness is used. + org.apache.spark.scheduler.BalancedAssigner tries to balance the task across all workers (allocating tasks to + workers with most free cores). org.apache.spark.scheduler.PackedAssigner tries to allocate tasks to workers + with the least free cores, which may help releasing the resources when dynamic allocation is enabled. + + #### Dynamic Allocation From 72a6e7a57a63aba69f26c84bf68a5fb213d2a521 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Sat, 15 Oct 2016 22:31:37 -0700 Subject: [PATCH 030/162] Revert "[SPARK-17637][SCHEDULER] Packed scheduling for Spark tasks across executors" This reverts commit ed1463341455830b8867b721a1b34f291139baf3. The patch merged had obvious quality and documentation issue. The idea is useful, and we should work towards improving its quality and merging it in again. --- .../apache/spark/scheduler/TaskAssigner.scala | 154 ------------------ .../spark/scheduler/TaskSchedulerImpl.scala | 53 +++--- .../scheduler/TaskSchedulerImplSuite.scala | 67 -------- docs/configuration.md | 11 -- 4 files changed, 19 insertions(+), 266 deletions(-) delete mode 100644 core/src/main/scala/org/apache/spark/scheduler/TaskAssigner.scala diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskAssigner.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskAssigner.scala deleted file mode 100644 index 62df9657a6ac6..0000000000000 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskAssigner.scala +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.scheduler - -import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable.PriorityQueue -import scala.util.Random - -import org.apache.spark.SparkConf - -case class OfferState(workOffer: WorkerOffer, var cores: Int) { - // Build a list of tasks to assign to each worker. - val tasks = new ArrayBuffer[TaskDescription](cores) -} - -abstract class TaskAssigner(conf: SparkConf) { - var offer: Seq[OfferState] = _ - val CPUS_PER_TASK = conf.getInt("spark.task.cpus", 1) - - // The final assigned offer returned to TaskScheduler. - def tasks(): Seq[ArrayBuffer[TaskDescription]] = offer.map(_.tasks) - - // construct the assigner by the workoffer. - def construct(workOffer: Seq[WorkerOffer]): Unit = { - offer = workOffer.map(o => OfferState(o, o.cores)) - } - - // Invoked in each round of Taskset assignment to initialize the internal structure. - def init(): Unit - - // Indicating whether there is offer available to be used by one round of Taskset assignment. - def hasNext(): Boolean - - // Next available offer returned to one round of Taskset assignment. - def getNext(): OfferState - - // Called by the TaskScheduler to indicate whether the current offer is accepted - // In order to decide whether the current is valid for the next offering. - def taskAssigned(assigned: Boolean): Unit - - // Release internally maintained resources. Subclass is responsible to - // release its own private resources. 
- def reset: Unit = { - offer = null - } -} - -class RoundRobinAssigner(conf: SparkConf) extends TaskAssigner(conf) { - var i = 0 - override def construct(workOffer: Seq[WorkerOffer]): Unit = { - offer = Random.shuffle(workOffer.map(o => OfferState(o, o.cores))) - } - override def init(): Unit = { - i = 0 - } - override def hasNext: Boolean = { - i < offer.size - } - override def getNext(): OfferState = { - offer(i) - } - override def taskAssigned(assigned: Boolean): Unit = { - i += 1 - } - override def reset: Unit = { - super.reset - i = 0 - } -} - -class BalancedAssigner(conf: SparkConf) extends TaskAssigner(conf) { - var maxHeap: PriorityQueue[OfferState] = _ - var current: OfferState = _ - - override def construct(workOffer: Seq[WorkerOffer]): Unit = { - offer = Random.shuffle(workOffer.map(o => OfferState(o, o.cores))) - } - implicit val ord: Ordering[OfferState] = new Ordering[OfferState] { - def compare(x: OfferState, y: OfferState): Int = { - return Ordering[Int].compare(x.cores, y.cores) - } - } - def init(): Unit = { - maxHeap = new PriorityQueue[OfferState]() - offer.filter(_.cores >= CPUS_PER_TASK).foreach(maxHeap.enqueue(_)) - } - override def hasNext: Boolean = { - maxHeap.size > 0 - } - override def getNext(): OfferState = { - current = maxHeap.dequeue() - current - } - - override def taskAssigned(assigned: Boolean): Unit = { - if (current.cores >= CPUS_PER_TASK && assigned) { - maxHeap.enqueue(current) - } - } - override def reset: Unit = { - super.reset - maxHeap = null - current = null - } -} - -class PackedAssigner(conf: SparkConf) extends TaskAssigner(conf) { - - var sorted: Seq[OfferState] = _ - var i = 0 - var current: OfferState = _ - - override def init(): Unit = { - i = 0 - sorted = offer.filter(_.cores >= CPUS_PER_TASK).sortBy(_.cores) - } - - override def hasNext: Boolean = { - i < sorted.size - } - - override def getNext(): OfferState = { - current = sorted(i) - current - } - - def taskAssigned(assigned: Boolean): Unit = { - if (current.cores < CPUS_PER_TASK || !assigned) { - i += 1 - } - } - - override def reset: Unit = { - super.reset - sorted = null - current = null - i = 0 - } -} diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index fb732ea8e5a3b..3e3f1ad031e66 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -22,7 +22,9 @@ import java.util.{Timer, TimerTask} import java.util.concurrent.TimeUnit import java.util.concurrent.atomic.AtomicLong +import scala.collection.Set import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} +import scala.util.Random import org.apache.spark._ import org.apache.spark.TaskState.TaskState @@ -59,21 +61,6 @@ private[spark] class TaskSchedulerImpl( val conf = sc.conf - val DEFAULT_TASK_ASSIGNER = classOf[RoundRobinAssigner].getName - lazy val taskAssigner: TaskAssigner = { - val className = conf.get("spark.task.assigner", DEFAULT_TASK_ASSIGNER) - try { - logInfo(s"""constructing assigner as $className""") - val ctor = Utils.classForName(className).getConstructor(classOf[SparkConf]) - ctor.newInstance(conf).asInstanceOf[TaskAssigner] - } catch { - case _: Throwable => - logWarning( - s"""$className cannot be constructed fallback to default - | $DEFAULT_TASK_ASSIGNER""".stripMargin) - new RoundRobinAssigner(conf) - } - } // How often to check for speculative tasks val SPECULATION_INTERVAL_MS = 
conf.getTimeAsMs("spark.speculation.interval", "100ms") @@ -263,26 +250,24 @@ private[spark] class TaskSchedulerImpl( private def resourceOfferSingleTaskSet( taskSet: TaskSetManager, maxLocality: TaskLocality, - taskAssigner: TaskAssigner) : Boolean = { + shuffledOffers: Seq[WorkerOffer], + availableCpus: Array[Int], + tasks: IndexedSeq[ArrayBuffer[TaskDescription]]) : Boolean = { var launchedTask = false - taskAssigner.init() - while(taskAssigner.hasNext()) { - var assigned = false - val current = taskAssigner.getNext() - val execId = current.workOffer.executorId - val host = current.workOffer.host - if (current.cores >= CPUS_PER_TASK) { + for (i <- 0 until shuffledOffers.size) { + val execId = shuffledOffers(i).executorId + val host = shuffledOffers(i).host + if (availableCpus(i) >= CPUS_PER_TASK) { try { for (task <- taskSet.resourceOffer(execId, host, maxLocality)) { - current.tasks += task + tasks(i) += task val tid = task.taskId taskIdToTaskSetManager(tid) = taskSet taskIdToExecutorId(tid) = execId executorIdToTaskCount(execId) += 1 - current.cores = current.cores - CPUS_PER_TASK - assert(current.cores >= 0) + availableCpus(i) -= CPUS_PER_TASK + assert(availableCpus(i) >= 0) launchedTask = true - assigned = true } } catch { case e: TaskNotSerializableException => @@ -292,10 +277,8 @@ private[spark] class TaskSchedulerImpl( return launchedTask } } - taskAssigner.taskAssigned(assigned) } return launchedTask - } /** @@ -322,8 +305,12 @@ private[spark] class TaskSchedulerImpl( hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host } } - taskAssigner.construct(offers) + // Randomly shuffle offers to avoid always placing tasks on the same set of workers. + val shuffledOffers = Random.shuffle(offers) + // Build a list of tasks to assign to each worker. 
+ val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores)) + val availableCpus = shuffledOffers.map(o => o.cores).toArray val sortedTaskSets = rootPool.getSortedTaskSetQueue for (taskSet <- sortedTaskSets) { logDebug("parentName: %s, name: %s, runningTasks: %s".format( @@ -342,7 +329,7 @@ private[spark] class TaskSchedulerImpl( for (currentMaxLocality <- taskSet.myLocalityLevels) { do { launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet( - taskSet, currentMaxLocality, taskAssigner) + taskSet, currentMaxLocality, shuffledOffers, availableCpus, tasks) launchedAnyTask |= launchedTaskAtCurrentMaxLocality } while (launchedTaskAtCurrentMaxLocality) } @@ -350,12 +337,10 @@ private[spark] class TaskSchedulerImpl( taskSet.abortIfCompletelyBlacklisted(hostToExecutors) } } - val tasks = taskAssigner.tasks - taskAssigner.reset + if (tasks.size > 0) { hasLaunchedTask = true } - return tasks } diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index 2584f85bc553a..f5f1947661d9a 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -109,72 +109,6 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B assert(!failedTaskSet) } - test("Scheduler balance the assignment to the worker with more free cores") { - val taskScheduler = setupScheduler(("spark.task.assigner", classOf[BalancedAssigner].getName)) - val workerOffers = IndexedSeq(new WorkerOffer("executor0", "host0", 2), - new WorkerOffer("executor1", "host1", 4)) - val selectedExecutorIds = { - val taskSet = FakeTask.createTaskSet(2) - taskScheduler.submitTasks(taskSet) - val taskDescriptions = taskScheduler.resourceOffers(workerOffers).flatten - assert(2 === taskDescriptions.length) - taskDescriptions.map(_.executorId) - } - val count = selectedExecutorIds.count(_ == workerOffers(1).executorId) - assert(count == 2) - assert(!failedTaskSet) - } - - test("Scheduler balance the assignment across workers with same free cores") { - val taskScheduler = setupScheduler(("spark.task.assigner", classOf[BalancedAssigner].getName)) - val workerOffers = IndexedSeq(new WorkerOffer("executor0", "host0", 2), - new WorkerOffer("executor1", "host1", 2)) - val selectedExecutorIds = { - val taskSet = FakeTask.createTaskSet(2) - taskScheduler.submitTasks(taskSet) - val taskDescriptions = taskScheduler.resourceOffers(workerOffers).flatten - assert(2 === taskDescriptions.length) - taskDescriptions.map(_.executorId) - } - val count = selectedExecutorIds.count(_ == workerOffers(1).executorId) - assert(count == 1) - assert(!failedTaskSet) - } - - test("Scheduler packs the assignment to workers with less free cores") { - val taskScheduler = setupScheduler(("spark.task.assigner", classOf[PackedAssigner].getName)) - val workerOffers = IndexedSeq(new WorkerOffer("executor0", "host0", 2), - new WorkerOffer("executor1", "host1", 4)) - val selectedExecutorIds = { - val taskSet = FakeTask.createTaskSet(2) - taskScheduler.submitTasks(taskSet) - val taskDescriptions = taskScheduler.resourceOffers(workerOffers).flatten - assert(2 === taskDescriptions.length) - taskDescriptions.map(_.executorId) - } - val count = selectedExecutorIds.count(_ == workerOffers(0).executorId) - assert(count == 2) - assert(!failedTaskSet) - } - - test("Scheduler keeps packing the assignment to the same worker") { - val 
taskScheduler = setupScheduler(("spark.task.assigner", classOf[PackedAssigner].getName)) - val workerOffers = IndexedSeq(new WorkerOffer("executor0", "host0", 4), - new WorkerOffer("executor1", "host1", 4)) - val selectedExecutorIds = { - val taskSet = FakeTask.createTaskSet(4) - taskScheduler.submitTasks(taskSet) - val taskDescriptions = taskScheduler.resourceOffers(workerOffers).flatten - assert(4 === taskDescriptions.length) - taskDescriptions.map(_.executorId) - } - - val count = selectedExecutorIds.count(_ == workerOffers(0).executorId) - assert(count == 4) - assert(!failedTaskSet) - } - - test("Scheduler correctly accounts for multiple CPUs per task") { val taskCpus = 2 val taskScheduler = setupScheduler("spark.task.cpus" -> taskCpus.toString) @@ -474,5 +408,4 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B assert(thirdTaskDescs.size === 0) assert(taskScheduler.getExecutorsAliveOnHost("host1") === Some(Set("executor1", "executor3"))) } - } diff --git a/docs/configuration.md b/docs/configuration.md index 6f3fbeb76cc24..373e22d71a872 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1334,17 +1334,6 @@ Apart from these, the following properties are also available, and may be useful Should be greater than or equal to 1. Number of allowed retries = this value - 1. - - spark.task.assigner - org.apache.spark.scheduler.RoundRobinAssigner - - The strategy of how to allocate tasks among workers with free cores. - By default, round robin with randomness is used. - org.apache.spark.scheduler.BalancedAssigner tries to balance the task across all workers (allocating tasks to - workers with most free cores). org.apache.spark.scheduler.PackedAssigner tries to allocate tasks to workers - with the least free cores, which may help releasing the resources when dynamic allocation is enabled. - - #### Dynamic Allocation From 59e3eb5af8d0969bbb785af77b66343bda7acc38 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sun, 16 Oct 2016 20:15:32 -0700 Subject: [PATCH 031/162] [SPARK-17819][SQL] Support default database in connection URIs for Spark Thrift Server ## What changes were proposed in this pull request? Currently, Spark Thrift Server ignores the default database in URI. This PR supports that like the following. ```sql $ bin/beeline -u jdbc:hive2://localhost:10000 -e "create database testdb" $ bin/beeline -u jdbc:hive2://localhost:10000/testdb -e "create table t(a int)" $ bin/beeline -u jdbc:hive2://localhost:10000/testdb -e "show tables" ... +------------+--------------+--+ | tableName | isTemporary | +------------+--------------+--+ | t | false | +------------+--------------+--+ 1 row selected (0.347 seconds) $ bin/beeline -u jdbc:hive2://localhost:10000 -e "show tables" ... +------------+--------------+--+ | tableName | isTemporary | +------------+--------------+--+ +------------+--------------+--+ No rows selected (0.098 seconds) ``` ## How was this patch tested? Manual. Note: I tried to add a test case for this, but I cannot found a suitable testsuite for this. I'll add the testcase if some advice is given. Author: Dongjoon Hyun Closes #15399 from dongjoon-hyun/SPARK-17819. 
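The new `JdbcConnectionUriSuite` below verifies this end to end; the following stand-alone sketch shows the same check from a plain JDBC client. The host, port, credentials, and the `testdb` database are illustrative, and the Thrift server is assumed to be running with that database already created:

```scala
import java.sql.DriverManager

// Sketch only: with this patch the database segment of the HiveServer2 URI
// selects the session's current database instead of being ignored.
Class.forName("org.apache.hive.jdbc.HiveDriver")
val conn = DriverManager.getConnection("jdbc:hive2://localhost:10000/testdb", "user", "")
try {
  val rs = conn.createStatement().executeQuery("select current_database()")
  rs.next()
  assert(rs.getString(1) == "testdb")
} finally {
  conn.close()
}
```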
--- .../thriftserver/SparkSQLSessionManager.scala | 3 + .../thriftserver/JdbcConnectionUriSuite.scala | 70 +++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/JdbcConnectionUriSuite.scala diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala index 6a5117aea492d..226b7e175a9d9 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala @@ -79,6 +79,9 @@ private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, sqlContext: sqlContext.newSession() } ctx.setConf("spark.sql.hive.version", HiveUtils.hiveExecutionVersion) + if (sessionConf != null && sessionConf.containsKey("use:database")) { + ctx.sql(s"use ${sessionConf.get("use:database")}") + } sparkSqlOperationManager.sessionToContexts.put(sessionHandle, ctx) sessionHandle } diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/JdbcConnectionUriSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/JdbcConnectionUriSuite.scala new file mode 100644 index 0000000000000..fb8a7e273ae44 --- /dev/null +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/JdbcConnectionUriSuite.scala @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.thriftserver + +import java.sql.DriverManager + +import org.apache.hive.jdbc.HiveDriver + +import org.apache.spark.util.Utils + +class JdbcConnectionUriSuite extends HiveThriftServer2Test { + Utils.classForName(classOf[HiveDriver].getCanonicalName) + + override def mode: ServerMode.Value = ServerMode.binary + + val JDBC_TEST_DATABASE = "jdbc_test_database" + val USER = System.getProperty("user.name") + val PASSWORD = "" + + override protected def beforeAll(): Unit = { + super.beforeAll() + + val jdbcUri = s"jdbc:hive2://localhost:$serverPort/" + val connection = DriverManager.getConnection(jdbcUri, USER, PASSWORD) + val statement = connection.createStatement() + statement.execute(s"CREATE DATABASE $JDBC_TEST_DATABASE") + connection.close() + } + + override protected def afterAll(): Unit = { + try { + val jdbcUri = s"jdbc:hive2://localhost:$serverPort/" + val connection = DriverManager.getConnection(jdbcUri, USER, PASSWORD) + val statement = connection.createStatement() + statement.execute(s"DROP DATABASE $JDBC_TEST_DATABASE") + connection.close() + } finally { + super.afterAll() + } + } + + test("SPARK-17819 Support default database in connection URIs") { + val jdbcUri = s"jdbc:hive2://localhost:$serverPort/$JDBC_TEST_DATABASE" + val connection = DriverManager.getConnection(jdbcUri, USER, PASSWORD) + val statement = connection.createStatement() + try { + val resultSet = statement.executeQuery("select current_database()") + resultSet.next() + assert(resultSet.getString(1) === JDBC_TEST_DATABASE) + } finally { + statement.close() + connection.close() + } + } +} From e18d02c5a8f8af2e42079ab414f5d84b3e1a279e Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 17 Oct 2016 12:08:25 +0800 Subject: [PATCH 032/162] [SPARK-17947][SQL] Add Doc and Comment about spark.sql.debug ### What changes were proposed in this pull request? Just document the impact of `spark.sql.debug`: When enabling the debug, Spark SQL internal table properties are not filtered out; however, some related DDL commands (e.g., Analyze Table and CREATE TABLE LIKE) might not work properly. ### How was this patch tested? N/A Author: gatorsmile Closes #15494 from gatorsmile/addDocForSQLDebug. --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e73d0187b584b..a055e0135c136 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -934,8 +934,11 @@ object StaticSQLConf { .intConf .createWithDefault(4000) + // When enabling the debug, Spark SQL internal table properties are not filtered out; however, + // some related DDL commands (e.g., ANALYZE TABLE and CREATE TABLE LIKE) might not work properly. val DEBUG_MODE = buildConf("spark.sql.debug") .internal() + .doc("Only used for internal debugging. Not all functions are supported when it is enabled.") .booleanConf .createWithDefault(false) } From 56b0f5f4d1d7826737b81ebc4ec5dad83b6463e3 Mon Sep 17 00:00:00 2001 From: Weiqing Yang Date: Sun, 16 Oct 2016 22:38:30 -0700 Subject: [PATCH 033/162] [MINOR][SQL] Add prettyName for current_database function ## What changes were proposed in this pull request? Added a `prettyname` for current_database function. ## How was this patch tested? Manually. 
Before: ``` scala> sql("select current_database()").show +-----------------+ |currentdatabase()| +-----------------+ | default| +-----------------+ ``` After: ``` scala> sql("select current_database()").show +------------------+ |current_database()| +------------------+ | default| +------------------+ ``` Author: Weiqing Yang Closes #15506 from weiqingy/prettyName. --- .../scala/org/apache/spark/sql/catalyst/expressions/misc.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index 138ef2a1dcc01..5ead16908732f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -618,6 +618,7 @@ case class CurrentDatabase() extends LeafExpression with Unevaluable { override def dataType: DataType = StringType override def foldable: Boolean = true override def nullable: Boolean = false + override def prettyName: String = "current_database" } /** From e3bf37fa3ada43624b2e77bef90ad3d3dbcd8ce1 Mon Sep 17 00:00:00 2001 From: Maxime Rihouey Date: Mon, 17 Oct 2016 10:56:22 +0100 Subject: [PATCH 034/162] Fix example of tf_idf with minDocFreq ## What changes were proposed in this pull request? The python example for tf_idf with the parameter "minDocFreq" is not properly set up because the same variable is used to transform the document for both with and without the "minDocFreq" parameter. The IDF(minDocFreq=2) is stored in the variable "idfIgnore" but then it is the original variable "idf" used to transform the "tf" instead of the "idfIgnore". ## How was this patch tested? Before the results for "tfidf" and "tfidfIgnore" were the same: tfidf: (1048576,[1046921],[3.75828890549]) (1048576,[1046920],[3.75828890549]) (1048576,[1046923],[3.75828890549]) (1048576,[892732],[3.75828890549]) (1048576,[892733],[3.75828890549]) (1048576,[892734],[3.75828890549]) tfidfIgnore: (1048576,[1046921],[3.75828890549]) (1048576,[1046920],[3.75828890549]) (1048576,[1046923],[3.75828890549]) (1048576,[892732],[3.75828890549]) (1048576,[892733],[3.75828890549]) (1048576,[892734],[3.75828890549]) After the fix those are how they should be: tfidf: (1048576,[1046921],[3.75828890549]) (1048576,[1046920],[3.75828890549]) (1048576,[1046923],[3.75828890549]) (1048576,[892732],[3.75828890549]) (1048576,[892733],[3.75828890549]) (1048576,[892734],[3.75828890549]) tfidfIgnore: (1048576,[1046921],[0.0]) (1048576,[1046920],[0.0]) (1048576,[1046923],[0.0]) (1048576,[892732],[0.0]) (1048576,[892733],[0.0]) (1048576,[892734],[0.0]) Author: Maxime Rihouey Closes #15503 from maximerihouey/patch-1. --- examples/src/main/python/mllib/tf_idf_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/python/mllib/tf_idf_example.py b/examples/src/main/python/mllib/tf_idf_example.py index c4d53333a95a9..b66412b2334e7 100644 --- a/examples/src/main/python/mllib/tf_idf_example.py +++ b/examples/src/main/python/mllib/tf_idf_example.py @@ -43,7 +43,7 @@ # In such cases, the IDF for these terms is set to 0. # This feature can be used by passing the minDocFreq value to the IDF constructor. 
idfIgnore = IDF(minDocFreq=2).fit(tf) - tfidfIgnore = idf.transform(tf) + tfidfIgnore = idfIgnore.transform(tf) # $example off$ print("tfidf:") From c7ac027d5fd7a80d3122a9269b2bb9c28c6a57db Mon Sep 17 00:00:00 2001 From: Sital Kedia Date: Mon, 17 Oct 2016 11:03:04 -0700 Subject: [PATCH 035/162] [SPARK-17839][CORE] Use Nio's directbuffer instead of BufferedInputStream in order to avoid additional copy from os buffer cache to user buffer ## What changes were proposed in this pull request? Currently we use BufferedInputStream to read the shuffle file which copies the file content from os buffer cache to the user buffer. This adds additional latency in reading the spill files. We made a change to use java nio's direct buffer to read the spill files and for certain pipelines spilling significant amount of data, we see up to 7% speedup for the entire pipeline. ## How was this patch tested? Tested by running the job in the cluster and observed up to 7% speedup. Author: Sital Kedia Closes #15408 from sitalkedia/skedia/nio_spill_read. --- .../spark/io/NioBufferedFileInputStream.java | 137 ++++++++++++++++++ .../unsafe/sort/UnsafeSorterSpillReader.java | 5 +- .../shuffle/IndexShuffleBlockResolver.scala | 3 +- .../io/NioBufferedFileInputStreamSuite.java | 135 +++++++++++++++++ .../spark/sql/execution/python/RowQueue.scala | 3 +- 5 files changed, 279 insertions(+), 4 deletions(-) create mode 100644 core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java create mode 100644 core/src/test/java/org/apache/spark/io/NioBufferedFileInputStreamSuite.java diff --git a/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java b/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java new file mode 100644 index 0000000000000..f6d1288cb263d --- /dev/null +++ b/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java @@ -0,0 +1,137 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.io; + +import org.apache.spark.storage.StorageUtils; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.StandardOpenOption; + +/** + * {@link InputStream} implementation which uses direct buffer + * to read a file to avoid extra copy of data between Java and + * native memory which happens when using {@link java.io.BufferedInputStream}. + * Unfortunately, this is not something already available in JDK, + * {@link sun.nio.ch.ChannelInputStream} supports reading a file using nio, + * but does not support buffering. 
+ */ +public final class NioBufferedFileInputStream extends InputStream { + + private static final int DEFAULT_BUFFER_SIZE_BYTES = 8192; + + private final ByteBuffer byteBuffer; + + private final FileChannel fileChannel; + + public NioBufferedFileInputStream(File file, int bufferSizeInBytes) throws IOException { + byteBuffer = ByteBuffer.allocateDirect(bufferSizeInBytes); + fileChannel = FileChannel.open(file.toPath(), StandardOpenOption.READ); + byteBuffer.flip(); + } + + public NioBufferedFileInputStream(File file) throws IOException { + this(file, DEFAULT_BUFFER_SIZE_BYTES); + } + + /** + * Checks weather data is left to be read from the input stream. + * @return true if data is left, false otherwise + * @throws IOException + */ + private boolean refill() throws IOException { + if (!byteBuffer.hasRemaining()) { + byteBuffer.clear(); + int nRead = 0; + while (nRead == 0) { + nRead = fileChannel.read(byteBuffer); + } + if (nRead < 0) { + return false; + } + byteBuffer.flip(); + } + return true; + } + + @Override + public synchronized int read() throws IOException { + if (!refill()) { + return -1; + } + return byteBuffer.get() & 0xFF; + } + + @Override + public synchronized int read(byte[] b, int offset, int len) throws IOException { + if (offset < 0 || len < 0 || offset + len < 0 || offset + len > b.length) { + throw new IndexOutOfBoundsException(); + } + if (!refill()) { + return -1; + } + len = Math.min(len, byteBuffer.remaining()); + byteBuffer.get(b, offset, len); + return len; + } + + @Override + public synchronized int available() throws IOException { + return byteBuffer.remaining(); + } + + @Override + public synchronized long skip(long n) throws IOException { + if (n <= 0L) { + return 0L; + } + if (byteBuffer.remaining() >= n) { + // The buffered content is enough to skip + byteBuffer.position(byteBuffer.position() + (int) n); + return n; + } + long skippedFromBuffer = byteBuffer.remaining(); + long toSkipFromFileChannel = n - skippedFromBuffer; + // Discard everything we have read in the buffer. 
+ byteBuffer.position(0); + byteBuffer.flip(); + return skippedFromBuffer + skipFromFileChannel(toSkipFromFileChannel); + } + + private long skipFromFileChannel(long n) throws IOException { + long currentFilePosition = fileChannel.position(); + long size = fileChannel.size(); + if (n > size - currentFilePosition) { + fileChannel.position(size); + return size - currentFilePosition; + } else { + fileChannel.position(currentFilePosition + n); + return n; + } + } + + @Override + public synchronized void close() throws IOException { + fileChannel.close(); + StorageUtils.dispose(byteBuffer); + } + + @Override + protected void finalize() throws IOException { + close(); + } +} diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java index e6d9766c31574..a658e5eb47b78 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java @@ -23,6 +23,7 @@ import com.google.common.io.Closeables; import org.apache.spark.SparkEnv; +import org.apache.spark.io.NioBufferedFileInputStream; import org.apache.spark.serializer.SerializerManager; import org.apache.spark.storage.BlockId; import org.apache.spark.unsafe.Platform; @@ -69,8 +70,8 @@ public UnsafeSorterSpillReader( bufferSizeBytes = DEFAULT_BUFFER_SIZE_BYTES; } - final BufferedInputStream bs = - new BufferedInputStream(new FileInputStream(file), (int) bufferSizeBytes); + final InputStream bs = + new NioBufferedFileInputStream(file, (int) bufferSizeBytes); try { this.in = serializerManager.wrapStream(blockId, bs); this.din = new DataInputStream(this.in); diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index 8d6396bededa9..91858f0912b65 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -23,6 +23,7 @@ import com.google.common.io.ByteStreams import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.internal.Logging +import org.apache.spark.io.NioBufferedFileInputStream import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.shuffle.IndexShuffleBlockResolver.NOOP_REDUCE_ID @@ -89,7 +90,7 @@ private[spark] class IndexShuffleBlockResolver( val lengths = new Array[Long](blocks) // Read the lengths of blocks val in = try { - new DataInputStream(new BufferedInputStream(new FileInputStream(index))) + new DataInputStream(new NioBufferedFileInputStream(index)) } catch { case e: IOException => return null diff --git a/core/src/test/java/org/apache/spark/io/NioBufferedFileInputStreamSuite.java b/core/src/test/java/org/apache/spark/io/NioBufferedFileInputStreamSuite.java new file mode 100644 index 0000000000000..2c1a34a607592 --- /dev/null +++ b/core/src/test/java/org/apache/spark/io/NioBufferedFileInputStreamSuite.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.io; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.RandomUtils; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; + +import static org.junit.Assert.assertEquals; + +/** + * Tests functionality of {@link NioBufferedFileInputStream} + */ +public class NioBufferedFileInputStreamSuite { + + private byte[] randomBytes; + + private File inputFile; + + @Before + public void setUp() throws IOException { + // Create a byte array of size 2 MB with random bytes + randomBytes = RandomUtils.nextBytes(2 * 1024 * 1024); + inputFile = File.createTempFile("temp-file", ".tmp"); + FileUtils.writeByteArrayToFile(inputFile, randomBytes); + } + + @After + public void tearDown() { + inputFile.delete(); + } + + @Test + public void testReadOneByte() throws IOException { + InputStream inputStream = new NioBufferedFileInputStream(inputFile); + for (int i = 0; i < randomBytes.length; i++) { + assertEquals(randomBytes[i], (byte) inputStream.read()); + } + } + + @Test + public void testReadMultipleBytes() throws IOException { + InputStream inputStream = new NioBufferedFileInputStream(inputFile); + byte[] readBytes = new byte[8 * 1024]; + int i = 0; + while (i < randomBytes.length) { + int read = inputStream.read(readBytes, 0, 8 * 1024); + for (int j = 0; j < read; j++) { + assertEquals(randomBytes[i], readBytes[j]); + i++; + } + } + } + + @Test + public void testBytesSkipped() throws IOException { + InputStream inputStream = new NioBufferedFileInputStream(inputFile); + assertEquals(1024, inputStream.skip(1024)); + for (int i = 1024; i < randomBytes.length; i++) { + assertEquals(randomBytes[i], (byte) inputStream.read()); + } + } + + @Test + public void testBytesSkippedAfterRead() throws IOException { + InputStream inputStream = new NioBufferedFileInputStream(inputFile); + for (int i = 0; i < 1024; i++) { + assertEquals(randomBytes[i], (byte) inputStream.read()); + } + assertEquals(1024, inputStream.skip(1024)); + for (int i = 2048; i < randomBytes.length; i++) { + assertEquals(randomBytes[i], (byte) inputStream.read()); + } + } + + @Test + public void testNegativeBytesSkippedAfterRead() throws IOException { + InputStream inputStream = new NioBufferedFileInputStream(inputFile); + for (int i = 0; i < 1024; i++) { + assertEquals(randomBytes[i], (byte) inputStream.read()); + } + // Skipping negative bytes should essential be a no-op + assertEquals(0, inputStream.skip(-1)); + assertEquals(0, inputStream.skip(-1024)); + assertEquals(0, inputStream.skip(Long.MIN_VALUE)); + assertEquals(1024, inputStream.skip(1024)); + for (int i = 2048; i < randomBytes.length; i++) { + assertEquals(randomBytes[i], (byte) inputStream.read()); + } + } + + @Test + public void testSkipFromFileChannel() throws IOException { + InputStream inputStream = new NioBufferedFileInputStream(inputFile, 10); + // Since 
the buffer is smaller than the skipped bytes, this will guarantee + // we skip from underlying file channel. + assertEquals(1024, inputStream.skip(1024)); + for (int i = 1024; i < 2048; i++) { + assertEquals(randomBytes[i], (byte) inputStream.read()); + } + assertEquals(256, inputStream.skip(256)); + assertEquals(256, inputStream.skip(256)); + assertEquals(512, inputStream.skip(512)); + for (int i = 3072; i < randomBytes.length; i++) { + assertEquals(randomBytes[i], (byte) inputStream.read()); + } + } + + @Test + public void testBytesSkippedAfterEOF() throws IOException { + InputStream inputStream = new NioBufferedFileInputStream(inputFile); + assertEquals(randomBytes.length, inputStream.skip(randomBytes.length + 1)); + assertEquals(-1, inputStream.read()); + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/RowQueue.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/RowQueue.scala index 422a3f862d96f..cd1e77f524afd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/RowQueue.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/RowQueue.scala @@ -22,6 +22,7 @@ import java.io._ import com.google.common.io.Closeables import org.apache.spark.SparkException +import org.apache.spark.io.NioBufferedFileInputStream import org.apache.spark.memory.{MemoryConsumer, TaskMemoryManager} import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.unsafe.Platform @@ -130,7 +131,7 @@ private[python] case class DiskRowQueue(file: File, fields: Int) extends RowQueu if (out != null) { out.close() out = null - in = new DataInputStream(new BufferedInputStream(new FileInputStream(file.toString))) + in = new DataInputStream(new NioBufferedFileInputStream(file)) } if (unreadBytes > 0) { From d88a1bae6a9c975c39549ec2326d839ea93949b2 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 17 Oct 2016 11:33:06 -0700 Subject: [PATCH 036/162] [SPARK-17751][SQL] Remove spark.sql.eagerAnalysis and Output the Plan if Existed in AnalysisException ### What changes were proposed in this pull request? Dataset always does eager analysis now. Thus, `spark.sql.eagerAnalysis` is not used any more. Thus, we need to remove it. This PR also outputs the plan. Without the fix, the analysis error is like ``` cannot resolve '`k1`' given input columns: [k, v]; line 1 pos 12 ``` After the fix, the analysis error becomes: ``` org.apache.spark.sql.AnalysisException: cannot resolve '`k1`' given input columns: [k, v]; line 1 pos 12; 'Project [unresolvedalias(CASE WHEN ('k1 = 2) THEN 22 WHEN ('k1 = 4) THEN 44 ELSE 0 END, None), v#6] +- SubqueryAlias t +- Project [_1#2 AS k#5, _2#3 AS v#6] +- LocalRelation [_1#2, _2#3] ``` ### How was this patch tested? N/A Author: gatorsmile Closes #15316 from gatorsmile/eagerAnalysis. 
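As a rough illustration, the improved message can be reproduced with a sketch like the one below, assuming a `SparkSession` named `spark` with its implicits imported (the view mirrors the `[k, v]` example above):

```scala
import spark.implicits._

// Build a view whose columns are [k, v], then reference a non-existent column `k1`;
// the resulting AnalysisException message now ends with the offending logical plan.
Seq((1, "a"), (2, "b")).toDF("k", "v").createOrReplaceTempView("t")
try {
  spark.sql("SELECT CASE WHEN k1 = 2 THEN 22 WHEN k1 = 4 THEN 44 ELSE 0 END, v FROM t").collect()
} catch {
  case e: org.apache.spark.sql.AnalysisException =>
    println(e.getMessage)  // simple message plus the analyzed plan appended by this patch
}
```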
--- .../scala/org/apache/spark/sql/AnalysisException.scala | 7 +++++++ .../org/apache/spark/sql/execution/debug/package.scala | 9 --------- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 10 ---------- .../scala/org/apache/spark/sql/SQLQueryTestSuite.scala | 3 +++ 4 files changed, 10 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala index f3003306acc6d..7defb9df862c0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala @@ -42,6 +42,13 @@ class AnalysisException protected[sql] ( } override def getMessage: String = { + val planAnnotation = plan.map(p => s";\n$p").getOrElse("") + getSimpleMessage + planAnnotation + } + + // Outputs an exception without the logical plan. + // For testing only + def getSimpleMessage: String = { val lineAnnotation = line.map(l => s" line $l").getOrElse("") val positionAnnotation = startPosition.map(p => s" pos $p").getOrElse("") s"$message;$lineAnnotation$positionAnnotation" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index d321f4cd76877..dd9d83767e221 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -69,15 +69,6 @@ package object debug { output } - /** - * Augments [[SparkSession]] with debug methods. - */ - implicit class DebugSQLContext(sparkSession: SparkSession) { - def debug(): Unit = { - sparkSession.conf.set(SQLConf.DATAFRAME_EAGER_ANALYSIS.key, false) - } - } - /** * Augments [[Dataset]]s with debug methods. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index a055e0135c136..8afd39d657865 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -388,14 +388,6 @@ object SQLConf { .intConf .createWithDefault(32) - // Whether to perform eager analysis when constructing a dataframe. - // Set to false when debugging requires the ability to look at invalid query plans. - val DATAFRAME_EAGER_ANALYSIS = SQLConfigBuilder("spark.sql.eagerAnalysis") - .internal() - .doc("When true, eagerly applies query analysis on DataFrame operations.") - .booleanConf - .createWithDefault(true) - // Whether to automatically resolve ambiguity in join conditions for self-joins. // See SPARK-6231. 
val DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY = @@ -748,8 +740,6 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { def bucketingEnabled: Boolean = getConf(SQLConf.BUCKETING_ENABLED) - def dataFrameEagerAnalysis: Boolean = getConf(DATAFRAME_EAGER_ANALYSIS) - def dataFrameSelfJoinAutoResolveAmbiguity: Boolean = getConf(DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 55d5a56f1040a..02841d7bb03ff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -220,6 +220,9 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { if (isSorted(df.queryExecution.analyzed)) (schema, answer) else (schema, answer.sorted) } catch { + case a: AnalysisException if a.plan.nonEmpty => + // Do not output the logical plan tree which contains expression IDs. + (StructType(Seq.empty), Seq(a.getClass.getName, a.getSimpleMessage)) case NonFatal(e) => // If there is an exception, put the exception class followed by the message. (StructType(Seq.empty), Seq(e.getClass.getName, e.getMessage)) From 813ab5e02539d17a66a6740d965b9f847d38c258 Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Mon, 17 Oct 2016 20:46:30 -0700 Subject: [PATCH 037/162] [SPARK-17620][SQL] Determine Serde by hive.default.fileformat when Creating Hive Serde Tables ## What changes were proposed in this pull request? Reopens the closed PR https://github.com/apache/spark/pull/15190 (Please refer to the above link for review comments on the PR) Make sure the hive.default.fileformat is used to when creating the storage format metadata. Output ``` SQL scala> spark.sql("SET hive.default.fileformat=orc") res1: org.apache.spark.sql.DataFrame = [key: string, value: string] scala> spark.sql("CREATE TABLE tmp_default(id INT)") res2: org.apache.spark.sql.DataFrame = [] ``` Before ```SQL scala> spark.sql("DESC FORMATTED tmp_default").collect.foreach(println) .. [# Storage Information,,] [SerDe Library:,org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe,] [InputFormat:,org.apache.hadoop.hive.ql.io.orc.OrcInputFormat,] [OutputFormat:,org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat,] [Compressed:,No,] [Storage Desc Parameters:,,] [ serialization.format,1,] ``` After ```SQL scala> spark.sql("DESC FORMATTED tmp_default").collect.foreach(println) .. [# Storage Information,,] [SerDe Library:,org.apache.hadoop.hive.ql.io.orc.OrcSerde,] [InputFormat:,org.apache.hadoop.hive.ql.io.orc.OrcInputFormat,] [OutputFormat:,org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat,] [Compressed:,No,] [Storage Desc Parameters:,,] [ serialization.format,1,] ``` ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Added new tests to HiveDDLCommandSuite, SQLQuerySuite Author: Dilip Biswal Closes #15495 from dilipbiswal/orc2. 
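The new tests below also pin down how this interacts with CTAS; roughly, the behavior can be sketched as follows (assumes a Hive-enabled `SparkSession` named `spark` and an existing table `src`, and mirrors the CTAS-conversion setting the tests enable via `SQLConf.CONVERT_CTAS`):

```scala
// Plain CREATE TABLE now derives the serde from hive.default.fileformat,
// so the table below should report OrcSerde in DESC FORMATTED.
spark.sql("SET hive.default.fileformat=orc")
spark.sql("CREATE TABLE IF NOT EXISTS tmp_default(id INT)")
spark.sql("DESC FORMATTED tmp_default").collect.foreach(println)

// CTAS, when converted to a data source table, follows spark.sql.sources.default
// (parquet unless overridden) and is not affected by hive.default.fileformat.
spark.sql("SET spark.sql.hive.convertCTAS=true")
spark.sql("SET hive.default.fileformat=textfile")
spark.sql("CREATE TABLE IF NOT EXISTS ctas1 SELECT key k, value FROM src")
spark.sql("DESC FORMATTED ctas1").collect.foreach(println)
```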
--- .../spark/sql/execution/SparkSqlParser.scala | 4 +- .../spark/sql/hive/HiveDDLCommandSuite.scala | 26 ++++++++++++- .../sql/hive/execution/SQLQuerySuite.scala | 39 +++++++++++++++++-- 3 files changed, 60 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index be2eddbb0e423..8c68d1e3a2379 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -1010,9 +1010,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { .orElse(Some("org.apache.hadoop.mapred.TextInputFormat")), outputFormat = defaultHiveSerde.flatMap(_.outputFormat) .orElse(Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")), - // Note: Keep this unspecified because we use the presence of the serde to decide - // whether to convert a table created by CTAS to a datasource table. - serde = None, + serde = defaultHiveSerde.flatMap(_.serde), compressed = false, properties = Map()) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala index 9ce3338647398..81337493c7f28 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala @@ -30,10 +30,12 @@ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{Generate, ScriptTransformation} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.CreateTable -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.StructType -class HiveDDLCommandSuite extends PlanTest { +class HiveDDLCommandSuite extends PlanTest with SQLTestUtils with TestHiveSingleton { val parser = TestHive.sessionState.sqlParser private def extractTableDesc(sql: String): (CatalogTable, Boolean) = { @@ -556,4 +558,24 @@ class HiveDDLCommandSuite extends PlanTest { assert(partition2.get.apply("c") == "1" && partition2.get.apply("d") == "2") } + test("Test the default fileformat for Hive-serde tables") { + withSQLConf("hive.default.fileformat" -> "orc") { + val (desc, exists) = extractTableDesc("CREATE TABLE IF NOT EXISTS fileformat_test (id int)") + assert(exists) + assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")) + assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")) + assert(desc.storage.serde == Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) + } + + withSQLConf("hive.default.fileformat" -> "parquet") { + val (desc, exists) = extractTableDesc("CREATE TABLE IF NOT EXISTS fileformat_test (id int)") + assert(exists) + val input = desc.storage.inputFormat + val output = desc.storage.outputFormat + val serde = desc.storage.serde + assert(input == Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")) + assert(output == Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")) + assert(serde == Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")) + } + } } diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 6f2a16662bf10..e26b6b57ef569 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -492,7 +492,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { def checkRelation( tableName: String, - isDataSourceParquet: Boolean, + isDataSourceTable: Boolean, format: String, userSpecifiedLocation: Option[String] = None): Unit = { val relation = EliminateSubqueryAliases( @@ -501,7 +501,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { sessionState.catalog.getTableMetadata(TableIdentifier(tableName)) relation match { case LogicalRelation(r: HadoopFsRelation, _, _) => - if (!isDataSourceParquet) { + if (!isDataSourceTable) { fail( s"${classOf[MetastoreRelation].getCanonicalName} is expected, but found " + s"${HadoopFsRelation.getClass.getCanonicalName}.") @@ -514,7 +514,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { assert(catalogTable.provider.get === format) case r: MetastoreRelation => - if (isDataSourceParquet) { + if (isDataSourceTable) { fail( s"${HadoopFsRelation.getClass.getCanonicalName} is expected, but found " + s"${classOf[MetastoreRelation].getCanonicalName}.") @@ -524,8 +524,15 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { assert(r.catalogTable.storage.locationUri.get === location) case None => // OK. } - // Also make sure that the format is the desired format. + // Also make sure that the format and serde are as desired. assert(catalogTable.storage.inputFormat.get.toLowerCase.contains(format)) + assert(catalogTable.storage.outputFormat.get.toLowerCase.contains(format)) + val serde = catalogTable.storage.serde.get + format match { + case "sequence" | "text" => assert(serde.contains("LazySimpleSerDe")) + case "rcfile" => assert(serde.contains("LazyBinaryColumnarSerDe")) + case _ => assert(serde.toLowerCase.contains(format)) + } } // When a user-specified location is defined, the table type needs to be EXTERNAL. @@ -587,6 +594,30 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } + test("CTAS with default fileformat") { + val table = "ctas1" + val ctas = s"CREATE TABLE IF NOT EXISTS $table SELECT key k, value FROM src" + withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") { + withSQLConf("hive.default.fileformat" -> "textfile") { + withTable(table) { + sql(ctas) + // We should use parquet here as that is the default datasource fileformat. The default + // datasource file format is controlled by `spark.sql.sources.default` configuration. + // This testcase verifies that setting `hive.default.fileformat` has no impact on + // the target table's fileformat in case of CTAS. 
+ assert(sessionState.conf.defaultDataSourceName === "parquet") + checkRelation(tableName = table, isDataSourceTable = true, format = "parquet") + } + } + withSQLConf("spark.sql.sources.default" -> "orc") { + withTable(table) { + sql(ctas) + checkRelation(tableName = table, isDataSourceTable = true, format = "orc") + } + } + } + } + test("CTAS without serde with location") { withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") { withTempDir { dir => From 8daa1a29b65a9b5337518458e9ece1619e8a01e3 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 17 Oct 2016 21:01:22 -0700 Subject: [PATCH 038/162] [SPARK-17974] Refactor FileCatalog classes to simplify the inheritance tree ## What changes were proposed in this pull request? This renames `BasicFileCatalog => FileCatalog`, combines `SessionFileCatalog` with `PartitioningAwareFileCatalog`, and removes the old `FileCatalog` trait. In summary, ``` MetadataLogFileCatalog extends PartitioningAwareFileCatalog ListingFileCatalog extends PartitioningAwareFileCatalog PartitioningAwareFileCatalog extends FileCatalog TableFileCatalog extends FileCatalog ``` cc cloud-fan mallman ## How was this patch tested? Existing tests Author: Eric Liang Closes #15518 from ericl/refactor-session-file-catalog. --- .../scala/org/apache/spark/sql/Dataset.scala | 2 +- .../sql/execution/DataSourceScanExec.scala | 4 +- .../execution/datasources/FileCatalog.scala | 66 +++++ .../execution/datasources/FileFormat.scala | 61 ----- .../datasources/HadoopFsRelation.scala | 4 +- .../PartitioningAwareFileCatalog.scala | 217 ++++++++++++++++- .../datasources/PartitioningUtils.scala | 12 +- .../datasources/SessionFileCatalog.scala | 225 ------------------ .../datasources/TableFileCatalog.scala | 11 +- .../datasources/FileCatalogSuite.scala | 10 + .../datasources/SessionFileCatalogSuite.scala | 34 --- .../ParquetPartitionDiscoverySuite.scala | 9 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- 13 files changed, 303 insertions(+), 354 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 7dccbbd3f0a5b..073d2b1512b95 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.usePrettyExpression import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution} import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand, GlobalTempView, LocalTempView} -import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.json.JacksonGenerator import org.apache.spark.sql.execution.python.EvaluatePython import org.apache.spark.sql.streaming.{DataStreamWriter, StreamingQuery} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 
623d2be55dcec..fdd1fa3648251 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -431,7 +431,7 @@ case class FileSourceScanExec( private def createBucketedReadRDD( bucketSpec: BucketSpec, readFile: (PartitionedFile) => Iterator[InternalRow], - selectedPartitions: Seq[Partition], + selectedPartitions: Seq[PartitionDirectory], fsRelation: HadoopFsRelation): RDD[InternalRow] = { logInfo(s"Planning with ${bucketSpec.numBuckets} buckets") val bucketed = @@ -463,7 +463,7 @@ case class FileSourceScanExec( */ private def createNonBucketedReadRDD( readFile: (PartitionedFile) => Iterator[InternalRow], - selectedPartitions: Seq[Partition], + selectedPartitions: Seq[PartitionDirectory], fsRelation: HadoopFsRelation): RDD[InternalRow] = { val defaultMaxSplitBytes = fsRelation.sparkSession.sessionState.conf.filesMaxPartitionBytes diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala new file mode 100644 index 0000000000000..2bc66ceeebdb4 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.hadoop.fs._ + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ + +/** + * A collection of data files from a partitioned relation, along with the partition values in the + * form of an [[InternalRow]]. + */ +case class PartitionDirectory(values: InternalRow, files: Seq[FileStatus]) + +/** + * An interface for objects capable of enumerating the root paths of a relation as well as the + * partitions of a relation subject to some pruning expressions. + */ +trait FileCatalog { + + /** + * Returns the list of root input paths from which the catalog will get files. There may be a + * single root path from which partitions are discovered, or individual partitions may be + * specified by each path. + */ + def rootPaths: Seq[Path] + + /** + * Returns all valid files grouped into partitions when the data is partitioned. If the data is + * unpartitioned, this will return a single partition with no partition values. + * + * @param filters The filters used to prune which partitions are returned. These filters must + * only refer to partition columns and this method will only return files + * where these predicates are guaranteed to evaluate to `true`. Thus, these + * filters will not need to be evaluated again on the returned data. 
+ */ + def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] + + /** + * Returns the list of files that will be read when scanning this relation. This call may be + * very expensive for large tables. + */ + def inputFiles: Array[String] + + /** Refresh any cached file listings */ + def refresh(): Unit + + /** Sum of table file sizes, in bytes */ + def sizeInBytes: Long +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala index e7239ef91b326..9d153cec731a8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala @@ -175,64 +175,3 @@ abstract class TextBasedFileFormat extends FileFormat { codec == null || codec.isInstanceOf[SplittableCompressionCodec] } } - -/** - * A collection of data files from a partitioned relation, along with the partition values in the - * form of an [[InternalRow]]. - */ -case class Partition(values: InternalRow, files: Seq[FileStatus]) - -/** - * An interface for objects capable of enumerating the root paths of a relation as well as the - * partitions of a relation subject to some pruning expressions. - */ -trait BasicFileCatalog { - - /** - * Returns the list of root input paths from which the catalog will get files. There may be a - * single root path from which partitions are discovered, or individual partitions may be - * specified by each path. - */ - def rootPaths: Seq[Path] - - /** - * Returns all valid files grouped into partitions when the data is partitioned. If the data is - * unpartitioned, this will return a single partition with no partition values. - * - * @param filters The filters used to prune which partitions are returned. These filters must - * only refer to partition columns and this method will only return files - * where these predicates are guaranteed to evaluate to `true`. Thus, these - * filters will not need to be evaluated again on the returned data. - */ - def listFiles(filters: Seq[Expression]): Seq[Partition] - - /** Returns the list of files that will be read when scanning this relation. */ - def inputFiles: Array[String] - - /** Refresh any cached file listings */ - def refresh(): Unit - - /** Sum of table file sizes, in bytes */ - def sizeInBytes: Long -} - -/** - * A [[BasicFileCatalog]] which can enumerate all of the files comprising a relation and, from - * those, infer the relation's partition specification. - */ -// TODO: Consider a more descriptive, appropriate name which suggests this is a file catalog for -// which it is safe to list all of its files? -trait FileCatalog extends BasicFileCatalog { - - /** Returns the specification of the partitions inferred from the data. */ - def partitionSpec(): PartitionSpec - - /** Returns all the valid files. */ - def allFiles(): Seq[FileStatus] - - /** Returns the list of files that will be read when scanning this relation. 
*/ - override def inputFiles: Array[String] = - allFiles().map(_.getPath.toUri.toString).toArray - - override def sizeInBytes: Long = allFiles().map(_.getLen).sum -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala index db889edf032d6..afad8898089bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.types.StructType * Acts as a container for all of the metadata required to read from a datasource. All discovery, * resolution and merging logic for schemas and partitions has been removed. * - * @param location A [[BasicFileCatalog]] that can enumerate the locations of all the files that + * @param location A [[FileCatalog]] that can enumerate the locations of all the files that * comprise this relation. * @param partitionSchema The schema of the columns (if any) that are used to partition the relation * @param dataSchema The schema of any remaining columns. Note that if any partition columns are @@ -38,7 +38,7 @@ import org.apache.spark.sql.types.StructType * @param options Configuration used when reading / writing data. */ case class HadoopFsRelation( - location: BasicFileCatalog, + location: FileCatalog, partitionSchema: StructType, dataSchema: StructType, bucketSpec: Option[BucketSpec], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala index b2508115c282f..5c8eff7ec46b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala @@ -17,14 +17,21 @@ package org.apache.spark.sql.execution.datasources +import java.io.FileNotFoundException + import scala.collection.mutable -import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs._ +import org.apache.hadoop.mapred.{FileInputFormat, JobConf} +import org.apache.spark.internal.Logging +import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.util.SerializableConfiguration /** @@ -38,22 +45,24 @@ import org.apache.spark.sql.types.{StringType, StructType} abstract class PartitioningAwareFileCatalog( sparkSession: SparkSession, parameters: Map[String, String], - partitionSchema: Option[StructType]) - extends SessionFileCatalog(sparkSession) with FileCatalog { + partitionSchema: Option[StructType]) extends FileCatalog with Logging { import PartitioningAwareFileCatalog.BASE_PATH_PARAM - override protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters) + /** Returns the specification of the partitions inferred from the data. 
*/ + def partitionSpec(): PartitionSpec + + protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters) protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus] protected def leafDirToChildrenFiles: Map[Path, Array[FileStatus]] - override def listFiles(filters: Seq[Expression]): Seq[Partition] = { + override def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] = { val selectedPartitions = if (partitionSpec().partitionColumns.isEmpty) { - Partition(InternalRow.empty, allFiles().filter(f => isDataPath(f.getPath))) :: Nil + PartitionDirectory(InternalRow.empty, allFiles().filter(f => isDataPath(f.getPath))) :: Nil } else { prunePartitions(filters, partitionSpec()).map { - case PartitionDirectory(values, path) => + case PartitionPath(values, path) => val files: Seq[FileStatus] = leafDirToChildrenFiles.get(path) match { case Some(existingDir) => // Directory has children files in it, return them @@ -63,14 +72,20 @@ abstract class PartitioningAwareFileCatalog( // Directory does not exist, or has no children files Nil } - Partition(values, files) + PartitionDirectory(values, files) } } logTrace("Selected files after partition pruning:\n\t" + selectedPartitions.mkString("\n\t")) selectedPartitions } - override def allFiles(): Seq[FileStatus] = { + /** Returns the list of files that will be read when scanning this relation. */ + override def inputFiles: Array[String] = + allFiles().map(_.getPath.toUri.toString).toArray + + override def sizeInBytes: Long = allFiles().map(_.getLen).sum + + def allFiles(): Seq[FileStatus] = { if (partitionSpec().partitionColumns.isEmpty) { // For each of the root input paths, get the list of files inside them rootPaths.flatMap { path => @@ -139,7 +154,7 @@ abstract class PartitioningAwareFileCatalog( private def prunePartitions( predicates: Seq[Expression], - partitionSpec: PartitionSpec): Seq[PartitionDirectory] = { + partitionSpec: PartitionSpec): Seq[PartitionPath] = { val PartitionSpec(partitionColumns, partitions) = partitionSpec val partitionColumnNames = partitionColumns.map(_.name).toSet val partitionPruningPredicates = predicates.filter { @@ -156,7 +171,7 @@ abstract class PartitioningAwareFileCatalog( }) val selected = partitions.filter { - case PartitionDirectory(values, _) => boundPredicate(values) + case PartitionPath(values, _) => boundPredicate(values) } logInfo { val total = partitions.length @@ -214,8 +229,186 @@ abstract class PartitioningAwareFileCatalog( val name = path.getName !((name.startsWith("_") && !name.contains("=")) || name.startsWith(".")) } + + /** + * List leaf files of given paths. This method will submit a Spark job to do parallel + * listing whenever there is a path having more files than the parallel partition discovery + * discovery threshold. + * + * This is publicly visible for testing. + */ + def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + val files = + if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { + PartitioningAwareFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession) + } else { + PartitioningAwareFileCatalog.listLeafFilesInSerial(paths, hadoopConf) + } + + HiveCatalogMetrics.incrementFilesDiscovered(files.size) + mutable.LinkedHashSet(files: _*) + } } -object PartitioningAwareFileCatalog { +object PartitioningAwareFileCatalog extends Logging { val BASE_PATH_PARAM = "basePath" + + /** A serializable variant of HDFS's BlockLocation. 
*/ + private case class SerializableBlockLocation( + names: Array[String], + hosts: Array[String], + offset: Long, + length: Long) + + /** A serializable variant of HDFS's FileStatus. */ + private case class SerializableFileStatus( + path: String, + length: Long, + isDir: Boolean, + blockReplication: Short, + blockSize: Long, + modificationTime: Long, + accessTime: Long, + blockLocations: Array[SerializableBlockLocation]) + + /** + * List a collection of path recursively. + */ + private def listLeafFilesInSerial( + paths: Seq[Path], + hadoopConf: Configuration): Seq[FileStatus] = { + // Dummy jobconf to get to the pathFilter defined in configuration + val jobConf = new JobConf(hadoopConf, this.getClass) + val filter = FileInputFormat.getInputPathFilter(jobConf) + + paths.flatMap { path => + val fs = path.getFileSystem(hadoopConf) + listLeafFiles0(fs, path, filter) + } + } + + /** + * List a collection of path recursively in parallel (using Spark executors). + * Each task launched will use [[listLeafFilesInSerial]] to list. + */ + private def listLeafFilesInParallel( + paths: Seq[Path], + hadoopConf: Configuration, + sparkSession: SparkSession): Seq[FileStatus] = { + assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) + logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") + + val sparkContext = sparkSession.sparkContext + val serializableConfiguration = new SerializableConfiguration(hadoopConf) + val serializedPaths = paths.map(_.toString) + + // Set the number of parallelism to prevent following file listing from generating many tasks + // in case of large #defaultParallelism. + val numParallelism = Math.min(paths.size, 10000) + + val statuses = sparkContext + .parallelize(serializedPaths, numParallelism) + .mapPartitions { paths => + val hadoopConf = serializableConfiguration.value + listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator + }.map { status => + // Turn FileStatus into SerializableFileStatus so we can send it back to the driver + val blockLocations = status match { + case f: LocatedFileStatus => + f.getBlockLocations.map { loc => + SerializableBlockLocation( + loc.getNames, + loc.getHosts, + loc.getOffset, + loc.getLength) + } + + case _ => + Array.empty[SerializableBlockLocation] + } + + SerializableFileStatus( + status.getPath.toString, + status.getLen, + status.isDirectory, + status.getReplication, + status.getBlockSize, + status.getModificationTime, + status.getAccessTime, + blockLocations) + }.collect() + + // Turn SerializableFileStatus back to Status + statuses.map { f => + val blockLocations = f.blockLocations.map { loc => + new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) + } + new LocatedFileStatus( + new FileStatus( + f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)), + blockLocations) + } + } + + /** + * List a single path, provided as a FileStatus, in serial. + */ + private def listLeafFiles0( + fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = { + logTrace(s"Listing $path") + val name = path.getName.toLowerCase + if (shouldFilterOut(name)) { + Seq.empty[FileStatus] + } else { + // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist + // Note that statuses only include FileStatus for the files and dirs directly under path, + // and does not include anything else recursively. 
+ val statuses = try fs.listStatus(path) catch { + case _: FileNotFoundException => + logWarning(s"The directory $path was not found. Was it deleted very recently?") + Array.empty[FileStatus] + } + + val allLeafStatuses = { + val (dirs, files) = statuses.partition(_.isDirectory) + val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter)) + if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats + } + + allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map { + case f: LocatedFileStatus => + f + + // NOTE: + // + // - Although S3/S3A/S3N file system can be quite slow for remote file metadata + // operations, calling `getFileBlockLocations` does no harm here since these file system + // implementations don't actually issue RPC for this method. + // + // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not + // be a big deal since we always use to `listLeafFilesInParallel` when the number of + // paths exceeds threshold. + case f => + // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), + // which is very slow on some file system (RawLocalFileSystem, which is launch a + // subprocess and parse the stdout). + val locations = fs.getFileBlockLocations(f, 0, f.getLen) + val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, + f.getModificationTime, 0, null, null, null, null, f.getPath, locations) + if (f.isSymlink) { + lfs.setSymlink(f.getSymlink) + } + lfs + } + } + } + + /** Checks if we should filter out this path name. */ + def shouldFilterOut(pathName: String): Boolean = { + // We filter everything that starts with _ and ., except _common_metadata and _metadata + // because Parquet needs to find those metadata files from leaf files returned by this method. + // We should refactor this logic to not mix metadata files with data files. + ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && + !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index 504464216e5a4..ac6795b9a2e7b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -33,8 +33,8 @@ import org.apache.spark.sql.types._ // TODO: We should tighten up visibility of the classes here once we clean up Hive coupling. -object PartitionDirectory { - def apply(values: InternalRow, path: String): PartitionDirectory = +object PartitionPath { + def apply(values: InternalRow, path: String): PartitionPath = apply(values, new Path(path)) } @@ -42,14 +42,14 @@ object PartitionDirectory { * Holds a directory in a partitioned collection of files as well as as the partition values * in the form of a Row. Before scanning, the files at `path` need to be enumerated. 
*/ -case class PartitionDirectory(values: InternalRow, path: Path) +case class PartitionPath(values: InternalRow, path: Path) case class PartitionSpec( partitionColumns: StructType, - partitions: Seq[PartitionDirectory]) + partitions: Seq[PartitionPath]) object PartitionSpec { - val emptySpec = PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[PartitionDirectory]) + val emptySpec = PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[PartitionPath]) } object PartitioningUtils { @@ -141,7 +141,7 @@ object PartitioningUtils { // Finally, we create `Partition`s based on paths and resolved partition values. val partitions = resolvedPartitionValues.zip(pathsWithPartitionValues).map { case (PartitionValues(_, literals), (path, _)) => - PartitionDirectory(InternalRow.fromSeq(literals.map(_.value)), path) + PartitionPath(InternalRow.fromSeq(literals.map(_.value)), path) } PartitionSpec(StructType(fields), partitions) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala deleted file mode 100644 index 4807a92c2e6b8..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.datasources - -import java.io.FileNotFoundException - -import scala.collection.mutable - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs._ -import org.apache.hadoop.mapred.{FileInputFormat, JobConf} - -import org.apache.spark.internal.Logging -import org.apache.spark.metrics.source.HiveCatalogMetrics -import org.apache.spark.sql.SparkSession -import org.apache.spark.util.SerializableConfiguration - - -/** - * A base class for [[BasicFileCatalog]]s that need a [[SparkSession]] and the ability to find leaf - * files in a list of HDFS paths. - * - * @param sparkSession a [[SparkSession]] - * @param ignoreFileNotFound (see [[ListingFileCatalog]]) - */ -abstract class SessionFileCatalog(sparkSession: SparkSession) - extends BasicFileCatalog with Logging { - protected val hadoopConf: Configuration - - /** - * List leaf files of given paths. This method will submit a Spark job to do parallel - * listing whenever there is a path having more files than the parallel partition discovery - * discovery threshold. - * - * This is publicly visible for testing. 
- */ - def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { - val files = - if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { - SessionFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession) - } else { - SessionFileCatalog.listLeafFilesInSerial(paths, hadoopConf) - } - - HiveCatalogMetrics.incrementFilesDiscovered(files.size) - mutable.LinkedHashSet(files: _*) - } -} - -object SessionFileCatalog extends Logging { - - /** A serializable variant of HDFS's BlockLocation. */ - private case class SerializableBlockLocation( - names: Array[String], - hosts: Array[String], - offset: Long, - length: Long) - - /** A serializable variant of HDFS's FileStatus. */ - private case class SerializableFileStatus( - path: String, - length: Long, - isDir: Boolean, - blockReplication: Short, - blockSize: Long, - modificationTime: Long, - accessTime: Long, - blockLocations: Array[SerializableBlockLocation]) - - /** - * List a collection of path recursively. - */ - private def listLeafFilesInSerial( - paths: Seq[Path], - hadoopConf: Configuration): Seq[FileStatus] = { - // Dummy jobconf to get to the pathFilter defined in configuration - val jobConf = new JobConf(hadoopConf, this.getClass) - val filter = FileInputFormat.getInputPathFilter(jobConf) - - paths.flatMap { path => - val fs = path.getFileSystem(hadoopConf) - listLeafFiles0(fs, path, filter) - } - } - - /** - * List a collection of path recursively in parallel (using Spark executors). - * Each task launched will use [[listLeafFilesInSerial]] to list. - */ - private def listLeafFilesInParallel( - paths: Seq[Path], - hadoopConf: Configuration, - sparkSession: SparkSession): Seq[FileStatus] = { - assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) - logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") - - val sparkContext = sparkSession.sparkContext - val serializableConfiguration = new SerializableConfiguration(hadoopConf) - val serializedPaths = paths.map(_.toString) - - // Set the number of parallelism to prevent following file listing from generating many tasks - // in case of large #defaultParallelism. - val numParallelism = Math.min(paths.size, 10000) - - val statuses = sparkContext - .parallelize(serializedPaths, numParallelism) - .mapPartitions { paths => - val hadoopConf = serializableConfiguration.value - listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator - }.map { status => - // Turn FileStatus into SerializableFileStatus so we can send it back to the driver - val blockLocations = status match { - case f: LocatedFileStatus => - f.getBlockLocations.map { loc => - SerializableBlockLocation( - loc.getNames, - loc.getHosts, - loc.getOffset, - loc.getLength) - } - - case _ => - Array.empty[SerializableBlockLocation] - } - - SerializableFileStatus( - status.getPath.toString, - status.getLen, - status.isDirectory, - status.getReplication, - status.getBlockSize, - status.getModificationTime, - status.getAccessTime, - blockLocations) - }.collect() - - // Turn SerializableFileStatus back to Status - statuses.map { f => - val blockLocations = f.blockLocations.map { loc => - new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) - } - new LocatedFileStatus( - new FileStatus( - f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)), - blockLocations) - } - } - - /** - * List a single path, provided as a FileStatus, in serial. 
- */ - private def listLeafFiles0( - fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = { - logTrace(s"Listing $path") - val name = path.getName.toLowerCase - if (shouldFilterOut(name)) { - Seq.empty[FileStatus] - } else { - // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist - // Note that statuses only include FileStatus for the files and dirs directly under path, - // and does not include anything else recursively. - val statuses = try fs.listStatus(path) catch { - case _: FileNotFoundException => - logWarning(s"The directory $path was not found. Was it deleted very recently?") - Array.empty[FileStatus] - } - - val allLeafStatuses = { - val (dirs, files) = statuses.partition(_.isDirectory) - val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter)) - if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats - } - - allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map { - case f: LocatedFileStatus => - f - - // NOTE: - // - // - Although S3/S3A/S3N file system can be quite slow for remote file metadata - // operations, calling `getFileBlockLocations` does no harm here since these file system - // implementations don't actually issue RPC for this method. - // - // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not - // be a big deal since we always use to `listLeafFilesInParallel` when the number of - // paths exceeds threshold. - case f => - // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), - // which is very slow on some file system (RawLocalFileSystem, which is launch a - // subprocess and parse the stdout). - val locations = fs.getFileBlockLocations(f, 0, f.getLen) - val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, - f.getModificationTime, 0, null, null, null, null, f.getPath, locations) - if (f.isSymlink) { - lfs.setSymlink(f.getSymlink) - } - lfs - } - } - } - - /** Checks if we should filter out this path name. */ - def shouldFilterOut(pathName: String): Boolean = { - // We filter everything that starts with _ and ., except _common_metadata and _metadata - // because Parquet needs to find those metadata files from leaf files returned by this method. - // We should refactor this logic to not mix metadata files with data files. - ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && - !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala index a5c41b244589b..5648ab480a98a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.types.StructType /** - * A [[BasicFileCatalog]] for a metastore catalog table. + * A [[FileCatalog]] for a metastore catalog table. 
* * @param sparkSession a [[SparkSession]] * @param db the table's database name @@ -38,10 +38,9 @@ class TableFileCatalog( db: String, table: String, partitionSchema: Option[StructType], - override val sizeInBytes: Long) - extends SessionFileCatalog(sparkSession) { + override val sizeInBytes: Long) extends FileCatalog { - override protected val hadoopConf = sparkSession.sessionState.newHadoopConf + protected val hadoopConf = sparkSession.sessionState.newHadoopConf private val externalCatalog = sparkSession.sharedState.externalCatalog @@ -51,7 +50,7 @@ class TableFileCatalog( override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq - override def listFiles(filters: Seq[Expression]): Seq[Partition] = { + override def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] = { filterPartitions(filters).listFiles(Nil) } @@ -79,7 +78,7 @@ class TableFileCatalog( case Some(schema) => val selectedPartitions = externalCatalog.listPartitionsByFilter(db, table, filters) val partitions = selectedPartitions.map { p => - PartitionDirectory(p.toRow(schema), p.storage.locationUri.get) + PartitionPath(p.toRow(schema), p.storage.locationUri.get) } val partitionSpec = PartitionSpec(schema, partitions) new PrunedTableFileCatalog( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala index 2695974b84b00..9c43169cbf898 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala @@ -81,6 +81,16 @@ class FileCatalogSuite extends SharedSQLContext { } } + test("PartitioningAwareFileCatalog - file filtering") { + assert(!PartitioningAwareFileCatalog.shouldFilterOut("abcd")) + assert(PartitioningAwareFileCatalog.shouldFilterOut(".ab")) + assert(PartitioningAwareFileCatalog.shouldFilterOut("_cd")) + assert(!PartitioningAwareFileCatalog.shouldFilterOut("_metadata")) + assert(!PartitioningAwareFileCatalog.shouldFilterOut("_common_metadata")) + assert(PartitioningAwareFileCatalog.shouldFilterOut("_ab_metadata")) + assert(PartitioningAwareFileCatalog.shouldFilterOut("_cd_common_metadata")) + } + test("SPARK-17613 - PartitioningAwareFileCatalog: base path w/o '/' at end") { class MockCatalog( override val rootPaths: Seq[Path]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala deleted file mode 100644 index df509583377ae..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.datasources - -import org.apache.spark.SparkFunSuite - -class SessionFileCatalogSuite extends SparkFunSuite { - - test("file filtering") { - assert(!SessionFileCatalog.shouldFilterOut("abcd")) - assert(SessionFileCatalog.shouldFilterOut(".ab")) - assert(SessionFileCatalog.shouldFilterOut("_cd")) - - assert(!SessionFileCatalog.shouldFilterOut("_metadata")) - assert(!SessionFileCatalog.shouldFilterOut("_common_metadata")) - assert(SessionFileCatalog.shouldFilterOut("_ab_metadata")) - assert(SessionFileCatalog.shouldFilterOut("_cd_common_metadata")) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 43357c97c395a..36d4df0015ffd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -30,7 +30,7 @@ import org.apache.parquet.hadoop.ParquetOutputFormat import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Literal -import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation, PartitionDirectory => Partition, PartitioningUtils, PartitionSpec} +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PartitionPath => Partition, PartitioningAwareFileCatalog, PartitioningUtils, PartitionSpec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -626,10 +626,11 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath) val queryExecution = spark.read.parquet(dir.getCanonicalPath).queryExecution queryExecution.analyzed.collectFirst { - case LogicalRelation(HadoopFsRelation(location: FileCatalog, _, _, _, _, _), _, _) => - assert(location.partitionSpec === PartitionSpec.emptySpec) + case LogicalRelation( + HadoopFsRelation(location: PartitioningAwareFileCatalog, _, _, _, _, _), _, _) => + assert(location.partitionSpec() === PartitionSpec.emptySpec) }.getOrElse { - fail(s"Expecting a ParquetRelation2, but got:\n$queryExecution") + fail(s"Expecting a matching HadoopFsRelation, but got:\n$queryExecution") } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 4a2aaa7d4f6ca..16e1e37b2fb02 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.execution.command.DDLUtils -import org.apache.spark.sql.execution.datasources.{Partition => _, _} +import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetOptions} import 
org.apache.spark.sql.hive.orc.OrcFileFormat import org.apache.spark.sql.types._ From 1c5a7d7f64993540baa5558be80130ee6911ba3c Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Mon, 17 Oct 2016 21:26:28 -0700 Subject: [PATCH 039/162] Revert "[SPARK-17974] Refactor FileCatalog classes to simplify the inheritance tree" This reverts commit 8daa1a29b65a9b5337518458e9ece1619e8a01e3. --- .../scala/org/apache/spark/sql/Dataset.scala | 2 +- .../sql/execution/DataSourceScanExec.scala | 4 +- .../execution/datasources/FileCatalog.scala | 66 ----- .../execution/datasources/FileFormat.scala | 61 +++++ .../datasources/HadoopFsRelation.scala | 4 +- .../PartitioningAwareFileCatalog.scala | 217 +---------------- .../datasources/PartitioningUtils.scala | 12 +- .../datasources/SessionFileCatalog.scala | 225 ++++++++++++++++++ .../datasources/TableFileCatalog.scala | 11 +- .../datasources/FileCatalogSuite.scala | 10 - .../datasources/SessionFileCatalogSuite.scala | 34 +++ .../ParquetPartitionDiscoverySuite.scala | 9 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- 13 files changed, 354 insertions(+), 303 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 073d2b1512b95..7dccbbd3f0a5b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.usePrettyExpression import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution} import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand, GlobalTempView, LocalTempView} -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.json.JacksonGenerator import org.apache.spark.sql.execution.python.EvaluatePython import org.apache.spark.sql.streaming.{DataStreamWriter, StreamingQuery} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index fdd1fa3648251..623d2be55dcec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -431,7 +431,7 @@ case class FileSourceScanExec( private def createBucketedReadRDD( bucketSpec: BucketSpec, readFile: (PartitionedFile) => Iterator[InternalRow], - selectedPartitions: Seq[PartitionDirectory], + selectedPartitions: Seq[Partition], fsRelation: HadoopFsRelation): RDD[InternalRow] = { logInfo(s"Planning with ${bucketSpec.numBuckets} buckets") val bucketed = @@ -463,7 +463,7 @@ case class FileSourceScanExec( */ private def createNonBucketedReadRDD( readFile: (PartitionedFile) => Iterator[InternalRow], - selectedPartitions: Seq[PartitionDirectory], + selectedPartitions: Seq[Partition], fsRelation: HadoopFsRelation): RDD[InternalRow] = { val 
defaultMaxSplitBytes = fsRelation.sparkSession.sessionState.conf.filesMaxPartitionBytes diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala deleted file mode 100644 index 2bc66ceeebdb4..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.datasources - -import org.apache.hadoop.fs._ - -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions._ - -/** - * A collection of data files from a partitioned relation, along with the partition values in the - * form of an [[InternalRow]]. - */ -case class PartitionDirectory(values: InternalRow, files: Seq[FileStatus]) - -/** - * An interface for objects capable of enumerating the root paths of a relation as well as the - * partitions of a relation subject to some pruning expressions. - */ -trait FileCatalog { - - /** - * Returns the list of root input paths from which the catalog will get files. There may be a - * single root path from which partitions are discovered, or individual partitions may be - * specified by each path. - */ - def rootPaths: Seq[Path] - - /** - * Returns all valid files grouped into partitions when the data is partitioned. If the data is - * unpartitioned, this will return a single partition with no partition values. - * - * @param filters The filters used to prune which partitions are returned. These filters must - * only refer to partition columns and this method will only return files - * where these predicates are guaranteed to evaluate to `true`. Thus, these - * filters will not need to be evaluated again on the returned data. - */ - def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] - - /** - * Returns the list of files that will be read when scanning this relation. This call may be - * very expensive for large tables. 
- */ - def inputFiles: Array[String] - - /** Refresh any cached file listings */ - def refresh(): Unit - - /** Sum of table file sizes, in bytes */ - def sizeInBytes: Long -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala index 9d153cec731a8..e7239ef91b326 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala @@ -175,3 +175,64 @@ abstract class TextBasedFileFormat extends FileFormat { codec == null || codec.isInstanceOf[SplittableCompressionCodec] } } + +/** + * A collection of data files from a partitioned relation, along with the partition values in the + * form of an [[InternalRow]]. + */ +case class Partition(values: InternalRow, files: Seq[FileStatus]) + +/** + * An interface for objects capable of enumerating the root paths of a relation as well as the + * partitions of a relation subject to some pruning expressions. + */ +trait BasicFileCatalog { + + /** + * Returns the list of root input paths from which the catalog will get files. There may be a + * single root path from which partitions are discovered, or individual partitions may be + * specified by each path. + */ + def rootPaths: Seq[Path] + + /** + * Returns all valid files grouped into partitions when the data is partitioned. If the data is + * unpartitioned, this will return a single partition with no partition values. + * + * @param filters The filters used to prune which partitions are returned. These filters must + * only refer to partition columns and this method will only return files + * where these predicates are guaranteed to evaluate to `true`. Thus, these + * filters will not need to be evaluated again on the returned data. + */ + def listFiles(filters: Seq[Expression]): Seq[Partition] + + /** Returns the list of files that will be read when scanning this relation. */ + def inputFiles: Array[String] + + /** Refresh any cached file listings */ + def refresh(): Unit + + /** Sum of table file sizes, in bytes */ + def sizeInBytes: Long +} + +/** + * A [[BasicFileCatalog]] which can enumerate all of the files comprising a relation and, from + * those, infer the relation's partition specification. + */ +// TODO: Consider a more descriptive, appropriate name which suggests this is a file catalog for +// which it is safe to list all of its files? +trait FileCatalog extends BasicFileCatalog { + + /** Returns the specification of the partitions inferred from the data. */ + def partitionSpec(): PartitionSpec + + /** Returns all the valid files. */ + def allFiles(): Seq[FileStatus] + + /** Returns the list of files that will be read when scanning this relation. 
*/ + override def inputFiles: Array[String] = + allFiles().map(_.getPath.toUri.toString).toArray + + override def sizeInBytes: Long = allFiles().map(_.getLen).sum +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala index afad8898089bd..db889edf032d6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.types.StructType * Acts as a container for all of the metadata required to read from a datasource. All discovery, * resolution and merging logic for schemas and partitions has been removed. * - * @param location A [[FileCatalog]] that can enumerate the locations of all the files that + * @param location A [[BasicFileCatalog]] that can enumerate the locations of all the files that * comprise this relation. * @param partitionSchema The schema of the columns (if any) that are used to partition the relation * @param dataSchema The schema of any remaining columns. Note that if any partition columns are @@ -38,7 +38,7 @@ import org.apache.spark.sql.types.StructType * @param options Configuration used when reading / writing data. */ case class HadoopFsRelation( - location: FileCatalog, + location: BasicFileCatalog, partitionSchema: StructType, dataSchema: StructType, bucketSpec: Option[BucketSpec], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala index 5c8eff7ec46b4..b2508115c282f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala @@ -17,21 +17,14 @@ package org.apache.spark.sql.execution.datasources -import java.io.FileNotFoundException - import scala.collection.mutable -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs._ -import org.apache.hadoop.mapred.{FileInputFormat, JobConf} +import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.spark.internal.Logging -import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.{StringType, StructType} -import org.apache.spark.util.SerializableConfiguration /** @@ -45,24 +38,22 @@ import org.apache.spark.util.SerializableConfiguration abstract class PartitioningAwareFileCatalog( sparkSession: SparkSession, parameters: Map[String, String], - partitionSchema: Option[StructType]) extends FileCatalog with Logging { + partitionSchema: Option[StructType]) + extends SessionFileCatalog(sparkSession) with FileCatalog { import PartitioningAwareFileCatalog.BASE_PATH_PARAM - /** Returns the specification of the partitions inferred from the data. 
*/ - def partitionSpec(): PartitionSpec - - protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters) + override protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters) protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus] protected def leafDirToChildrenFiles: Map[Path, Array[FileStatus]] - override def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] = { + override def listFiles(filters: Seq[Expression]): Seq[Partition] = { val selectedPartitions = if (partitionSpec().partitionColumns.isEmpty) { - PartitionDirectory(InternalRow.empty, allFiles().filter(f => isDataPath(f.getPath))) :: Nil + Partition(InternalRow.empty, allFiles().filter(f => isDataPath(f.getPath))) :: Nil } else { prunePartitions(filters, partitionSpec()).map { - case PartitionPath(values, path) => + case PartitionDirectory(values, path) => val files: Seq[FileStatus] = leafDirToChildrenFiles.get(path) match { case Some(existingDir) => // Directory has children files in it, return them @@ -72,20 +63,14 @@ abstract class PartitioningAwareFileCatalog( // Directory does not exist, or has no children files Nil } - PartitionDirectory(values, files) + Partition(values, files) } } logTrace("Selected files after partition pruning:\n\t" + selectedPartitions.mkString("\n\t")) selectedPartitions } - /** Returns the list of files that will be read when scanning this relation. */ - override def inputFiles: Array[String] = - allFiles().map(_.getPath.toUri.toString).toArray - - override def sizeInBytes: Long = allFiles().map(_.getLen).sum - - def allFiles(): Seq[FileStatus] = { + override def allFiles(): Seq[FileStatus] = { if (partitionSpec().partitionColumns.isEmpty) { // For each of the root input paths, get the list of files inside them rootPaths.flatMap { path => @@ -154,7 +139,7 @@ abstract class PartitioningAwareFileCatalog( private def prunePartitions( predicates: Seq[Expression], - partitionSpec: PartitionSpec): Seq[PartitionPath] = { + partitionSpec: PartitionSpec): Seq[PartitionDirectory] = { val PartitionSpec(partitionColumns, partitions) = partitionSpec val partitionColumnNames = partitionColumns.map(_.name).toSet val partitionPruningPredicates = predicates.filter { @@ -171,7 +156,7 @@ abstract class PartitioningAwareFileCatalog( }) val selected = partitions.filter { - case PartitionPath(values, _) => boundPredicate(values) + case PartitionDirectory(values, _) => boundPredicate(values) } logInfo { val total = partitions.length @@ -229,186 +214,8 @@ abstract class PartitioningAwareFileCatalog( val name = path.getName !((name.startsWith("_") && !name.contains("=")) || name.startsWith(".")) } - - /** - * List leaf files of given paths. This method will submit a Spark job to do parallel - * listing whenever there is a path having more files than the parallel partition discovery - * discovery threshold. - * - * This is publicly visible for testing. 
- */ - def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { - val files = - if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { - PartitioningAwareFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession) - } else { - PartitioningAwareFileCatalog.listLeafFilesInSerial(paths, hadoopConf) - } - - HiveCatalogMetrics.incrementFilesDiscovered(files.size) - mutable.LinkedHashSet(files: _*) - } } -object PartitioningAwareFileCatalog extends Logging { +object PartitioningAwareFileCatalog { val BASE_PATH_PARAM = "basePath" - - /** A serializable variant of HDFS's BlockLocation. */ - private case class SerializableBlockLocation( - names: Array[String], - hosts: Array[String], - offset: Long, - length: Long) - - /** A serializable variant of HDFS's FileStatus. */ - private case class SerializableFileStatus( - path: String, - length: Long, - isDir: Boolean, - blockReplication: Short, - blockSize: Long, - modificationTime: Long, - accessTime: Long, - blockLocations: Array[SerializableBlockLocation]) - - /** - * List a collection of path recursively. - */ - private def listLeafFilesInSerial( - paths: Seq[Path], - hadoopConf: Configuration): Seq[FileStatus] = { - // Dummy jobconf to get to the pathFilter defined in configuration - val jobConf = new JobConf(hadoopConf, this.getClass) - val filter = FileInputFormat.getInputPathFilter(jobConf) - - paths.flatMap { path => - val fs = path.getFileSystem(hadoopConf) - listLeafFiles0(fs, path, filter) - } - } - - /** - * List a collection of path recursively in parallel (using Spark executors). - * Each task launched will use [[listLeafFilesInSerial]] to list. - */ - private def listLeafFilesInParallel( - paths: Seq[Path], - hadoopConf: Configuration, - sparkSession: SparkSession): Seq[FileStatus] = { - assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) - logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") - - val sparkContext = sparkSession.sparkContext - val serializableConfiguration = new SerializableConfiguration(hadoopConf) - val serializedPaths = paths.map(_.toString) - - // Set the number of parallelism to prevent following file listing from generating many tasks - // in case of large #defaultParallelism. 
- val numParallelism = Math.min(paths.size, 10000) - - val statuses = sparkContext - .parallelize(serializedPaths, numParallelism) - .mapPartitions { paths => - val hadoopConf = serializableConfiguration.value - listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator - }.map { status => - // Turn FileStatus into SerializableFileStatus so we can send it back to the driver - val blockLocations = status match { - case f: LocatedFileStatus => - f.getBlockLocations.map { loc => - SerializableBlockLocation( - loc.getNames, - loc.getHosts, - loc.getOffset, - loc.getLength) - } - - case _ => - Array.empty[SerializableBlockLocation] - } - - SerializableFileStatus( - status.getPath.toString, - status.getLen, - status.isDirectory, - status.getReplication, - status.getBlockSize, - status.getModificationTime, - status.getAccessTime, - blockLocations) - }.collect() - - // Turn SerializableFileStatus back to Status - statuses.map { f => - val blockLocations = f.blockLocations.map { loc => - new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) - } - new LocatedFileStatus( - new FileStatus( - f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)), - blockLocations) - } - } - - /** - * List a single path, provided as a FileStatus, in serial. - */ - private def listLeafFiles0( - fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = { - logTrace(s"Listing $path") - val name = path.getName.toLowerCase - if (shouldFilterOut(name)) { - Seq.empty[FileStatus] - } else { - // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist - // Note that statuses only include FileStatus for the files and dirs directly under path, - // and does not include anything else recursively. - val statuses = try fs.listStatus(path) catch { - case _: FileNotFoundException => - logWarning(s"The directory $path was not found. Was it deleted very recently?") - Array.empty[FileStatus] - } - - val allLeafStatuses = { - val (dirs, files) = statuses.partition(_.isDirectory) - val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter)) - if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats - } - - allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map { - case f: LocatedFileStatus => - f - - // NOTE: - // - // - Although S3/S3A/S3N file system can be quite slow for remote file metadata - // operations, calling `getFileBlockLocations` does no harm here since these file system - // implementations don't actually issue RPC for this method. - // - // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not - // be a big deal since we always use to `listLeafFilesInParallel` when the number of - // paths exceeds threshold. - case f => - // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), - // which is very slow on some file system (RawLocalFileSystem, which is launch a - // subprocess and parse the stdout). - val locations = fs.getFileBlockLocations(f, 0, f.getLen) - val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, - f.getModificationTime, 0, null, null, null, null, f.getPath, locations) - if (f.isSymlink) { - lfs.setSymlink(f.getSymlink) - } - lfs - } - } - } - - /** Checks if we should filter out this path name. 
*/ - def shouldFilterOut(pathName: String): Boolean = { - // We filter everything that starts with _ and ., except _common_metadata and _metadata - // because Parquet needs to find those metadata files from leaf files returned by this method. - // We should refactor this logic to not mix metadata files with data files. - ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && - !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index ac6795b9a2e7b..504464216e5a4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -33,8 +33,8 @@ import org.apache.spark.sql.types._ // TODO: We should tighten up visibility of the classes here once we clean up Hive coupling. -object PartitionPath { - def apply(values: InternalRow, path: String): PartitionPath = +object PartitionDirectory { + def apply(values: InternalRow, path: String): PartitionDirectory = apply(values, new Path(path)) } @@ -42,14 +42,14 @@ object PartitionPath { * Holds a directory in a partitioned collection of files as well as as the partition values * in the form of a Row. Before scanning, the files at `path` need to be enumerated. */ -case class PartitionPath(values: InternalRow, path: Path) +case class PartitionDirectory(values: InternalRow, path: Path) case class PartitionSpec( partitionColumns: StructType, - partitions: Seq[PartitionPath]) + partitions: Seq[PartitionDirectory]) object PartitionSpec { - val emptySpec = PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[PartitionPath]) + val emptySpec = PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[PartitionDirectory]) } object PartitioningUtils { @@ -141,7 +141,7 @@ object PartitioningUtils { // Finally, we create `Partition`s based on paths and resolved partition values. val partitions = resolvedPartitionValues.zip(pathsWithPartitionValues).map { case (PartitionValues(_, literals), (path, _)) => - PartitionPath(InternalRow.fromSeq(literals.map(_.value)), path) + PartitionDirectory(InternalRow.fromSeq(literals.map(_.value)), path) } PartitionSpec(StructType(fields), partitions) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala new file mode 100644 index 0000000000000..4807a92c2e6b8 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala @@ -0,0 +1,225 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.io.FileNotFoundException + +import scala.collection.mutable + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs._ +import org.apache.hadoop.mapred.{FileInputFormat, JobConf} + +import org.apache.spark.internal.Logging +import org.apache.spark.metrics.source.HiveCatalogMetrics +import org.apache.spark.sql.SparkSession +import org.apache.spark.util.SerializableConfiguration + + +/** + * A base class for [[BasicFileCatalog]]s that need a [[SparkSession]] and the ability to find leaf + * files in a list of HDFS paths. + * + * @param sparkSession a [[SparkSession]] + * @param ignoreFileNotFound (see [[ListingFileCatalog]]) + */ +abstract class SessionFileCatalog(sparkSession: SparkSession) + extends BasicFileCatalog with Logging { + protected val hadoopConf: Configuration + + /** + * List leaf files of given paths. This method will submit a Spark job to do parallel + * listing whenever there is a path having more files than the parallel partition discovery + * discovery threshold. + * + * This is publicly visible for testing. + */ + def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + val files = + if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { + SessionFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession) + } else { + SessionFileCatalog.listLeafFilesInSerial(paths, hadoopConf) + } + + HiveCatalogMetrics.incrementFilesDiscovered(files.size) + mutable.LinkedHashSet(files: _*) + } +} + +object SessionFileCatalog extends Logging { + + /** A serializable variant of HDFS's BlockLocation. */ + private case class SerializableBlockLocation( + names: Array[String], + hosts: Array[String], + offset: Long, + length: Long) + + /** A serializable variant of HDFS's FileStatus. */ + private case class SerializableFileStatus( + path: String, + length: Long, + isDir: Boolean, + blockReplication: Short, + blockSize: Long, + modificationTime: Long, + accessTime: Long, + blockLocations: Array[SerializableBlockLocation]) + + /** + * List a collection of path recursively. + */ + private def listLeafFilesInSerial( + paths: Seq[Path], + hadoopConf: Configuration): Seq[FileStatus] = { + // Dummy jobconf to get to the pathFilter defined in configuration + val jobConf = new JobConf(hadoopConf, this.getClass) + val filter = FileInputFormat.getInputPathFilter(jobConf) + + paths.flatMap { path => + val fs = path.getFileSystem(hadoopConf) + listLeafFiles0(fs, path, filter) + } + } + + /** + * List a collection of path recursively in parallel (using Spark executors). + * Each task launched will use [[listLeafFilesInSerial]] to list. + */ + private def listLeafFilesInParallel( + paths: Seq[Path], + hadoopConf: Configuration, + sparkSession: SparkSession): Seq[FileStatus] = { + assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) + logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") + + val sparkContext = sparkSession.sparkContext + val serializableConfiguration = new SerializableConfiguration(hadoopConf) + val serializedPaths = paths.map(_.toString) + + // Set the number of parallelism to prevent following file listing from generating many tasks + // in case of large #defaultParallelism. 
+ val numParallelism = Math.min(paths.size, 10000) + + val statuses = sparkContext + .parallelize(serializedPaths, numParallelism) + .mapPartitions { paths => + val hadoopConf = serializableConfiguration.value + listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator + }.map { status => + // Turn FileStatus into SerializableFileStatus so we can send it back to the driver + val blockLocations = status match { + case f: LocatedFileStatus => + f.getBlockLocations.map { loc => + SerializableBlockLocation( + loc.getNames, + loc.getHosts, + loc.getOffset, + loc.getLength) + } + + case _ => + Array.empty[SerializableBlockLocation] + } + + SerializableFileStatus( + status.getPath.toString, + status.getLen, + status.isDirectory, + status.getReplication, + status.getBlockSize, + status.getModificationTime, + status.getAccessTime, + blockLocations) + }.collect() + + // Turn SerializableFileStatus back to Status + statuses.map { f => + val blockLocations = f.blockLocations.map { loc => + new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) + } + new LocatedFileStatus( + new FileStatus( + f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)), + blockLocations) + } + } + + /** + * List a single path, provided as a FileStatus, in serial. + */ + private def listLeafFiles0( + fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = { + logTrace(s"Listing $path") + val name = path.getName.toLowerCase + if (shouldFilterOut(name)) { + Seq.empty[FileStatus] + } else { + // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist + // Note that statuses only include FileStatus for the files and dirs directly under path, + // and does not include anything else recursively. + val statuses = try fs.listStatus(path) catch { + case _: FileNotFoundException => + logWarning(s"The directory $path was not found. Was it deleted very recently?") + Array.empty[FileStatus] + } + + val allLeafStatuses = { + val (dirs, files) = statuses.partition(_.isDirectory) + val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter)) + if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats + } + + allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map { + case f: LocatedFileStatus => + f + + // NOTE: + // + // - Although S3/S3A/S3N file system can be quite slow for remote file metadata + // operations, calling `getFileBlockLocations` does no harm here since these file system + // implementations don't actually issue RPC for this method. + // + // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not + // be a big deal since we always use to `listLeafFilesInParallel` when the number of + // paths exceeds threshold. + case f => + // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), + // which is very slow on some file system (RawLocalFileSystem, which is launch a + // subprocess and parse the stdout). + val locations = fs.getFileBlockLocations(f, 0, f.getLen) + val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, + f.getModificationTime, 0, null, null, null, null, f.getPath, locations) + if (f.isSymlink) { + lfs.setSymlink(f.getSymlink) + } + lfs + } + } + } + + /** Checks if we should filter out this path name. 
*/ + def shouldFilterOut(pathName: String): Boolean = { + // We filter everything that starts with _ and ., except _common_metadata and _metadata + // because Parquet needs to find those metadata files from leaf files returned by this method. + // We should refactor this logic to not mix metadata files with data files. + ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && + !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala index 5648ab480a98a..a5c41b244589b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.types.StructType /** - * A [[FileCatalog]] for a metastore catalog table. + * A [[BasicFileCatalog]] for a metastore catalog table. * * @param sparkSession a [[SparkSession]] * @param db the table's database name @@ -38,9 +38,10 @@ class TableFileCatalog( db: String, table: String, partitionSchema: Option[StructType], - override val sizeInBytes: Long) extends FileCatalog { + override val sizeInBytes: Long) + extends SessionFileCatalog(sparkSession) { - protected val hadoopConf = sparkSession.sessionState.newHadoopConf + override protected val hadoopConf = sparkSession.sessionState.newHadoopConf private val externalCatalog = sparkSession.sharedState.externalCatalog @@ -50,7 +51,7 @@ class TableFileCatalog( override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq - override def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] = { + override def listFiles(filters: Seq[Expression]): Seq[Partition] = { filterPartitions(filters).listFiles(Nil) } @@ -78,7 +79,7 @@ class TableFileCatalog( case Some(schema) => val selectedPartitions = externalCatalog.listPartitionsByFilter(db, table, filters) val partitions = selectedPartitions.map { p => - PartitionPath(p.toRow(schema), p.storage.locationUri.get) + PartitionDirectory(p.toRow(schema), p.storage.locationUri.get) } val partitionSpec = PartitionSpec(schema, partitions) new PrunedTableFileCatalog( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala index 9c43169cbf898..2695974b84b00 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala @@ -81,16 +81,6 @@ class FileCatalogSuite extends SharedSQLContext { } } - test("PartitioningAwareFileCatalog - file filtering") { - assert(!PartitioningAwareFileCatalog.shouldFilterOut("abcd")) - assert(PartitioningAwareFileCatalog.shouldFilterOut(".ab")) - assert(PartitioningAwareFileCatalog.shouldFilterOut("_cd")) - assert(!PartitioningAwareFileCatalog.shouldFilterOut("_metadata")) - assert(!PartitioningAwareFileCatalog.shouldFilterOut("_common_metadata")) - assert(PartitioningAwareFileCatalog.shouldFilterOut("_ab_metadata")) - assert(PartitioningAwareFileCatalog.shouldFilterOut("_cd_common_metadata")) - } - test("SPARK-17613 - PartitioningAwareFileCatalog: base path w/o '/' at end") { class MockCatalog( override val rootPaths: Seq[Path]) diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala new file mode 100644 index 0000000000000..df509583377ae --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.SparkFunSuite + +class SessionFileCatalogSuite extends SparkFunSuite { + + test("file filtering") { + assert(!SessionFileCatalog.shouldFilterOut("abcd")) + assert(SessionFileCatalog.shouldFilterOut(".ab")) + assert(SessionFileCatalog.shouldFilterOut("_cd")) + + assert(!SessionFileCatalog.shouldFilterOut("_metadata")) + assert(!SessionFileCatalog.shouldFilterOut("_common_metadata")) + assert(SessionFileCatalog.shouldFilterOut("_ab_metadata")) + assert(SessionFileCatalog.shouldFilterOut("_cd_common_metadata")) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 36d4df0015ffd..43357c97c395a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -30,7 +30,7 @@ import org.apache.parquet.hadoop.ParquetOutputFormat import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Literal -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PartitionPath => Partition, PartitioningAwareFileCatalog, PartitioningUtils, PartitionSpec} +import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation, PartitionDirectory => Partition, PartitioningUtils, PartitionSpec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -626,11 +626,10 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath) val queryExecution = spark.read.parquet(dir.getCanonicalPath).queryExecution queryExecution.analyzed.collectFirst { - case LogicalRelation( - HadoopFsRelation(location: PartitioningAwareFileCatalog, _, _, _, _, _), _, _) => - assert(location.partitionSpec() === PartitionSpec.emptySpec) + case LogicalRelation(HadoopFsRelation(location: FileCatalog, _, _, 
_, _, _), _, _) => + assert(location.partitionSpec === PartitionSpec.emptySpec) }.getOrElse { - fail(s"Expecting a matching HadoopFsRelation, but got:\n$queryExecution") + fail(s"Expecting a ParquetRelation2, but got:\n$queryExecution") } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 16e1e37b2fb02..4a2aaa7d4f6ca 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.execution.command.DDLUtils -import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.{Partition => _, _} import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetOptions} import org.apache.spark.sql.hive.orc.OrcFileFormat import org.apache.spark.sql.types._ From 7d878cf2da04800bc4147b05610170865b148c64 Mon Sep 17 00:00:00 2001 From: Liwei Lin Date: Tue, 18 Oct 2016 00:49:57 -0700 Subject: [PATCH 040/162] [SQL][STREAMING][TEST] Fix flaky tests in StreamingQueryListenerSuite This work has largely been done by lw-lin in his PR #15497. This is a slight refactoring of it. ## What changes were proposed in this pull request? There were two sources of flakiness in StreamingQueryListener test. - When testing with manual clock, consecutive attempts to advance the clock can occur without the stream execution thread being unblocked and doing some work between the two attempts. Hence the following can happen with the current ManualClock. ``` +-----------------------------------+--------------------------------+ | StreamExecution thread | testing thread | +-----------------------------------+--------------------------------+ | ManualClock.waitTillTime(100) { | | | _isWaiting = true | | | wait(10) | | | still in wait(10) | if (_isWaiting) advance(100) | | still in wait(10) | if (_isWaiting) advance(200) | <- this should be disallowed ! | still in wait(10) | if (_isWaiting) advance(300) | <- this should be disallowed ! | wake up from wait(10) | | | current time is 600 | | | _isWaiting = false | | | } | | +-----------------------------------+--------------------------------+ ``` - Second source of flakiness is that the adding data to memory stream may get processing in any trigger, not just the first trigger. My fix is to make the manual clock wait for the other stream execution thread to start waiting for the clock at the right wait start time. That is, `advance(200)` (see above) will wait for stream execution thread to complete the wait that started at time 0, and start a new wait at time 200 (i.e. time stamp after the previous `advance(100)`). In addition, since this is a feature that is solely used by StreamExecution, I removed all the non-generic code from ManualClock and put them in StreamManualClock inside StreamTest. ## How was this patch tested? Ran existing unit test MANY TIME in Jenkins Author: Tathagata Das Author: Liwei Lin Closes #15519 from tdas/metrics-flaky-test-fix. 
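
To make the coordination described above concrete, here is a simplified, standalone sketch of the handshake between the testing thread and the stream-execution thread. The class and helper names (`SimpleStreamClock`, `advanceWhenWaiting`) are illustrative stand-ins, not Spark's actual API; the real change lives in `StreamTest.StreamManualClock` in the diff below.

```scala
// Toy clock: a waiter records the time at which its wait began, so the test thread can
// tell "blocked at time X" apart from "still finishing the previous wait".
class SimpleStreamClock(private var time: Long = 0L) {
  private var waitStartTime: Option[Long] = None

  def getTimeMillis(): Long = synchronized { time }

  def advance(timeToAdd: Long): Unit = synchronized {
    time += timeToAdd
    notifyAll()
  }

  // Called by the stream-execution thread; blocks until the clock reaches targetTime.
  def waitTillTime(targetTime: Long): Long = synchronized {
    waitStartTime = Some(time)
    try {
      while (time < targetTime) wait(10)
      time
    } finally {
      waitStartTime = None
    }
  }

  // Called by the testing thread: true only if a waiter began waiting exactly at `t`.
  def isWaitingAt(t: Long): Boolean = synchronized { waitStartTime == Some(t) }
}

// Testing-thread side: never advance until the stream thread is blocked at the expected
// time, so two consecutive advance() calls cannot be swallowed by a single wait(10).
def advanceWhenWaiting(clock: SimpleStreamClock, expectedTime: Long, delta: Long): Long = {
  while (!clock.isWaitingAt(expectedTime)) Thread.sleep(5) // stand-in for eventually { ... }
  clock.advance(delta)
  expectedTime + delta // the wait-start time the next advance should wait for
}
```
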
--- .../org/apache/spark/util/ManualClock.scala | 18 ++------- .../spark/sql/streaming/StreamSuite.scala | 4 +- .../spark/sql/streaming/StreamTest.scala | 38 ++++++++++++++++--- .../StreamingQueryListenerSuite.scala | 8 ++-- 4 files changed, 41 insertions(+), 27 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/ManualClock.scala b/core/src/main/scala/org/apache/spark/util/ManualClock.scala index 91a95871014f0..e7a65d74a440e 100644 --- a/core/src/main/scala/org/apache/spark/util/ManualClock.scala +++ b/core/src/main/scala/org/apache/spark/util/ManualClock.scala @@ -26,8 +26,6 @@ package org.apache.spark.util */ private[spark] class ManualClock(private var time: Long) extends Clock { - private var _isWaiting = false - /** * @return `ManualClock` with initial time 0 */ @@ -59,19 +57,9 @@ private[spark] class ManualClock(private var time: Long) extends Clock { * @return current time reported by the clock when waiting finishes */ def waitTillTime(targetTime: Long): Long = synchronized { - _isWaiting = true - try { - while (time < targetTime) { - wait(10) - } - getTimeMillis() - } finally { - _isWaiting = false + while (time < targetTime) { + wait(10) } + getTimeMillis() } - - /** - * Returns whether there is any thread being blocked in `waitTillTime`. - */ - def isWaiting: Boolean = synchronized { _isWaiting } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index cdbad901dba8e..6bdf47901ae68 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -161,7 +161,7 @@ class StreamSuite extends StreamTest { val inputData = MemoryStream[Int] testStream(inputData.toDS())( - StartStream(ProcessingTime("10 seconds"), new ManualClock), + StartStream(ProcessingTime("10 seconds"), new StreamManualClock), /* -- batch 0 ----------------------- */ // Add some data in batch 0 @@ -199,7 +199,7 @@ class StreamSuite extends StreamTest { /* Stop then restart the Stream */ StopStream, - StartStream(ProcessingTime("10 seconds"), new ManualClock), + StartStream(ProcessingTime("10 seconds"), new StreamManualClock(60 * 1000)), /* -- batch 1 rerun ----------------- */ // this batch 1 would re-run because the latest batch id logged in offset log is 1 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 3b9d3786349ad..254f823bf54f7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -204,6 +204,21 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { case class AssertOnLastQueryStatus(condition: StreamingQueryStatus => Unit) extends StreamAction + class StreamManualClock(time: Long = 0L) extends ManualClock(time) { + private var waitStartTime: Option[Long] = None + + override def waitTillTime(targetTime: Long): Long = synchronized { + try { + waitStartTime = Some(getTimeMillis()) + super.waitTillTime(targetTime) + } finally { + waitStartTime = None + } + } + + def isStreamWaitingAt(time: Long): Boolean = synchronized { waitStartTime.contains(time) } + } + /** * Executes the specified actions on the given streaming DataFrame and provides helpful @@ -307,7 +322,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { val 
testThread = Thread.currentThread() val metadataRoot = Utils.createTempDir(namePrefix = "streaming.metadata").getCanonicalPath val statusCollector = new QueryStatusCollector - + var manualClockExpectedTime = -1L try { spark.streams.addListener(statusCollector) startedTest.foreach { action => @@ -315,6 +330,12 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { action match { case StartStream(trigger, triggerClock) => verify(currentStream == null, "stream already running") + verify(triggerClock.isInstanceOf[SystemClock] + || triggerClock.isInstanceOf[StreamManualClock], + "Use either SystemClock or StreamManualClock to start the stream") + if (triggerClock.isInstanceOf[StreamManualClock]) { + manualClockExpectedTime = triggerClock.asInstanceOf[StreamManualClock].getTimeMillis() + } lastStream = currentStream currentStream = spark @@ -338,14 +359,19 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { case AdvanceManualClock(timeToAdd) => verify(currentStream != null, "can not advance manual clock when a stream is not running") - verify(currentStream.triggerClock.isInstanceOf[ManualClock], + verify(currentStream.triggerClock.isInstanceOf[StreamManualClock], s"can not advance clock of type ${currentStream.triggerClock.getClass}") - val clock = currentStream.triggerClock.asInstanceOf[ManualClock] + val clock = currentStream.triggerClock.asInstanceOf[StreamManualClock] + assert(manualClockExpectedTime >= 0) // Make sure we don't advance ManualClock too early. See SPARK-16002. - eventually("ManualClock has not yet entered the waiting state") { - assert(clock.isWaiting) + eventually("StreamManualClock has not yet entered the waiting state") { + assert(clock.isStreamWaitingAt(manualClockExpectedTime)) } - currentStream.triggerClock.asInstanceOf[ManualClock].advance(timeToAdd) + clock.advance(timeToAdd) + manualClockExpectedTime += timeToAdd + verify(clock.getTimeMillis() === manualClockExpectedTime, + s"Unexpected clock time after updating: " + + s"expecting $manualClockExpectedTime, current ${clock.getTimeMillis()}") case StopStream => verify(currentStream != null, "can not stop a stream that is not running") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index 9e0eefbc58aa5..623f66a778eac 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -43,9 +43,9 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { // Make sure we don't leak any events to the next test } - ignore("single listener, check trigger statuses") { + test("single listener, check trigger statuses") { import StreamingQueryListenerSuite._ - clock = new ManualClock() + clock = new StreamManualClock /** Custom MemoryStream that waits for manual clock to reach a time */ val inputData = new MemoryStream[Int](0, sqlContext) { @@ -81,7 +81,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { AssertOnLastQueryStatus { status: StreamingQueryStatus => // Check the correctness of the trigger info of the last completed batch reported by // onQueryProgress - assert(status.triggerDetails.get("triggerId") == "0") + assert(status.triggerDetails.containsKey("triggerId")) assert(status.triggerDetails.get("isTriggerActive") === "false") 
assert(status.triggerDetails.get("isDataPresentInTrigger") === "true") @@ -101,7 +101,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { assert(status.triggerDetails.get("numRows.state.aggregation1.updated") === "1") assert(status.sourceStatuses.length === 1) - assert(status.sourceStatuses(0).triggerDetails.get("triggerId") === "0") + assert(status.sourceStatuses(0).triggerDetails.containsKey("triggerId")) assert(status.sourceStatuses(0).triggerDetails.get("latency.getOffset.source") === "100") assert(status.sourceStatuses(0).triggerDetails.get("latency.getBatch.source") === "200") assert(status.sourceStatuses(0).triggerDetails.get("numRows.input.source") === "2") From a9e79a41ee19258e5eb8da74bef4b8af9a2ccb95 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 18 Oct 2016 02:29:55 -0700 Subject: [PATCH 041/162] [SQL][STREAMING][TEST] Follow up to remove Option.contains for Scala 2.10 compatibility ## What changes were proposed in this pull request? Scala 2.10 does not have Option.contains, which broke Scala 2.10 build. ## How was this patch tested? Locally compiled and ran sql/core unit tests in 2.10 Author: Tathagata Das Closes #15531 from tdas/metrics-flaky-test-fix-1. --- .../scala/org/apache/spark/sql/streaming/StreamTest.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 254f823bf54f7..8dfeb8da4b826 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -216,7 +216,9 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { } } - def isStreamWaitingAt(time: Long): Boolean = synchronized { waitStartTime.contains(time) } + def isStreamWaitingAt(time: Long): Boolean = synchronized { + waitStartTime == Some(time) + } } From e59df62e62ec4c5f8bd02a13f05fa3ec6f0fc694 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 18 Oct 2016 11:03:10 -0700 Subject: [PATCH 042/162] [SPARK-17899][SQL][FOLLOW-UP] debug mode should work for corrupted table ## What changes were proposed in this pull request? Debug mode should work for corrupted table, so that we can really debug ## How was this patch tested? new test in `MetastoreDataSourcesSuite` Author: Wenchen Fan Closes #15528 from cloud-fan/debug. --- .../spark/sql/hive/HiveExternalCatalog.scala | 9 ++------- .../sql/hive/MetastoreDataSourcesSuite.scala | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index ff59b54f53909..2003ff42d4f0c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -448,7 +448,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat * properties, and filter out these special entries from table properties. 
*/ private def restoreTableMetadata(table: CatalogTable): CatalogTable = { - val catalogTable = if (table.tableType == VIEW) { + val catalogTable = if (table.tableType == VIEW || conf.get(DEBUG_MODE)) { table } else { getProviderFromTableProperties(table).map { provider => @@ -467,18 +467,13 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat } else { table.storage } - val tableProps = if (conf.get(DEBUG_MODE)) { - table.properties - } else { - getOriginalTableProperties(table) - } table.copy( storage = storage, schema = getSchemaFromTableProperties(table), provider = Some(provider), partitionColumnNames = getPartitionColumnsFromTableProperties(table), bucketSpec = getBucketSpecFromTableProperties(table), - properties = tableProps) + properties = getOriginalTableProperties(table)) } getOrElse { table.copy(provider = Some("hive")) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 7cc6179d44977..eaa67d370db37 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -1321,20 +1321,32 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv sharedState.externalCatalog.getTable("default", "t") }.getMessage assert(e.contains(s"Could not read schema from the hive metastore because it is corrupted")) + + withDebugMode { + val tableMeta = sharedState.externalCatalog.getTable("default", "t") + assert(tableMeta.identifier == TableIdentifier("t", Some("default"))) + assert(tableMeta.properties(DATASOURCE_PROVIDER) == "json") + } } finally { hiveClient.dropTable("default", "t", ignoreIfNotExists = true, purge = true) } } test("should keep data source entries in table properties when debug mode is on") { - val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE) - try { - sparkSession.sparkContext.conf.set(DEBUG_MODE, true) + withDebugMode { val newSession = sparkSession.newSession() newSession.sql("CREATE TABLE abc(i int) USING json") val tableMeta = newSession.sessionState.catalog.getTableMetadata(TableIdentifier("abc")) assert(tableMeta.properties(DATASOURCE_SCHEMA_NUMPARTS).toInt == 1) assert(tableMeta.properties(DATASOURCE_PROVIDER) == "json") + } + } + + private def withDebugMode(f: => Unit): Unit = { + val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE) + try { + sparkSession.sparkContext.conf.set(DEBUG_MODE, true) + f } finally { sparkSession.sparkContext.conf.set(DEBUG_MODE, previousValue) } From 37686539f546ac7a3657dbfc59b7ac982b4b9bce Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 18 Oct 2016 13:20:42 -0700 Subject: [PATCH 043/162] [SPARK-17388] [SQL] Support for inferring type date/timestamp/decimal for partition column ## What changes were proposed in this pull request? Currently, Spark only supports to infer `IntegerType`, `LongType`, `DoubleType` and `StringType`. `DecimalType` is being tried but it seems it never infers type as `DecimalType` as `DoubleType` is being tried first. Also, it seems `DateType` and `TimestampType` could be inferred. As far as I know, it is pretty common to use both for a partition column. This PR fixes the incorrect `DecimalType` try and also adds the support for both `DateType` and `TimestampType` for inferring partition column type. ## How was this patch tested? Unit tests in `ParquetPartitionDiscoverySuite`. 
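
As a rough illustration of the inference order this patch establishes, the sketch below mirrors the `Try(...).orElse(...)` cascade outside of Spark: integral types first, then scale-0 decimals, then doubles, then date/timestamp, with string as the fallback. It is a simplified stand-alone approximation (it returns plain JVM values and skips Catalyst `Literal`s and path unescaping), not the code added in the diff that follows.

```scala
import java.math.{BigDecimal => JBigDecimal}
import java.sql.{Date => JDate, Timestamp => JTimestamp}
import scala.util.Try

def inferPartitionValue(raw: String): Any = {
  // Decimals are only accepted without a fractional part, so "1.5" still becomes a Double
  // and only integers too large for Long end up as BigDecimal.
  val decimalTry = Try {
    val bigDecimal = new JBigDecimal(raw)
    require(bigDecimal.scale <= 0)
    bigDecimal
  }
  Try(raw.toInt)                          // IntegerType
    .orElse(Try(raw.toLong))              // LongType
    .orElse(decimalTry)                   // DecimalType(precision, 0)
    .orElse(Try(raw.toDouble))            // DoubleType
    .orElse(Try(JDate.valueOf(raw)))      // DateType, e.g. "2015-05-23"
    .orElse(Try(JTimestamp.valueOf(raw))) // TimestampType, e.g. "1990-02-24 12:00:30"
    .getOrElse(raw)                       // StringType fallback
}

// inferPartitionValue("1000000000000000") -> Long
// inferPartitionValue("1.5")              -> Double
// inferPartitionValue("1990-02-24")       -> java.sql.Date
// inferPartitionValue("hello")            -> String
```
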
Author: hyukjinkwon Closes #14947 from HyukjinKwon/SPARK-17388. --- .../datasources/PartitioningUtils.scala | 21 ++++++++-- .../ParquetPartitionDiscoverySuite.scala | 42 ++++++++++++++++++- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index 504464216e5a4..381261cf65ca0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources import java.lang.{Double => JDouble, Long => JLong} import java.math.{BigDecimal => JBigDecimal} +import java.sql.{Date => JDate, Timestamp => JTimestamp} import scala.collection.mutable.ArrayBuffer import scala.util.Try @@ -307,20 +308,34 @@ object PartitioningUtils { /** * Converts a string to a [[Literal]] with automatic type inference. Currently only supports - * [[IntegerType]], [[LongType]], [[DoubleType]], [[DecimalType.SYSTEM_DEFAULT]], and - * [[StringType]]. + * [[IntegerType]], [[LongType]], [[DoubleType]], [[DecimalType]], [[DateType]] + * [[TimestampType]], and [[StringType]]. */ private[datasources] def inferPartitionColumnValue( raw: String, defaultPartitionName: String, typeInference: Boolean): Literal = { + val decimalTry = Try { + // `BigDecimal` conversion can fail when the `field` is not a form of number. + val bigDecimal = new JBigDecimal(raw) + // It reduces the cases for decimals by disallowing values having scale (eg. `1.1`). + require(bigDecimal.scale <= 0) + // `DecimalType` conversion can fail when + // 1. The precision is bigger than 38. + // 2. scale is bigger than precision. 
+ Literal(bigDecimal) + } + if (typeInference) { // First tries integral types Try(Literal.create(Integer.parseInt(raw), IntegerType)) .orElse(Try(Literal.create(JLong.parseLong(raw), LongType))) + .orElse(decimalTry) // Then falls back to fractional types .orElse(Try(Literal.create(JDouble.parseDouble(raw), DoubleType))) - .orElse(Try(Literal(new JBigDecimal(raw)))) + // Then falls back to date/timestamp types + .orElse(Try(Literal(JDate.valueOf(raw)))) + .orElse(Try(Literal(JTimestamp.valueOf(unescapePathName(raw))))) // Then falls back to string .getOrElse { if (raw == defaultPartitionName) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 43357c97c395a..2ef66baee1eac 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.io.File import java.math.BigInteger -import java.sql.Timestamp +import java.sql.{Date, Timestamp} import scala.collection.mutable.ArrayBuffer @@ -56,8 +56,14 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha check("10", Literal.create(10, IntegerType)) check("1000000000000000", Literal.create(1000000000000000L, LongType)) + val decimal = Decimal("1" * 20) + check("1" * 20, + Literal.create(decimal, DecimalType(decimal.precision, decimal.scale))) check("1.5", Literal.create(1.5, DoubleType)) check("hello", Literal.create("hello", StringType)) + check("1990-02-24", Literal.create(Date.valueOf("1990-02-24"), DateType)) + check("1990-02-24 12:00:30", + Literal.create(Timestamp.valueOf("1990-02-24 12:00:30"), TimestampType)) check(defaultPartitionName, Literal.create(null, NullType)) } @@ -687,6 +693,40 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha } } + test("Various inferred partition value types") { + val row = + Row( + Long.MaxValue, + 4.5, + new java.math.BigDecimal(new BigInteger("1" * 20)), + java.sql.Date.valueOf("2015-05-23"), + java.sql.Timestamp.valueOf("1990-02-24 12:00:30"), + "This is a string, /[]?=:", + "This is not a partition column") + + val partitionColumnTypes = + Seq( + LongType, + DoubleType, + DecimalType(20, 0), + DateType, + TimestampType, + StringType) + + val partitionColumns = partitionColumnTypes.zipWithIndex.map { + case (t, index) => StructField(s"p_$index", t) + } + + val schema = StructType(partitionColumns :+ StructField(s"i", StringType)) + val df = spark.createDataFrame(sparkContext.parallelize(row :: Nil), schema) + + withTempPath { dir => + df.write.format("parquet").partitionBy(partitionColumns.map(_.name): _*).save(dir.toString) + val fields = schema.map(f => Column(f.name)) + checkAnswer(spark.read.load(dir.toString).select(fields: _*), row) + } + } + test("SPARK-8037: Ignores files whose name starts with dot") { withTempPath { dir => val df = (1 to 3).map(i => (i, i, i, i)).toDF("a", "b", "c", "d") From 231f39e3f6641953a90bc4c40444ede63f363b23 Mon Sep 17 00:00:00 2001 From: Yu Peng Date: Tue, 18 Oct 2016 13:23:31 -0700 Subject: [PATCH 044/162] [SPARK-17711] Compress rolled executor log ## What changes were proposed in this pull request? This PR adds support for executor log compression. 
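
A minimal usage sketch for the new setting, assuming a standalone cluster with rolling executor logs already configured; the values below are illustrative, not recommendations.

```scala
import org.apache.spark.SparkConf

// Rolling must already be enabled for compression to have any effect; the rolled files
// then get a ".gz" suffix and the worker UI reads them through GZIPInputStream.
val conf = new SparkConf()
  .set("spark.executor.logs.rolling.strategy", "size")
  .set("spark.executor.logs.rolling.maxSize", (1024 * 1024).toString)
  .set("spark.executor.logs.rolling.maxRetainedFiles", "10")
  .set("spark.executor.logs.rolling.enableCompression", "true")   // added by this patch

// Worker-side knob (also new): how many uncompressed file lengths the log page caches, e.g.
// SPARK_WORKER_OPTS="-Dspark.worker.ui.compressedLogFileLengthCacheSize=100"
```
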
## How was this patch tested? Unit tests cc: yhuai tdas mengxr Author: Yu Peng Closes #15285 from loneknightpy/compress-executor-log. --- .../spark/deploy/worker/ui/LogPage.scala | 7 +- .../scala/org/apache/spark/util/Utils.scala | 80 ++++++++++++++-- .../util/logging/RollingFileAppender.scala | 45 +++++++-- .../spark/deploy/worker/ui/LogPageSuite.scala | 6 +- .../apache/spark/util/FileAppenderSuite.scala | 60 +++++++++++- .../org/apache/spark/util/UtilsSuite.scala | 92 ++++++++++++++----- docs/configuration.md | 8 ++ docs/spark-standalone.md | 9 ++ 8 files changed, 263 insertions(+), 44 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala index 3473c41b935fd..465c214362b25 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala @@ -22,6 +22,8 @@ import javax.servlet.http.HttpServletRequest import scala.xml.{Node, Unparsed} +import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} + import org.apache.spark.internal.Logging import org.apache.spark.ui.{UIUtils, WebUIPage} import org.apache.spark.util.Utils @@ -138,7 +140,8 @@ private[ui] class LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") with val files = RollingFileAppender.getSortedRolledOverFiles(logDirectory, logType) logDebug(s"Sorted log files of type $logType in $logDirectory:\n${files.mkString("\n")}") - val totalLength = files.map { _.length }.sum + val fileLengths: Seq[Long] = files.map(Utils.getFileLength(_, worker.conf)) + val totalLength = fileLengths.sum val offset = offsetOption.getOrElse(totalLength - byteLength) val startIndex = { if (offset < 0) { @@ -151,7 +154,7 @@ private[ui] class LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") with } val endIndex = math.min(startIndex + byteLength, totalLength) logDebug(s"Getting log from $startIndex to $endIndex") - val logText = Utils.offsetBytes(files, startIndex, endIndex) + val logText = Utils.offsetBytes(files, fileLengths, startIndex, endIndex) logDebug(s"Got log of length ${logText.length} bytes") (logText, startIndex, endIndex, totalLength) } catch { diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index ef832756ce3b7..a4da138e71992 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -27,6 +27,7 @@ import java.nio.file.{Files, Paths} import java.util.{Locale, Properties, Random, UUID} import java.util.concurrent._ import java.util.concurrent.atomic.AtomicBoolean +import java.util.zip.GZIPInputStream import javax.net.ssl.HttpsURLConnection import scala.annotation.tailrec @@ -38,8 +39,10 @@ import scala.reflect.ClassTag import scala.util.Try import scala.util.control.{ControlThrowable, NonFatal} +import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} import com.google.common.io.{ByteStreams, Files => GFiles} import com.google.common.net.InetAddresses +import org.apache.commons.io.IOUtils import org.apache.commons.lang3.SystemUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} @@ -55,6 +58,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.config.{DYN_ALLOCATION_INITIAL_EXECUTORS, DYN_ALLOCATION_MIN_EXECUTORS, EXECUTOR_INSTANCES} import org.apache.spark.network.util.JavaUtils import 
org.apache.spark.serializer.{DeserializationStream, SerializationStream, SerializerInstance} +import org.apache.spark.util.logging.RollingFileAppender /** CallSite represents a place in user code. It can have a short and a long form. */ private[spark] case class CallSite(shortForm: String, longForm: String) @@ -1440,14 +1444,72 @@ private[spark] object Utils extends Logging { CallSite(shortForm, longForm) } + private val UNCOMPRESSED_LOG_FILE_LENGTH_CACHE_SIZE_CONF = + "spark.worker.ui.compressedLogFileLengthCacheSize" + private val DEFAULT_UNCOMPRESSED_LOG_FILE_LENGTH_CACHE_SIZE = 100 + private var compressedLogFileLengthCache: LoadingCache[String, java.lang.Long] = null + private def getCompressedLogFileLengthCache( + sparkConf: SparkConf): LoadingCache[String, java.lang.Long] = this.synchronized { + if (compressedLogFileLengthCache == null) { + val compressedLogFileLengthCacheSize = sparkConf.getInt( + UNCOMPRESSED_LOG_FILE_LENGTH_CACHE_SIZE_CONF, + DEFAULT_UNCOMPRESSED_LOG_FILE_LENGTH_CACHE_SIZE) + compressedLogFileLengthCache = CacheBuilder.newBuilder() + .maximumSize(compressedLogFileLengthCacheSize) + .build[String, java.lang.Long](new CacheLoader[String, java.lang.Long]() { + override def load(path: String): java.lang.Long = { + Utils.getCompressedFileLength(new File(path)) + } + }) + } + compressedLogFileLengthCache + } + + /** + * Return the file length, if the file is compressed it returns the uncompressed file length. + * It also caches the uncompressed file size to avoid repeated decompression. The cache size is + * read from workerConf. + */ + def getFileLength(file: File, workConf: SparkConf): Long = { + if (file.getName.endsWith(".gz")) { + getCompressedLogFileLengthCache(workConf).get(file.getAbsolutePath) + } else { + file.length + } + } + + /** Return uncompressed file length of a compressed file. */ + private def getCompressedFileLength(file: File): Long = { + try { + // Uncompress .gz file to determine file size. + var fileSize = 0L + val gzInputStream = new GZIPInputStream(new FileInputStream(file)) + val bufSize = 1024 + val buf = new Array[Byte](bufSize) + var numBytes = IOUtils.read(gzInputStream, buf) + while (numBytes > 0) { + fileSize += numBytes + numBytes = IOUtils.read(gzInputStream, buf) + } + fileSize + } catch { + case e: Throwable => + logError(s"Cannot get file length of ${file}", e) + throw e + } + } + /** Return a string containing part of a file from byte 'start' to 'end'. */ - def offsetBytes(path: String, start: Long, end: Long): String = { + def offsetBytes(path: String, length: Long, start: Long, end: Long): String = { val file = new File(path) - val length = file.length() val effectiveEnd = math.min(length, end) val effectiveStart = math.max(0, start) val buff = new Array[Byte]((effectiveEnd-effectiveStart).toInt) - val stream = new FileInputStream(file) + val stream = if (path.endsWith(".gz")) { + new GZIPInputStream(new FileInputStream(file)) + } else { + new FileInputStream(file) + } try { ByteStreams.skipFully(stream, effectiveStart) @@ -1463,8 +1525,8 @@ private[spark] object Utils extends Logging { * and `endIndex` is based on the cumulative size of all the files take in * the given order. See figure below for more details. 
*/ - def offsetBytes(files: Seq[File], start: Long, end: Long): String = { - val fileLengths = files.map { _.length } + def offsetBytes(files: Seq[File], fileLengths: Seq[Long], start: Long, end: Long): String = { + assert(files.length == fileLengths.length) val startIndex = math.max(start, 0) val endIndex = math.min(end, fileLengths.sum) val fileToLength = files.zip(fileLengths).toMap @@ -1472,7 +1534,7 @@ private[spark] object Utils extends Logging { val stringBuffer = new StringBuffer((endIndex - startIndex).toInt) var sum = 0L - for (file <- files) { + files.zip(fileLengths).foreach { case (file, fileLength) => val startIndexOfFile = sum val endIndexOfFile = sum + fileToLength(file) logDebug(s"Processing file $file, " + @@ -1491,19 +1553,19 @@ private[spark] object Utils extends Logging { if (startIndex <= startIndexOfFile && endIndex >= endIndexOfFile) { // Case C: read the whole file - stringBuffer.append(offsetBytes(file.getAbsolutePath, 0, fileToLength(file))) + stringBuffer.append(offsetBytes(file.getAbsolutePath, fileLength, 0, fileToLength(file))) } else if (startIndex > startIndexOfFile && startIndex < endIndexOfFile) { // Case A and B: read from [start of required range] to [end of file / end of range] val effectiveStartIndex = startIndex - startIndexOfFile val effectiveEndIndex = math.min(endIndex - startIndexOfFile, fileToLength(file)) stringBuffer.append(Utils.offsetBytes( - file.getAbsolutePath, effectiveStartIndex, effectiveEndIndex)) + file.getAbsolutePath, fileLength, effectiveStartIndex, effectiveEndIndex)) } else if (endIndex > startIndexOfFile && endIndex < endIndexOfFile) { // Case D: read from [start of file] to [end of require range] val effectiveStartIndex = math.max(startIndex - startIndexOfFile, 0) val effectiveEndIndex = endIndex - startIndexOfFile stringBuffer.append(Utils.offsetBytes( - file.getAbsolutePath, effectiveStartIndex, effectiveEndIndex)) + file.getAbsolutePath, fileLength, effectiveStartIndex, effectiveEndIndex)) } sum += fileToLength(file) logDebug(s"After processing file $file, string built is ${stringBuffer.toString}") diff --git a/core/src/main/scala/org/apache/spark/util/logging/RollingFileAppender.scala b/core/src/main/scala/org/apache/spark/util/logging/RollingFileAppender.scala index a0eb05c7c0e82..5d8cec8447b53 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/RollingFileAppender.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/RollingFileAppender.scala @@ -17,9 +17,11 @@ package org.apache.spark.util.logging -import java.io.{File, FileFilter, InputStream} +import java.io._ +import java.util.zip.GZIPOutputStream import com.google.common.io.Files +import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf @@ -45,6 +47,7 @@ private[spark] class RollingFileAppender( import RollingFileAppender._ private val maxRetainedFiles = conf.getInt(RETAINED_FILES_PROPERTY, -1) + private val enableCompression = conf.getBoolean(ENABLE_COMPRESSION, false) /** Stop the appender */ override def stop() { @@ -76,6 +79,33 @@ private[spark] class RollingFileAppender( } } + // Roll the log file and compress if enableCompression is true. 
+ private def rotateFile(activeFile: File, rolloverFile: File): Unit = { + if (enableCompression) { + val gzFile = new File(rolloverFile.getAbsolutePath + GZIP_LOG_SUFFIX) + var gzOutputStream: GZIPOutputStream = null + var inputStream: InputStream = null + try { + inputStream = new FileInputStream(activeFile) + gzOutputStream = new GZIPOutputStream(new FileOutputStream(gzFile)) + IOUtils.copy(inputStream, gzOutputStream) + inputStream.close() + gzOutputStream.close() + activeFile.delete() + } finally { + IOUtils.closeQuietly(inputStream) + IOUtils.closeQuietly(gzOutputStream) + } + } else { + Files.move(activeFile, rolloverFile) + } + } + + // Check if the rollover file already exists. + private def rolloverFileExist(file: File): Boolean = { + file.exists || new File(file.getAbsolutePath + GZIP_LOG_SUFFIX).exists + } + /** Move the active log file to a new rollover file */ private def moveFile() { val rolloverSuffix = rollingPolicy.generateRolledOverFileSuffix() @@ -83,8 +113,8 @@ private[spark] class RollingFileAppender( activeFile.getParentFile, activeFile.getName + rolloverSuffix).getAbsoluteFile logDebug(s"Attempting to rollover file $activeFile to file $rolloverFile") if (activeFile.exists) { - if (!rolloverFile.exists) { - Files.move(activeFile, rolloverFile) + if (!rolloverFileExist(rolloverFile)) { + rotateFile(activeFile, rolloverFile) logInfo(s"Rolled over $activeFile to $rolloverFile") } else { // In case the rollover file name clashes, make a unique file name. @@ -97,11 +127,11 @@ private[spark] class RollingFileAppender( altRolloverFile = new File(activeFile.getParent, s"${activeFile.getName}$rolloverSuffix--$i").getAbsoluteFile i += 1 - } while (i < 10000 && altRolloverFile.exists) + } while (i < 10000 && rolloverFileExist(altRolloverFile)) logWarning(s"Rollover file $rolloverFile already exists, " + s"rolled over $activeFile to file $altRolloverFile") - Files.move(activeFile, altRolloverFile) + rotateFile(activeFile, altRolloverFile) } } else { logWarning(s"File $activeFile does not exist") @@ -142,6 +172,9 @@ private[spark] object RollingFileAppender { val SIZE_DEFAULT = (1024 * 1024).toString val RETAINED_FILES_PROPERTY = "spark.executor.logs.rolling.maxRetainedFiles" val DEFAULT_BUFFER_SIZE = 8192 + val ENABLE_COMPRESSION = "spark.executor.logs.rolling.enableCompression" + + val GZIP_LOG_SUFFIX = ".gz" /** * Get the sorted list of rolled over files. 
This assumes that the all the rolled @@ -158,6 +191,6 @@ private[spark] object RollingFileAppender { val file = new File(directory, activeFileName).getAbsoluteFile if (file.exists) Some(file) else None } - rolledOverFiles ++ activeFile + rolledOverFiles.sortBy(_.getName.stripSuffix(GZIP_LOG_SUFFIX)) ++ activeFile } } diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala index 72eaffb416981..4c3e96777940d 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala @@ -22,16 +22,20 @@ import java.io.{File, FileWriter} import org.mockito.Mockito.{mock, when} import org.scalatest.PrivateMethodTester -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.worker.Worker class LogPageSuite extends SparkFunSuite with PrivateMethodTester { test("get logs simple") { val webui = mock(classOf[WorkerWebUI]) + val worker = mock(classOf[Worker]) val tmpDir = new File(sys.props("java.io.tmpdir")) val workDir = new File(tmpDir, "work-dir") workDir.mkdir() when(webui.workDir).thenReturn(workDir) + when(webui.worker).thenReturn(worker) + when(worker.conf).thenReturn(new SparkConf()) val logPage = new LogPage(webui) // Prepare some fake log files to read later diff --git a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala index 4fa9f9a8f590f..7e2da8e141532 100644 --- a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala @@ -20,11 +20,13 @@ package org.apache.spark.util import java.io._ import java.nio.charset.StandardCharsets import java.util.concurrent.CountDownLatch +import java.util.zip.GZIPInputStream import scala.collection.mutable.HashSet import scala.reflect._ import com.google.common.io.Files +import org.apache.commons.io.IOUtils import org.apache.log4j.{Appender, Level, Logger} import org.apache.log4j.spi.LoggingEvent import org.mockito.ArgumentCaptor @@ -72,6 +74,25 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging { testRolling(appender, testOutputStream, textToAppend, rolloverIntervalMillis) } + test("rolling file appender - time-based rolling (compressed)") { + // setup input stream and appender + val testOutputStream = new PipedOutputStream() + val testInputStream = new PipedInputStream(testOutputStream, 100 * 1000) + val rolloverIntervalMillis = 100 + val durationMillis = 1000 + val numRollovers = durationMillis / rolloverIntervalMillis + val textToAppend = (1 to numRollovers).map( _.toString * 10 ) + + val sparkConf = new SparkConf() + sparkConf.set("spark.executor.logs.rolling.enableCompression", "true") + val appender = new RollingFileAppender(testInputStream, testFile, + new TimeBasedRollingPolicy(rolloverIntervalMillis, s"--HH-mm-ss-SSSS", false), + sparkConf, 10) + + testRolling( + appender, testOutputStream, textToAppend, rolloverIntervalMillis, isCompressed = true) + } + test("rolling file appender - size-based rolling") { // setup input stream and appender val testOutputStream = new PipedOutputStream() @@ -89,6 +110,25 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging { } } + test("rolling file appender - size-based rolling (compressed)") { + // setup input stream and appender + val 
testOutputStream = new PipedOutputStream() + val testInputStream = new PipedInputStream(testOutputStream, 100 * 1000) + val rolloverSize = 1000 + val textToAppend = (1 to 3).map( _.toString * 1000 ) + + val sparkConf = new SparkConf() + sparkConf.set("spark.executor.logs.rolling.enableCompression", "true") + val appender = new RollingFileAppender(testInputStream, testFile, + new SizeBasedRollingPolicy(rolloverSize, false), sparkConf, 99) + + val files = testRolling(appender, testOutputStream, textToAppend, 0, isCompressed = true) + files.foreach { file => + logInfo(file.toString + ": " + file.length + " bytes") + assert(file.length < rolloverSize) + } + } + test("rolling file appender - cleaning") { // setup input stream and appender val testOutputStream = new PipedOutputStream() @@ -273,7 +313,8 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging { appender: FileAppender, outputStream: OutputStream, textToAppend: Seq[String], - sleepTimeBetweenTexts: Long + sleepTimeBetweenTexts: Long, + isCompressed: Boolean = false ): Seq[File] = { // send data to appender through the input stream, and wait for the data to be written val expectedText = textToAppend.mkString("") @@ -290,10 +331,23 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging { // verify whether all the data written to rolled over files is same as expected val generatedFiles = RollingFileAppender.getSortedRolledOverFiles( testFile.getParentFile.toString, testFile.getName) - logInfo("Filtered files: \n" + generatedFiles.mkString("\n")) + logInfo("Generate files: \n" + generatedFiles.mkString("\n")) assert(generatedFiles.size > 1) + if (isCompressed) { + assert( + generatedFiles.filter(_.getName.endsWith(RollingFileAppender.GZIP_LOG_SUFFIX)).size > 0) + } val allText = generatedFiles.map { file => - Files.toString(file, StandardCharsets.UTF_8) + if (file.getName.endsWith(RollingFileAppender.GZIP_LOG_SUFFIX)) { + val inputStream = new GZIPInputStream(new FileInputStream(file)) + try { + IOUtils.toString(inputStream, StandardCharsets.UTF_8) + } finally { + IOUtils.closeQuietly(inputStream) + } + } else { + Files.toString(file, StandardCharsets.UTF_8) + } }.mkString("") assert(allText === expectedText) generatedFiles diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index bc28b2d9cb831..b427f7fb50158 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -25,11 +25,13 @@ import java.nio.charset.StandardCharsets import java.text.DecimalFormatSymbols import java.util.Locale import java.util.concurrent.TimeUnit +import java.util.zip.GZIPOutputStream import scala.collection.mutable.ListBuffer import scala.util.Random import com.google.common.io.Files +import org.apache.commons.io.IOUtils import org.apache.commons.lang3.SystemUtils import org.apache.commons.math3.stat.inference.ChiSquareTest import org.apache.hadoop.conf.Configuration @@ -274,65 +276,109 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(str(10 * hour + 59 * minute + 59 * second + 999) === "11" + sep + "00 h") } - test("reading offset bytes of a file") { + def getSuffix(isCompressed: Boolean): String = { + if (isCompressed) { + ".gz" + } else { + "" + } + } + + def writeLogFile(path: String, content: Array[Byte]): Unit = { + val outputStream = if (path.endsWith(".gz")) { + new GZIPOutputStream(new 
FileOutputStream(path)) + } else { + new FileOutputStream(path) + } + IOUtils.write(content, outputStream) + outputStream.close() + content.size + } + + private val workerConf = new SparkConf() + + def testOffsetBytes(isCompressed: Boolean): Unit = { val tmpDir2 = Utils.createTempDir() - val f1Path = tmpDir2 + "/f1" - val f1 = new FileOutputStream(f1Path) - f1.write("1\n2\n3\n4\n5\n6\n7\n8\n9\n".getBytes(StandardCharsets.UTF_8)) - f1.close() + val suffix = getSuffix(isCompressed) + val f1Path = tmpDir2 + "/f1" + suffix + writeLogFile(f1Path, "1\n2\n3\n4\n5\n6\n7\n8\n9\n".getBytes(StandardCharsets.UTF_8)) + val f1Length = Utils.getFileLength(new File(f1Path), workerConf) // Read first few bytes - assert(Utils.offsetBytes(f1Path, 0, 5) === "1\n2\n3") + assert(Utils.offsetBytes(f1Path, f1Length, 0, 5) === "1\n2\n3") // Read some middle bytes - assert(Utils.offsetBytes(f1Path, 4, 11) === "3\n4\n5\n6") + assert(Utils.offsetBytes(f1Path, f1Length, 4, 11) === "3\n4\n5\n6") // Read last few bytes - assert(Utils.offsetBytes(f1Path, 12, 18) === "7\n8\n9\n") + assert(Utils.offsetBytes(f1Path, f1Length, 12, 18) === "7\n8\n9\n") // Read some nonexistent bytes in the beginning - assert(Utils.offsetBytes(f1Path, -5, 5) === "1\n2\n3") + assert(Utils.offsetBytes(f1Path, f1Length, -5, 5) === "1\n2\n3") // Read some nonexistent bytes at the end - assert(Utils.offsetBytes(f1Path, 12, 22) === "7\n8\n9\n") + assert(Utils.offsetBytes(f1Path, f1Length, 12, 22) === "7\n8\n9\n") // Read some nonexistent bytes on both ends - assert(Utils.offsetBytes(f1Path, -3, 25) === "1\n2\n3\n4\n5\n6\n7\n8\n9\n") + assert(Utils.offsetBytes(f1Path, f1Length, -3, 25) === "1\n2\n3\n4\n5\n6\n7\n8\n9\n") Utils.deleteRecursively(tmpDir2) } - test("reading offset bytes across multiple files") { + test("reading offset bytes of a file") { + testOffsetBytes(isCompressed = false) + } + + test("reading offset bytes of a file (compressed)") { + testOffsetBytes(isCompressed = true) + } + + def testOffsetBytesMultipleFiles(isCompressed: Boolean): Unit = { val tmpDir = Utils.createTempDir() - val files = (1 to 3).map(i => new File(tmpDir, i.toString)) - Files.write("0123456789", files(0), StandardCharsets.UTF_8) - Files.write("abcdefghij", files(1), StandardCharsets.UTF_8) - Files.write("ABCDEFGHIJ", files(2), StandardCharsets.UTF_8) + val suffix = getSuffix(isCompressed) + val files = (1 to 3).map(i => new File(tmpDir, i.toString + suffix)) :+ new File(tmpDir, "4") + writeLogFile(files(0).getAbsolutePath, "0123456789".getBytes(StandardCharsets.UTF_8)) + writeLogFile(files(1).getAbsolutePath, "abcdefghij".getBytes(StandardCharsets.UTF_8)) + writeLogFile(files(2).getAbsolutePath, "ABCDEFGHIJ".getBytes(StandardCharsets.UTF_8)) + writeLogFile(files(3).getAbsolutePath, "9876543210".getBytes(StandardCharsets.UTF_8)) + val fileLengths = files.map(Utils.getFileLength(_, workerConf)) // Read first few bytes in the 1st file - assert(Utils.offsetBytes(files, 0, 5) === "01234") + assert(Utils.offsetBytes(files, fileLengths, 0, 5) === "01234") // Read bytes within the 1st file - assert(Utils.offsetBytes(files, 5, 8) === "567") + assert(Utils.offsetBytes(files, fileLengths, 5, 8) === "567") // Read bytes across 1st and 2nd file - assert(Utils.offsetBytes(files, 8, 18) === "89abcdefgh") + assert(Utils.offsetBytes(files, fileLengths, 8, 18) === "89abcdefgh") // Read bytes across 1st, 2nd and 3rd file - assert(Utils.offsetBytes(files, 5, 24) === "56789abcdefghijABCD") + assert(Utils.offsetBytes(files, fileLengths, 5, 24) === "56789abcdefghijABCD") + + // Read 
bytes across 3rd and 4th file + assert(Utils.offsetBytes(files, fileLengths, 25, 35) === "FGHIJ98765") // Read some nonexistent bytes in the beginning - assert(Utils.offsetBytes(files, -5, 18) === "0123456789abcdefgh") + assert(Utils.offsetBytes(files, fileLengths, -5, 18) === "0123456789abcdefgh") // Read some nonexistent bytes at the end - assert(Utils.offsetBytes(files, 18, 35) === "ijABCDEFGHIJ") + assert(Utils.offsetBytes(files, fileLengths, 18, 45) === "ijABCDEFGHIJ9876543210") // Read some nonexistent bytes on both ends - assert(Utils.offsetBytes(files, -5, 35) === "0123456789abcdefghijABCDEFGHIJ") + assert(Utils.offsetBytes(files, fileLengths, -5, 45) === + "0123456789abcdefghijABCDEFGHIJ9876543210") Utils.deleteRecursively(tmpDir) } + test("reading offset bytes across multiple files") { + testOffsetBytesMultipleFiles(isCompressed = false) + } + + test("reading offset bytes across multiple files (compressed)") { + testOffsetBytesMultipleFiles(isCompressed = true) + } + test("deserialize long value") { val testval : Long = 9730889947L val bbuf = ByteBuffer.allocate(8) diff --git a/docs/configuration.md b/docs/configuration.md index 373e22d71a872..a4a99d6fa4630 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -293,6 +293,14 @@ Apart from these, the following properties are also available, and may be useful Older log files will be deleted. Disabled by default. + + spark.executor.logs.rolling.enableCompression + false + + Enable executor log compression. If it is enabled, the rolled executor logs will be compressed. + Disabled by default. + + spark.executor.logs.rolling.maxSize (none) diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 7b82b957d5299..1c0b60f7b9346 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -250,6 +250,15 @@ SPARK_WORKER_OPTS supports the following system properties: especially if you run jobs very frequently. + + spark.worker.ui.compressedLogFileLengthCacheSize + 100 + + For compressed log files, the uncompressed file can only be computed by uncompressing the files. + Spark caches the uncompressed file size of compressed log files. This property controls the cache + size. + + # Connecting an Application to the Cluster From 4ef39c2f4436fa22d0b957fe7ad477e4c4a16452 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 18 Oct 2016 13:33:46 -0700 Subject: [PATCH 045/162] [SPARK-17974] try 2) Refactor FileCatalog classes to simplify the inheritance tree ## What changes were proposed in this pull request? This renames `BasicFileCatalog => FileCatalog`, combines `SessionFileCatalog` with `PartitioningAwareFileCatalog`, and removes the old `FileCatalog` trait. In summary, ``` MetadataLogFileCatalog extends PartitioningAwareFileCatalog ListingFileCatalog extends PartitioningAwareFileCatalog PartitioningAwareFileCatalog extends FileCatalog TableFileCatalog extends FileCatalog ``` (note that this is a re-submission of https://github.com/apache/spark/pull/15518 which got reverted) ## How was this patch tested? Existing tests Author: Eric Liang Closes #15533 from ericl/fix-scalastyle-revert. 
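
To make the new shape concrete, here is a compilable toy sketch of the tree above; Spark's actual signatures are simplified (plain `String` paths, no Catalyst types), so only the inheritance relationships and the rough placement of methods should be read from it.

```scala
// Formerly BasicFileCatalog: the minimal contract every catalog satisfies.
trait FileCatalog {
  def rootPaths: Seq[String]
  def listFiles(filters: Seq[String]): Seq[(Map[String, String], Seq[String])]
  def inputFiles: Array[String]
  def refresh(): Unit
  def sizeInBytes: Long
}

// Absorbs the leaf-file listing / partition discovery that used to live in SessionFileCatalog.
abstract class PartitioningAwareFileCatalog extends FileCatalog {
  protected def allFiles(): Seq[String]
  override def inputFiles: Array[String] = allFiles().toArray
  override def sizeInBytes: Long = allFiles().length.toLong // stand-in for summing file sizes
}

// ListingFileCatalog and MetadataLogFileCatalog extend PartitioningAwareFileCatalog;
// TableFileCatalog extends FileCatalog directly because it answers listFiles from the
// metastore rather than by enumerating files up front.
```
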
--- .../scala/org/apache/spark/sql/Dataset.scala | 2 +- .../sql/execution/DataSourceScanExec.scala | 4 +- .../execution/datasources/FileCatalog.scala | 66 +++++ .../execution/datasources/FileFormat.scala | 61 ----- .../datasources/HadoopFsRelation.scala | 4 +- .../PartitioningAwareFileCatalog.scala | 217 ++++++++++++++++- .../datasources/PartitioningUtils.scala | 12 +- .../datasources/SessionFileCatalog.scala | 225 ------------------ .../datasources/TableFileCatalog.scala | 11 +- .../datasources/FileCatalogSuite.scala | 10 + .../datasources/SessionFileCatalogSuite.scala | 34 --- .../ParquetPartitionDiscoverySuite.scala | 10 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- 13 files changed, 304 insertions(+), 354 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 7dccbbd3f0a5b..073d2b1512b95 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.usePrettyExpression import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution} import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand, GlobalTempView, LocalTempView} -import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.json.JacksonGenerator import org.apache.spark.sql.execution.python.EvaluatePython import org.apache.spark.sql.streaming.{DataStreamWriter, StreamingQuery} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 623d2be55dcec..fdd1fa3648251 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -431,7 +431,7 @@ case class FileSourceScanExec( private def createBucketedReadRDD( bucketSpec: BucketSpec, readFile: (PartitionedFile) => Iterator[InternalRow], - selectedPartitions: Seq[Partition], + selectedPartitions: Seq[PartitionDirectory], fsRelation: HadoopFsRelation): RDD[InternalRow] = { logInfo(s"Planning with ${bucketSpec.numBuckets} buckets") val bucketed = @@ -463,7 +463,7 @@ case class FileSourceScanExec( */ private def createNonBucketedReadRDD( readFile: (PartitionedFile) => Iterator[InternalRow], - selectedPartitions: Seq[Partition], + selectedPartitions: Seq[PartitionDirectory], fsRelation: HadoopFsRelation): RDD[InternalRow] = { val defaultMaxSplitBytes = fsRelation.sparkSession.sessionState.conf.filesMaxPartitionBytes diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala new file mode 100644 index 0000000000000..2bc66ceeebdb4 --- /dev/null +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.hadoop.fs._ + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ + +/** + * A collection of data files from a partitioned relation, along with the partition values in the + * form of an [[InternalRow]]. + */ +case class PartitionDirectory(values: InternalRow, files: Seq[FileStatus]) + +/** + * An interface for objects capable of enumerating the root paths of a relation as well as the + * partitions of a relation subject to some pruning expressions. + */ +trait FileCatalog { + + /** + * Returns the list of root input paths from which the catalog will get files. There may be a + * single root path from which partitions are discovered, or individual partitions may be + * specified by each path. + */ + def rootPaths: Seq[Path] + + /** + * Returns all valid files grouped into partitions when the data is partitioned. If the data is + * unpartitioned, this will return a single partition with no partition values. + * + * @param filters The filters used to prune which partitions are returned. These filters must + * only refer to partition columns and this method will only return files + * where these predicates are guaranteed to evaluate to `true`. Thus, these + * filters will not need to be evaluated again on the returned data. + */ + def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] + + /** + * Returns the list of files that will be read when scanning this relation. This call may be + * very expensive for large tables. + */ + def inputFiles: Array[String] + + /** Refresh any cached file listings */ + def refresh(): Unit + + /** Sum of table file sizes, in bytes */ + def sizeInBytes: Long +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala index e7239ef91b326..9d153cec731a8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala @@ -175,64 +175,3 @@ abstract class TextBasedFileFormat extends FileFormat { codec == null || codec.isInstanceOf[SplittableCompressionCodec] } } - -/** - * A collection of data files from a partitioned relation, along with the partition values in the - * form of an [[InternalRow]]. 
- */ -case class Partition(values: InternalRow, files: Seq[FileStatus]) - -/** - * An interface for objects capable of enumerating the root paths of a relation as well as the - * partitions of a relation subject to some pruning expressions. - */ -trait BasicFileCatalog { - - /** - * Returns the list of root input paths from which the catalog will get files. There may be a - * single root path from which partitions are discovered, or individual partitions may be - * specified by each path. - */ - def rootPaths: Seq[Path] - - /** - * Returns all valid files grouped into partitions when the data is partitioned. If the data is - * unpartitioned, this will return a single partition with no partition values. - * - * @param filters The filters used to prune which partitions are returned. These filters must - * only refer to partition columns and this method will only return files - * where these predicates are guaranteed to evaluate to `true`. Thus, these - * filters will not need to be evaluated again on the returned data. - */ - def listFiles(filters: Seq[Expression]): Seq[Partition] - - /** Returns the list of files that will be read when scanning this relation. */ - def inputFiles: Array[String] - - /** Refresh any cached file listings */ - def refresh(): Unit - - /** Sum of table file sizes, in bytes */ - def sizeInBytes: Long -} - -/** - * A [[BasicFileCatalog]] which can enumerate all of the files comprising a relation and, from - * those, infer the relation's partition specification. - */ -// TODO: Consider a more descriptive, appropriate name which suggests this is a file catalog for -// which it is safe to list all of its files? -trait FileCatalog extends BasicFileCatalog { - - /** Returns the specification of the partitions inferred from the data. */ - def partitionSpec(): PartitionSpec - - /** Returns all the valid files. */ - def allFiles(): Seq[FileStatus] - - /** Returns the list of files that will be read when scanning this relation. */ - override def inputFiles: Array[String] = - allFiles().map(_.getPath.toUri.toString).toArray - - override def sizeInBytes: Long = allFiles().map(_.getLen).sum -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala index db889edf032d6..afad8898089bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.types.StructType * Acts as a container for all of the metadata required to read from a datasource. All discovery, * resolution and merging logic for schemas and partitions has been removed. * - * @param location A [[BasicFileCatalog]] that can enumerate the locations of all the files that + * @param location A [[FileCatalog]] that can enumerate the locations of all the files that * comprise this relation. * @param partitionSchema The schema of the columns (if any) that are used to partition the relation * @param dataSchema The schema of any remaining columns. Note that if any partition columns are @@ -38,7 +38,7 @@ import org.apache.spark.sql.types.StructType * @param options Configuration used when reading / writing data. 
*/ case class HadoopFsRelation( - location: BasicFileCatalog, + location: FileCatalog, partitionSchema: StructType, dataSchema: StructType, bucketSpec: Option[BucketSpec], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala index b2508115c282f..5c8eff7ec46b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala @@ -17,14 +17,21 @@ package org.apache.spark.sql.execution.datasources +import java.io.FileNotFoundException + import scala.collection.mutable -import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs._ +import org.apache.hadoop.mapred.{FileInputFormat, JobConf} +import org.apache.spark.internal.Logging +import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.util.SerializableConfiguration /** @@ -38,22 +45,24 @@ import org.apache.spark.sql.types.{StringType, StructType} abstract class PartitioningAwareFileCatalog( sparkSession: SparkSession, parameters: Map[String, String], - partitionSchema: Option[StructType]) - extends SessionFileCatalog(sparkSession) with FileCatalog { + partitionSchema: Option[StructType]) extends FileCatalog with Logging { import PartitioningAwareFileCatalog.BASE_PATH_PARAM - override protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters) + /** Returns the specification of the partitions inferred from the data. */ + def partitionSpec(): PartitionSpec + + protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters) protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus] protected def leafDirToChildrenFiles: Map[Path, Array[FileStatus]] - override def listFiles(filters: Seq[Expression]): Seq[Partition] = { + override def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] = { val selectedPartitions = if (partitionSpec().partitionColumns.isEmpty) { - Partition(InternalRow.empty, allFiles().filter(f => isDataPath(f.getPath))) :: Nil + PartitionDirectory(InternalRow.empty, allFiles().filter(f => isDataPath(f.getPath))) :: Nil } else { prunePartitions(filters, partitionSpec()).map { - case PartitionDirectory(values, path) => + case PartitionPath(values, path) => val files: Seq[FileStatus] = leafDirToChildrenFiles.get(path) match { case Some(existingDir) => // Directory has children files in it, return them @@ -63,14 +72,20 @@ abstract class PartitioningAwareFileCatalog( // Directory does not exist, or has no children files Nil } - Partition(values, files) + PartitionDirectory(values, files) } } logTrace("Selected files after partition pruning:\n\t" + selectedPartitions.mkString("\n\t")) selectedPartitions } - override def allFiles(): Seq[FileStatus] = { + /** Returns the list of files that will be read when scanning this relation. 
*/ + override def inputFiles: Array[String] = + allFiles().map(_.getPath.toUri.toString).toArray + + override def sizeInBytes: Long = allFiles().map(_.getLen).sum + + def allFiles(): Seq[FileStatus] = { if (partitionSpec().partitionColumns.isEmpty) { // For each of the root input paths, get the list of files inside them rootPaths.flatMap { path => @@ -139,7 +154,7 @@ abstract class PartitioningAwareFileCatalog( private def prunePartitions( predicates: Seq[Expression], - partitionSpec: PartitionSpec): Seq[PartitionDirectory] = { + partitionSpec: PartitionSpec): Seq[PartitionPath] = { val PartitionSpec(partitionColumns, partitions) = partitionSpec val partitionColumnNames = partitionColumns.map(_.name).toSet val partitionPruningPredicates = predicates.filter { @@ -156,7 +171,7 @@ abstract class PartitioningAwareFileCatalog( }) val selected = partitions.filter { - case PartitionDirectory(values, _) => boundPredicate(values) + case PartitionPath(values, _) => boundPredicate(values) } logInfo { val total = partitions.length @@ -214,8 +229,186 @@ abstract class PartitioningAwareFileCatalog( val name = path.getName !((name.startsWith("_") && !name.contains("=")) || name.startsWith(".")) } + + /** + * List leaf files of given paths. This method will submit a Spark job to do parallel + * listing whenever there is a path having more files than the parallel partition discovery + * discovery threshold. + * + * This is publicly visible for testing. + */ + def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + val files = + if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { + PartitioningAwareFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession) + } else { + PartitioningAwareFileCatalog.listLeafFilesInSerial(paths, hadoopConf) + } + + HiveCatalogMetrics.incrementFilesDiscovered(files.size) + mutable.LinkedHashSet(files: _*) + } } -object PartitioningAwareFileCatalog { +object PartitioningAwareFileCatalog extends Logging { val BASE_PATH_PARAM = "basePath" + + /** A serializable variant of HDFS's BlockLocation. */ + private case class SerializableBlockLocation( + names: Array[String], + hosts: Array[String], + offset: Long, + length: Long) + + /** A serializable variant of HDFS's FileStatus. */ + private case class SerializableFileStatus( + path: String, + length: Long, + isDir: Boolean, + blockReplication: Short, + blockSize: Long, + modificationTime: Long, + accessTime: Long, + blockLocations: Array[SerializableBlockLocation]) + + /** + * List a collection of path recursively. + */ + private def listLeafFilesInSerial( + paths: Seq[Path], + hadoopConf: Configuration): Seq[FileStatus] = { + // Dummy jobconf to get to the pathFilter defined in configuration + val jobConf = new JobConf(hadoopConf, this.getClass) + val filter = FileInputFormat.getInputPathFilter(jobConf) + + paths.flatMap { path => + val fs = path.getFileSystem(hadoopConf) + listLeafFiles0(fs, path, filter) + } + } + + /** + * List a collection of path recursively in parallel (using Spark executors). + * Each task launched will use [[listLeafFilesInSerial]] to list. 
+ */ + private def listLeafFilesInParallel( + paths: Seq[Path], + hadoopConf: Configuration, + sparkSession: SparkSession): Seq[FileStatus] = { + assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) + logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") + + val sparkContext = sparkSession.sparkContext + val serializableConfiguration = new SerializableConfiguration(hadoopConf) + val serializedPaths = paths.map(_.toString) + + // Set the number of parallelism to prevent following file listing from generating many tasks + // in case of large #defaultParallelism. + val numParallelism = Math.min(paths.size, 10000) + + val statuses = sparkContext + .parallelize(serializedPaths, numParallelism) + .mapPartitions { paths => + val hadoopConf = serializableConfiguration.value + listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator + }.map { status => + // Turn FileStatus into SerializableFileStatus so we can send it back to the driver + val blockLocations = status match { + case f: LocatedFileStatus => + f.getBlockLocations.map { loc => + SerializableBlockLocation( + loc.getNames, + loc.getHosts, + loc.getOffset, + loc.getLength) + } + + case _ => + Array.empty[SerializableBlockLocation] + } + + SerializableFileStatus( + status.getPath.toString, + status.getLen, + status.isDirectory, + status.getReplication, + status.getBlockSize, + status.getModificationTime, + status.getAccessTime, + blockLocations) + }.collect() + + // Turn SerializableFileStatus back to Status + statuses.map { f => + val blockLocations = f.blockLocations.map { loc => + new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) + } + new LocatedFileStatus( + new FileStatus( + f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)), + blockLocations) + } + } + + /** + * List a single path, provided as a FileStatus, in serial. + */ + private def listLeafFiles0( + fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = { + logTrace(s"Listing $path") + val name = path.getName.toLowerCase + if (shouldFilterOut(name)) { + Seq.empty[FileStatus] + } else { + // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist + // Note that statuses only include FileStatus for the files and dirs directly under path, + // and does not include anything else recursively. + val statuses = try fs.listStatus(path) catch { + case _: FileNotFoundException => + logWarning(s"The directory $path was not found. Was it deleted very recently?") + Array.empty[FileStatus] + } + + val allLeafStatuses = { + val (dirs, files) = statuses.partition(_.isDirectory) + val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter)) + if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats + } + + allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map { + case f: LocatedFileStatus => + f + + // NOTE: + // + // - Although S3/S3A/S3N file system can be quite slow for remote file metadata + // operations, calling `getFileBlockLocations` does no harm here since these file system + // implementations don't actually issue RPC for this method. + // + // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not + // be a big deal since we always use to `listLeafFilesInParallel` when the number of + // paths exceeds threshold. 
+ case f => + // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), + // which is very slow on some file system (RawLocalFileSystem, which is launch a + // subprocess and parse the stdout). + val locations = fs.getFileBlockLocations(f, 0, f.getLen) + val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, + f.getModificationTime, 0, null, null, null, null, f.getPath, locations) + if (f.isSymlink) { + lfs.setSymlink(f.getSymlink) + } + lfs + } + } + } + + /** Checks if we should filter out this path name. */ + def shouldFilterOut(pathName: String): Boolean = { + // We filter everything that starts with _ and ., except _common_metadata and _metadata + // because Parquet needs to find those metadata files from leaf files returned by this method. + // We should refactor this logic to not mix metadata files with data files. + ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && + !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index 381261cf65ca0..81bdabb7afdab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -34,8 +34,8 @@ import org.apache.spark.sql.types._ // TODO: We should tighten up visibility of the classes here once we clean up Hive coupling. -object PartitionDirectory { - def apply(values: InternalRow, path: String): PartitionDirectory = +object PartitionPath { + def apply(values: InternalRow, path: String): PartitionPath = apply(values, new Path(path)) } @@ -43,14 +43,14 @@ object PartitionDirectory { * Holds a directory in a partitioned collection of files as well as as the partition values * in the form of a Row. Before scanning, the files at `path` need to be enumerated. */ -case class PartitionDirectory(values: InternalRow, path: Path) +case class PartitionPath(values: InternalRow, path: Path) case class PartitionSpec( partitionColumns: StructType, - partitions: Seq[PartitionDirectory]) + partitions: Seq[PartitionPath]) object PartitionSpec { - val emptySpec = PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[PartitionDirectory]) + val emptySpec = PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[PartitionPath]) } object PartitioningUtils { @@ -142,7 +142,7 @@ object PartitioningUtils { // Finally, we create `Partition`s based on paths and resolved partition values. val partitions = resolvedPartitionValues.zip(pathsWithPartitionValues).map { case (PartitionValues(_, literals), (path, _)) => - PartitionDirectory(InternalRow.fromSeq(literals.map(_.value)), path) + PartitionPath(InternalRow.fromSeq(literals.map(_.value)), path) } PartitionSpec(StructType(fields), partitions) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala deleted file mode 100644 index 4807a92c2e6b8..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalog.scala +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.datasources - -import java.io.FileNotFoundException - -import scala.collection.mutable - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs._ -import org.apache.hadoop.mapred.{FileInputFormat, JobConf} - -import org.apache.spark.internal.Logging -import org.apache.spark.metrics.source.HiveCatalogMetrics -import org.apache.spark.sql.SparkSession -import org.apache.spark.util.SerializableConfiguration - - -/** - * A base class for [[BasicFileCatalog]]s that need a [[SparkSession]] and the ability to find leaf - * files in a list of HDFS paths. - * - * @param sparkSession a [[SparkSession]] - * @param ignoreFileNotFound (see [[ListingFileCatalog]]) - */ -abstract class SessionFileCatalog(sparkSession: SparkSession) - extends BasicFileCatalog with Logging { - protected val hadoopConf: Configuration - - /** - * List leaf files of given paths. This method will submit a Spark job to do parallel - * listing whenever there is a path having more files than the parallel partition discovery - * discovery threshold. - * - * This is publicly visible for testing. - */ - def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { - val files = - if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { - SessionFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession) - } else { - SessionFileCatalog.listLeafFilesInSerial(paths, hadoopConf) - } - - HiveCatalogMetrics.incrementFilesDiscovered(files.size) - mutable.LinkedHashSet(files: _*) - } -} - -object SessionFileCatalog extends Logging { - - /** A serializable variant of HDFS's BlockLocation. */ - private case class SerializableBlockLocation( - names: Array[String], - hosts: Array[String], - offset: Long, - length: Long) - - /** A serializable variant of HDFS's FileStatus. */ - private case class SerializableFileStatus( - path: String, - length: Long, - isDir: Boolean, - blockReplication: Short, - blockSize: Long, - modificationTime: Long, - accessTime: Long, - blockLocations: Array[SerializableBlockLocation]) - - /** - * List a collection of path recursively. - */ - private def listLeafFilesInSerial( - paths: Seq[Path], - hadoopConf: Configuration): Seq[FileStatus] = { - // Dummy jobconf to get to the pathFilter defined in configuration - val jobConf = new JobConf(hadoopConf, this.getClass) - val filter = FileInputFormat.getInputPathFilter(jobConf) - - paths.flatMap { path => - val fs = path.getFileSystem(hadoopConf) - listLeafFiles0(fs, path, filter) - } - } - - /** - * List a collection of path recursively in parallel (using Spark executors). - * Each task launched will use [[listLeafFilesInSerial]] to list. 
- */ - private def listLeafFilesInParallel( - paths: Seq[Path], - hadoopConf: Configuration, - sparkSession: SparkSession): Seq[FileStatus] = { - assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) - logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") - - val sparkContext = sparkSession.sparkContext - val serializableConfiguration = new SerializableConfiguration(hadoopConf) - val serializedPaths = paths.map(_.toString) - - // Set the number of parallelism to prevent following file listing from generating many tasks - // in case of large #defaultParallelism. - val numParallelism = Math.min(paths.size, 10000) - - val statuses = sparkContext - .parallelize(serializedPaths, numParallelism) - .mapPartitions { paths => - val hadoopConf = serializableConfiguration.value - listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator - }.map { status => - // Turn FileStatus into SerializableFileStatus so we can send it back to the driver - val blockLocations = status match { - case f: LocatedFileStatus => - f.getBlockLocations.map { loc => - SerializableBlockLocation( - loc.getNames, - loc.getHosts, - loc.getOffset, - loc.getLength) - } - - case _ => - Array.empty[SerializableBlockLocation] - } - - SerializableFileStatus( - status.getPath.toString, - status.getLen, - status.isDirectory, - status.getReplication, - status.getBlockSize, - status.getModificationTime, - status.getAccessTime, - blockLocations) - }.collect() - - // Turn SerializableFileStatus back to Status - statuses.map { f => - val blockLocations = f.blockLocations.map { loc => - new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) - } - new LocatedFileStatus( - new FileStatus( - f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)), - blockLocations) - } - } - - /** - * List a single path, provided as a FileStatus, in serial. - */ - private def listLeafFiles0( - fs: FileSystem, path: Path, filter: PathFilter): Seq[FileStatus] = { - logTrace(s"Listing $path") - val name = path.getName.toLowerCase - if (shouldFilterOut(name)) { - Seq.empty[FileStatus] - } else { - // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist - // Note that statuses only include FileStatus for the files and dirs directly under path, - // and does not include anything else recursively. - val statuses = try fs.listStatus(path) catch { - case _: FileNotFoundException => - logWarning(s"The directory $path was not found. Was it deleted very recently?") - Array.empty[FileStatus] - } - - val allLeafStatuses = { - val (dirs, files) = statuses.partition(_.isDirectory) - val stats = files ++ dirs.flatMap(dir => listLeafFiles0(fs, dir.getPath, filter)) - if (filter != null) stats.filter(f => filter.accept(f.getPath)) else stats - } - - allLeafStatuses.filterNot(status => shouldFilterOut(status.getPath.getName)).map { - case f: LocatedFileStatus => - f - - // NOTE: - // - // - Although S3/S3A/S3N file system can be quite slow for remote file metadata - // operations, calling `getFileBlockLocations` does no harm here since these file system - // implementations don't actually issue RPC for this method. - // - // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not - // be a big deal since we always use to `listLeafFilesInParallel` when the number of - // paths exceeds threshold. 
- case f => - // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), - // which is very slow on some file system (RawLocalFileSystem, which is launch a - // subprocess and parse the stdout). - val locations = fs.getFileBlockLocations(f, 0, f.getLen) - val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, - f.getModificationTime, 0, null, null, null, null, f.getPath, locations) - if (f.isSymlink) { - lfs.setSymlink(f.getSymlink) - } - lfs - } - } - } - - /** Checks if we should filter out this path name. */ - def shouldFilterOut(pathName: String): Boolean = { - // We filter everything that starts with _ and ., except _common_metadata and _metadata - // because Parquet needs to find those metadata files from leaf files returned by this method. - // We should refactor this logic to not mix metadata files with data files. - ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && - !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala index a5c41b244589b..5648ab480a98a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.types.StructType /** - * A [[BasicFileCatalog]] for a metastore catalog table. + * A [[FileCatalog]] for a metastore catalog table. * * @param sparkSession a [[SparkSession]] * @param db the table's database name @@ -38,10 +38,9 @@ class TableFileCatalog( db: String, table: String, partitionSchema: Option[StructType], - override val sizeInBytes: Long) - extends SessionFileCatalog(sparkSession) { + override val sizeInBytes: Long) extends FileCatalog { - override protected val hadoopConf = sparkSession.sessionState.newHadoopConf + protected val hadoopConf = sparkSession.sessionState.newHadoopConf private val externalCatalog = sparkSession.sharedState.externalCatalog @@ -51,7 +50,7 @@ class TableFileCatalog( override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq - override def listFiles(filters: Seq[Expression]): Seq[Partition] = { + override def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] = { filterPartitions(filters).listFiles(Nil) } @@ -79,7 +78,7 @@ class TableFileCatalog( case Some(schema) => val selectedPartitions = externalCatalog.listPartitionsByFilter(db, table, filters) val partitions = selectedPartitions.map { p => - PartitionDirectory(p.toRow(schema), p.storage.locationUri.get) + PartitionPath(p.toRow(schema), p.storage.locationUri.get) } val partitionSpec = PartitionSpec(schema, partitions) new PrunedTableFileCatalog( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala index 2695974b84b00..9c43169cbf898 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala @@ -81,6 +81,16 @@ class FileCatalogSuite extends SharedSQLContext { } } + test("PartitioningAwareFileCatalog - file filtering") { + assert(!PartitioningAwareFileCatalog.shouldFilterOut("abcd")) 
+ assert(PartitioningAwareFileCatalog.shouldFilterOut(".ab")) + assert(PartitioningAwareFileCatalog.shouldFilterOut("_cd")) + assert(!PartitioningAwareFileCatalog.shouldFilterOut("_metadata")) + assert(!PartitioningAwareFileCatalog.shouldFilterOut("_common_metadata")) + assert(PartitioningAwareFileCatalog.shouldFilterOut("_ab_metadata")) + assert(PartitioningAwareFileCatalog.shouldFilterOut("_cd_common_metadata")) + } + test("SPARK-17613 - PartitioningAwareFileCatalog: base path w/o '/' at end") { class MockCatalog( override val rootPaths: Seq[Path]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala deleted file mode 100644 index df509583377ae..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SessionFileCatalogSuite.scala +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.datasources - -import org.apache.spark.SparkFunSuite - -class SessionFileCatalogSuite extends SparkFunSuite { - - test("file filtering") { - assert(!SessionFileCatalog.shouldFilterOut("abcd")) - assert(SessionFileCatalog.shouldFilterOut(".ab")) - assert(SessionFileCatalog.shouldFilterOut("_cd")) - - assert(!SessionFileCatalog.shouldFilterOut("_metadata")) - assert(!SessionFileCatalog.shouldFilterOut("_common_metadata")) - assert(SessionFileCatalog.shouldFilterOut("_ab_metadata")) - assert(SessionFileCatalog.shouldFilterOut("_cd_common_metadata")) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 2ef66baee1eac..f2a209e91962d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -30,7 +30,8 @@ import org.apache.parquet.hadoop.ParquetOutputFormat import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Literal -import org.apache.spark.sql.execution.datasources.{FileCatalog, HadoopFsRelation, LogicalRelation, PartitionDirectory => Partition, PartitioningUtils, PartitionSpec} +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.{PartitionPath => Partition} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -632,10 +633,11 @@ 
class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath) val queryExecution = spark.read.parquet(dir.getCanonicalPath).queryExecution queryExecution.analyzed.collectFirst { - case LogicalRelation(HadoopFsRelation(location: FileCatalog, _, _, _, _, _), _, _) => - assert(location.partitionSpec === PartitionSpec.emptySpec) + case LogicalRelation( + HadoopFsRelation(location: PartitioningAwareFileCatalog, _, _, _, _, _), _, _) => + assert(location.partitionSpec() === PartitionSpec.emptySpec) }.getOrElse { - fail(s"Expecting a ParquetRelation2, but got:\n$queryExecution") + fail(s"Expecting a matching HadoopFsRelation, but got:\n$queryExecution") } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 4a2aaa7d4f6ca..16e1e37b2fb02 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.execution.command.DDLUtils -import org.apache.spark.sql.execution.datasources.{Partition => _, _} +import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetOptions} import org.apache.spark.sql.hive.orc.OrcFileFormat import org.apache.spark.sql.types._ From bfe7885aee2f406c1bbde08e30809a0b4bb070d2 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Tue, 18 Oct 2016 13:36:00 -0700 Subject: [PATCH 046/162] [SPARK-17985][CORE] Bump commons-lang3 version to 3.5. ## What changes were proposed in this pull request? `SerializationUtils.clone()` of commons-lang3 (<3.5) has a bug that breaks thread safety, which gets stack sometimes caused by race condition of initializing hash map. See https://issues.apache.org/jira/browse/LANG-1251. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #15525 from ueshin/issues/SPARK-17985. 
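For context, the LANG-1251 race shows up when several threads clone the same object at once: commons-lang3 before 3.5 lazily initializes an internal map inside `SerializationUtils`, so concurrent first calls can get stuck. The sketch below only illustrates the affected call pattern; `Payload` is a made-up Serializable class, not code from this patch.

```scala
import org.apache.commons.lang3.SerializationUtils

// Hypothetical payload; Scala case classes are Serializable by default.
case class Payload(id: Int, name: String)

object CloneConcurrently {
  def main(args: Array[String]): Unit = {
    val original = Payload(1, "a")
    val threads = (1 to 8).map { _ =>
      new Thread(new Runnable {
        // With commons-lang3 < 3.5, concurrent first clones could race on the
        // lazily initialized internal map and hang; 3.5 fixes the initialization.
        override def run(): Unit = SerializationUtils.clone(original)
      })
    }
    threads.foreach(_.start())
    threads.foreach(_.join())
  }
}
```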
--- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- docs/streaming-flume-integration.md | 4 ++-- pom.xml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index b30f8c347c0af..525dcef5b7d99 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -33,7 +33,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math-2.1.jar commons-math3-3.4.1.jar diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 5b3a7651dd299..562fe6461e753 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index e323efe30f64b..747521aa2a566 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 77d97e5365b9f..afd4502c59d33 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 572edfa0cc29e..687b855b649d8 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar diff --git a/docs/streaming-flume-integration.md b/docs/streaming-flume-integration.md index 767e1f9402e01..a5d36da5b6de9 100644 --- a/docs/streaming-flume-integration.md +++ b/docs/streaming-flume-integration.md @@ -115,11 +115,11 @@ Configuring Flume on the chosen machine requires the following two steps. artifactId = scala-library version = {{site.SCALA_VERSION}} - (iii) *Commons Lang 3 JAR*: Download the Commons Lang 3 JAR. It can be found with the following artifact detail (or, [direct link](http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-lang3/3.3.2/commons-lang3-3.3.2.jar)). + (iii) *Commons Lang 3 JAR*: Download the Commons Lang 3 JAR. It can be found with the following artifact detail (or, [direct link](http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-lang3/3.5/commons-lang3-3.5.jar)). groupId = org.apache.commons artifactId = commons-lang3 - version = 3.3.2 + version = 3.5 2. **Configuration file**: On that machine, configure Flume agent to send data to an Avro sink by having the following in the configuration file. 
diff --git a/pom.xml b/pom.xml index 7d13c51b2a596..aaf7cfa7eb2ad 100644 --- a/pom.xml +++ b/pom.xml @@ -168,7 +168,7 @@ 2.6 - 3.3.2 + 3.5 3.2.10 3.0.0 2.22.2 From 20dd11096cfda51e47b9dbe3b715a12ccbb4ce1d Mon Sep 17 00:00:00 2001 From: Weiqing Yang Date: Tue, 18 Oct 2016 13:38:14 -0700 Subject: [PATCH 047/162] [MINOR][DOC] Add more built-in sources in sql-programming-guide.md ## What changes were proposed in this pull request? Add more built-in sources in sql-programming-guide.md. ## How was this patch tested? Manually. Author: Weiqing Yang Closes #15522 from weiqingy/dsDoc. --- docs/sql-programming-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index dcc828cc69fed..3f1b73a830eca 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -422,8 +422,8 @@ In the simplest form, the default data source (`parquet` unless otherwise config You can also manually specify the data source that will be used along with any extra options that you would like to pass to the data source. Data sources are specified by their fully qualified name (i.e., `org.apache.spark.sql.parquet`), but for built-in sources you can also use their short -names (`json`, `parquet`, `jdbc`). DataFrames loaded from any data source type can be converted into other types -using this syntax. +names (`json`, `parquet`, `jdbc`, `orc`, `libsvm`, `csv`, `text`). DataFrames loaded from any data +source type can be converted into other types using this syntax.
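A quick sketch of the added short names in use, assuming an active `spark` session; the paths are placeholders.

```scala
// Load through one built-in short name, then write through another.
val df = spark.read
  .format("csv")
  .option("header", "true")
  .load("/path/to/people.csv")

df.write.format("orc").save("/path/to/people_orc")
```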
    From 4518642abd71bb1213a9efd72732102abf0bf7e7 Mon Sep 17 00:00:00 2001 From: Guoqiang Li Date: Tue, 18 Oct 2016 13:46:57 -0700 Subject: [PATCH 048/162] [SPARK-17930][CORE] The SerializerInstance instance used when deserializing a TaskResult is not reused ## What changes were proposed in this pull request? The following code is called when the DirectTaskResult instance is deserialized ```scala def value(): T = { if (valueObjectDeserialized) { valueObject } else { // Each deserialization creates a new instance of SerializerInstance, which is very time-consuming val resultSer = SparkEnv.get.serializer.newInstance() valueObject = resultSer.deserialize(valueBytes) valueObjectDeserialized = true valueObject } } ``` In the case of stage has a lot of tasks, reuse SerializerInstance instance can improve the scheduling performance of three times The test data is TPC-DS 2T (Parquet) and SQL statement as follows (query 2): ```sql select i_item_id, avg(ss_quantity) agg1, avg(ss_list_price) agg2, avg(ss_coupon_amt) agg3, avg(ss_sales_price) agg4 from store_sales, customer_demographics, date_dim, item, promotion where ss_sold_date_sk = d_date_sk and ss_item_sk = i_item_sk and ss_cdemo_sk = cd_demo_sk and ss_promo_sk = p_promo_sk and cd_gender = 'M' and cd_marital_status = 'M' and cd_education_status = '4 yr Degree' and (p_channel_email = 'N' or p_channel_event = 'N') and d_year = 2001 group by i_item_id order by i_item_id limit 100; ``` `spark-defaults.conf` file: ``` spark.master yarn-client spark.executor.instances 20 spark.driver.memory 16g spark.executor.memory 30g spark.executor.cores 5 spark.default.parallelism 100 spark.sql.shuffle.partitions 100000 spark.serializer org.apache.spark.serializer.KryoSerializer spark.driver.maxResultSize 0 spark.rpc.netty.dispatcher.numThreads 8 spark.executor.extraJavaOptions -XX:+UseG1GC -XX:+UseStringDeduplication -XX:G1HeapRegionSize=16M -XX:MetaspaceSize=256M spark.cleaner.referenceTracking.blocking true spark.cleaner.referenceTracking.blocking.shuffle true ``` Performance test results are as follows [SPARK-17930](https://github.com/witgo/spark/tree/SPARK-17930)| [ed14633](https://github.com/witgo/spark/commit/ed1463341455830b8867b721a1b34f291139baf3]) ------------ | ------------- 54.5 s|231.7 s ## How was this patch tested? Existing tests. Author: Guoqiang Li Closes #15512 from witgo/SPARK-17930. --- .../scala/org/apache/spark/scheduler/TaskResult.scala | 9 +++++---- .../org/apache/spark/scheduler/TaskResultGetter.scala | 10 +++++++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala index 77fda6fcff959..366b92c5f2ada 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala @@ -23,6 +23,7 @@ import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkEnv +import org.apache.spark.serializer.SerializerInstance import org.apache.spark.storage.BlockId import org.apache.spark.util.{AccumulatorV2, Utils} @@ -77,14 +78,14 @@ private[spark] class DirectTaskResult[T]( * * After the first time, `value()` is trivial and just returns the deserialized `valueObject`. */ - def value(): T = { + def value(resultSer: SerializerInstance = null): T = { if (valueObjectDeserialized) { valueObject } else { // This should not run when holding a lock because it may cost dozens of seconds for a large - // value. 
- val resultSer = SparkEnv.get.serializer.newInstance() - valueObject = resultSer.deserialize(valueBytes) + // value + val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer + valueObject = ser.deserialize(valueBytes) valueObjectDeserialized = true valueObject } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala index 1c3fcbd4612a0..b1addc128e696 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala @@ -48,6 +48,12 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul } } + protected val taskResultSerializer = new ThreadLocal[SerializerInstance] { + override def initialValue(): SerializerInstance = { + sparkEnv.serializer.newInstance() + } + } + def enqueueSuccessfulTask( taskSetManager: TaskSetManager, tid: Long, @@ -63,7 +69,7 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul // deserialize "value" without holding any lock so that it won't block other threads. // We should call it here, so that when it's called again in // "TaskSetManager.handleSuccessfulTask", it does not need to deserialize the value. - directResult.value() + directResult.value(taskResultSerializer.get()) (directResult, serializedData.limit()) case IndirectTaskResult(blockId, size) => if (!taskSetManager.canFetchMoreResults(size)) { @@ -84,6 +90,8 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul } val deserializedResult = serializer.get().deserialize[DirectTaskResult[_]]( serializedTaskResult.get.toByteBuffer) + // force deserialization of referenced value + deserializedResult.value(taskResultSerializer.get()) sparkEnv.blockManager.master.removeBlock(blockId) (deserializedResult, size) } From b3130c7b6a1ab4975023f08c3ab02ee8d2c7e995 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 18 Oct 2016 13:49:02 -0700 Subject: [PATCH 049/162] [SPARK-17955][SQL] Make DataFrameReader.jdbc call DataFrameReader.format("jdbc").load ## What changes were proposed in this pull request? This PR proposes to make `DataFrameReader.jdbc` call `DataFrameReader.format("jdbc").load` consistently with other APIs in `DataFrameReader`/`DataFrameWriter` and avoid calling `sparkSession.baseRelationToDataFrame(..)` here and there. The changes were mostly copied from `DataFrameWriter.jdbc()` which was recently updated. ```diff - val params = extraOptions.toMap ++ connectionProperties.asScala.toMap - val options = new JDBCOptions(url, table, params) - val relation = JDBCRelation(parts, options)(sparkSession) - sparkSession.baseRelationToDataFrame(relation) + this.extraOptions = this.extraOptions ++ connectionProperties.asScala + // explicit url and dbtable should override all + this.extraOptions += ("url" -> url, "dbtable" -> table) + format("jdbc").load() ``` ## How was this patch tested? Existing tests should cover this. Author: hyukjinkwon Closes #15499 from HyukjinKwon/SPARK-17955. 
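For reference, a sketch of the two read paths that now share the same code underneath; the URL, table name and properties are placeholders.

```scala
import java.util.Properties

val props = new Properties()
props.setProperty("user", "test")

// Convenience API:
val df1 = spark.read.jdbc("jdbc:postgresql://host/db", "my_table", props)

// Generic source API that the convenience method now delegates to:
val df2 = spark.read.format("jdbc")
  .option("url", "jdbc:postgresql://host/db")
  .option("dbtable", "my_table")
  .option("user", "test")
  .load()
```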
--- .../main/scala/org/apache/spark/sql/DataFrameReader.scala | 8 ++++---- .../main/scala/org/apache/spark/sql/DataFrameWriter.scala | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index ac3358592202f..b7b2203cdd85b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -232,10 +232,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { parts: Array[Partition], connectionProperties: Properties): DataFrame = { // connectionProperties should override settings in extraOptions. - val params = extraOptions.toMap ++ connectionProperties.asScala.toMap - val options = new JDBCOptions(url, table, params) - val relation = JDBCRelation(parts, options)(sparkSession) - sparkSession.baseRelationToDataFrame(relation) + this.extraOptions = this.extraOptions ++ connectionProperties.asScala + // explicit url and dbtable should override all + this.extraOptions += ("url" -> url, "dbtable" -> table) + format("jdbc").load() } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 35ef050dcb169..5be3277651d02 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -426,8 +426,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { def jdbc(url: String, table: String, connectionProperties: Properties): Unit = { assertNotPartitioned("jdbc") assertNotBucketed("jdbc") - // connectionProperties should override settings in extraOptions - this.extraOptions = this.extraOptions ++ (connectionProperties.asScala) + // connectionProperties should override settings in extraOptions. + this.extraOptions = this.extraOptions ++ connectionProperties.asScala // explicit url and dbtable should override all this.extraOptions += ("url" -> url, "dbtable" -> table) format("jdbc").save() From cd662bc7a2050264f40650442858a85c4827b608 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 18 Oct 2016 13:56:35 -0700 Subject: [PATCH 050/162] Revert "[SPARK-17985][CORE] Bump commons-lang3 version to 3.5." This reverts commit bfe7885aee2f406c1bbde08e30809a0b4bb070d2. 
The commit caused build failures on Hadoop 2.2 profile: ``` [error] /scratch/rxin/spark/core/src/main/scala/org/apache/spark/util/Utils.scala:1489: value read is not a member of object org.apache.commons.io.IOUtils [error] var numBytes = IOUtils.read(gzInputStream, buf) [error] ^ [error] /scratch/rxin/spark/core/src/main/scala/org/apache/spark/util/Utils.scala:1492: value read is not a member of object org.apache.commons.io.IOUtils [error] numBytes = IOUtils.read(gzInputStream, buf) [error] ^ ``` --- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- docs/streaming-flume-integration.md | 4 ++-- pom.xml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index 525dcef5b7d99..b30f8c347c0af 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -33,7 +33,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math-2.1.jar commons-math3-3.4.1.jar diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 562fe6461e753..5b3a7651dd299 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index 747521aa2a566..e323efe30f64b 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index afd4502c59d33..77d97e5365b9f 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 687b855b649d8..572edfa0cc29e 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.5.jar +commons-lang3-3.3.2.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar diff --git a/docs/streaming-flume-integration.md b/docs/streaming-flume-integration.md index a5d36da5b6de9..767e1f9402e01 100644 --- a/docs/streaming-flume-integration.md +++ b/docs/streaming-flume-integration.md @@ -115,11 +115,11 @@ Configuring Flume on the chosen machine requires the following two steps. artifactId = scala-library version = {{site.SCALA_VERSION}} - (iii) *Commons Lang 3 JAR*: Download the Commons Lang 3 JAR. It can be found with the following artifact detail (or, [direct link](http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-lang3/3.5/commons-lang3-3.5.jar)). 
+ (iii) *Commons Lang 3 JAR*: Download the Commons Lang 3 JAR. It can be found with the following artifact detail (or, [direct link](http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-lang3/3.3.2/commons-lang3-3.3.2.jar)). groupId = org.apache.commons artifactId = commons-lang3 - version = 3.5 + version = 3.3.2 2. **Configuration file**: On that machine, configure Flume agent to send data to an Avro sink by having the following in the configuration file. diff --git a/pom.xml b/pom.xml index aaf7cfa7eb2ad..7d13c51b2a596 100644 --- a/pom.xml +++ b/pom.xml @@ -168,7 +168,7 @@ 2.6 - 3.5 + 3.3.2 3.2.10 3.0.0 2.22.2 From cd106b050ff789b6de539956a7f01159ab15c820 Mon Sep 17 00:00:00 2001 From: cody koeninger Date: Tue, 18 Oct 2016 14:01:49 -0700 Subject: [PATCH 051/162] [SPARK-17841][STREAMING][KAFKA] drain commitQueue ## What changes were proposed in this pull request? Actually drain commit queue rather than just iterating it. iterator() on a concurrent linked queue won't remove items from the queue, poll() will. ## How was this patch tested? Unit tests Author: cody koeninger Closes #15407 from koeninger/SPARK-17841. --- .../spark/streaming/kafka010/DirectKafkaInputDStream.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala index 432537ebf05b2..7e57bb18cbd50 100644 --- a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala +++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala @@ -282,13 +282,13 @@ private[spark] class DirectKafkaInputDStream[K, V]( protected def commitAll(): Unit = { val m = new ju.HashMap[TopicPartition, OffsetAndMetadata]() - val it = commitQueue.iterator() - while (it.hasNext) { - val osr = it.next + var osr = commitQueue.poll() + while (null != osr) { val tp = osr.topicPartition val x = m.get(tp) val offset = if (null == x) { osr.untilOffset } else { Math.max(x.offset, osr.untilOffset) } m.put(tp, new OffsetAndMetadata(offset)) + osr = commitQueue.poll() } if (!m.isEmpty) { consumer.commitAsync(m, commitCallback.get) From 1e35e969305555dda02cb0788c8143e5f2e1944b Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 18 Oct 2016 14:25:10 -0700 Subject: [PATCH 052/162] [SPARK-17817] [PYSPARK] [FOLLOWUP] PySpark RDD Repartitioning Results in Highly Skewed Partition Sizes ## What changes were proposed in this pull request? This change is a followup for #15389 which calls `_to_java_object_rdd()` to solve this issue. Due to the concern of the possible expensive cost of the call, we can choose to decrease the batch size to solve this issue too. Simple benchmark: import time num_partitions = 20000 a = sc.parallelize(range(int(1e6)), 2) start = time.time() l = a.repartition(num_partitions).glom().map(len).collect() end = time.time() print(end - start) Before: 419.447577953 _to_java_object_rdd(): 421.916361094 decreasing the batch size: 423.712255955 ## How was this patch tested? Jenkins tests. Author: Liang-Chi Hsieh Closes #15445 from viirya/repartition-batch-size. 
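The benchmark above measures skew with `glom()`; the same per-partition size check, sketched on the Scala side purely for illustration (the fix itself lives in the Python serializer path).

```scala
// Count elements per partition after a shuffled repartition and report the
// spread; heavily skewed output shows up as a large gap between min and max.
val rdd = spark.sparkContext.parallelize(1 to 1000000, 2)
val sizes = rdd.repartition(200).glom().map(_.length).collect()
println(s"partitions=${sizes.length}, min=${sizes.min}, max=${sizes.max}")
```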
--- python/pyspark/rdd.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 0e2ae19ca39aa..2de2c2fd1a60b 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2029,12 +2029,12 @@ def coalesce(self, numPartitions, shuffle=False): [[1, 2, 3, 4, 5]] """ if shuffle: - # In Scala's repartition code, we will distribute elements evenly across output - # partitions. However, the RDD from Python is serialized as a single binary data, - # so the distribution fails and produces highly skewed partitions. We need to - # convert it to a RDD of java object before repartitioning. - data_java_rdd = self._to_java_object_rdd().coalesce(numPartitions, shuffle) - jrdd = self.ctx._jvm.SerDeUtil.javaToPython(data_java_rdd) + # Decrease the batch size in order to distribute evenly the elements across output + # partitions. Otherwise, repartition will possibly produce highly skewed partitions. + batchSize = min(10, self.ctx._batchSize or 1024) + ser = BatchedSerializer(PickleSerializer(), batchSize) + selfCopy = self._reserialize(ser) + jrdd = selfCopy._jrdd.coalesce(numPartitions, shuffle) else: jrdd = self._jrdd.coalesce(numPartitions, shuffle) return RDD(jrdd, self.ctx, self._jrdd_deserializer) From 941b3f9aca59e62c078508a934f8c2221ced96ce Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 18 Oct 2016 17:32:16 -0700 Subject: [PATCH 053/162] [SPARK-17731][SQL][STREAMING][FOLLOWUP] Refactored StreamingQueryListener APIs ## What changes were proposed in this pull request? As per rxin request, here are further API changes - Changed `Stream(Started/Progress/Terminated)` events to `Stream*Event` - Changed the fields in `StreamingQueryListener.on***` from `query*` to `event` ## How was this patch tested? Existing unit tests. Author: Tathagata Das Closes #15530 from tdas/SPARK-17731-1. 
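A minimal sketch of a listener written against the renamed API (event classes as introduced in the diff that follows), assuming an active `spark` session.

```scala
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener._

val listener = new StreamingQueryListener {
  override def onQueryStarted(event: QueryStartedEvent): Unit =
    println(s"query started: ${event.queryStatus.name}")
  override def onQueryProgress(event: QueryProgressEvent): Unit =
    println(s"query made progress: ${event.queryStatus.name}")
  override def onQueryTerminated(event: QueryTerminatedEvent): Unit =
    println(s"query terminated, error = ${event.exception.getOrElse("none")}")
}
spark.streams.addListener(listener)
```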
--- project/MimaExcludes.scala | 9 +++++++++ .../sql/execution/streaming/StreamExecution.scala | 15 ++++++++------- .../streaming/StreamingQueryListenerBus.scala | 8 ++++---- .../sql/streaming/StreamingQueryListener.scala | 14 +++++++------- .../apache/spark/sql/streaming/StreamTest.scala | 6 +++--- .../streaming/StreamingQueryListenerSuite.scala | 13 +++++++------ .../spark/sql/streaming/StreamingQuerySuite.scala | 6 +++--- 7 files changed, 41 insertions(+), 30 deletions(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 1349af4219c16..facf034ea7e7d 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -68,6 +68,15 @@ object MimaExcludes { ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryListener#QueryProgress.this"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryListener#QueryProgress.queryInfo"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryListener#QueryTerminated.queryInfo"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.streaming.StreamingQueryListener$QueryStarted"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgress"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryListener.onQueryStarted"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryListener.onQueryStarted"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryListener.onQueryProgress"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryListener.onQueryProgress"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryListener.onQueryTerminated"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryListener.onQueryTerminated"), // [SPARK-17338][SQL] add global temp view ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.dropGlobalTempView"), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 9144736c940f5..ba8cf808e339c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -165,7 +165,7 @@ class StreamExecution( new Path(new Path(checkpointRoot), name).toUri.toString /** - * Starts the execution. This returns only after the thread has started and [[QueryStarted]] event + * Starts the execution. This returns only after the thread has started and [[QueryStartedEvent]] * has been posted to all the listeners. */ def start(): Unit = { @@ -177,9 +177,10 @@ class StreamExecution( /** * Repeatedly attempts to run batches as data arrives. * - * Note that this method ensures that [[QueryStarted]] and [[QueryTerminated]] events are posted - * such that listeners are guaranteed to get a start event before a termination. 
Furthermore, this - * method also ensures that [[QueryStarted]] event is posted before the `start()` method returns. + * Note that this method ensures that [[QueryStartedEvent]] and [[QueryTerminatedEvent]] are + * posted such that listeners are guaranteed to get a start event before a termination. + * Furthermore, this method also ensures that [[QueryStartedEvent]] event is posted before the + * `start()` method returns. */ private def runBatches(): Unit = { try { @@ -190,7 +191,7 @@ class StreamExecution( sparkSession.sparkContext.env.metricsSystem.registerSource(streamMetrics) } updateStatus() - postEvent(new QueryStarted(currentStatus)) // Assumption: Does not throw exception. + postEvent(new QueryStartedEvent(currentStatus)) // Assumption: Does not throw exception. // Unblock starting thread startLatch.countDown() @@ -232,7 +233,7 @@ class StreamExecution( // Update metrics and notify others streamMetrics.reportTriggerFinished() updateStatus() - postEvent(new QueryProgress(currentStatus)) + postEvent(new QueryProgressEvent(currentStatus)) isTerminated }) } catch { @@ -260,7 +261,7 @@ class StreamExecution( // Notify others sparkSession.streams.notifyQueryTermination(StreamExecution.this) postEvent( - new QueryTerminated(currentStatus, exception.map(_.cause).map(Utils.exceptionString))) + new QueryTerminatedEvent(currentStatus, exception.map(_.cause).map(Utils.exceptionString))) terminationLatch.countDown() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala index 1e663956f980b..fc2190d39da4f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala @@ -40,7 +40,7 @@ class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus) */ def post(event: StreamingQueryListener.Event) { event match { - case s: QueryStarted => + case s: QueryStartedEvent => postToAll(s) case _ => sparkListenerBus.post(event) @@ -59,11 +59,11 @@ class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus) listener: StreamingQueryListener, event: StreamingQueryListener.Event): Unit = { event match { - case queryStarted: QueryStarted => + case queryStarted: QueryStartedEvent => listener.onQueryStarted(queryStarted) - case queryProgress: QueryProgress => + case queryProgress: QueryProgressEvent => listener.onQueryProgress(queryProgress) - case queryTerminated: QueryTerminated => + case queryTerminated: QueryTerminatedEvent => listener.onQueryTerminated(queryTerminated) case _ => } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala index 69790e33b2168..9e311fae842be 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala @@ -41,7 +41,7 @@ abstract class StreamingQueryListener { * don't block this method as it will block your query. * @since 2.0.0 */ - def onQueryStarted(queryStarted: QueryStarted): Unit + def onQueryStarted(event: QueryStartedEvent): Unit /** * Called when there is some status update (ingestion rate updated, etc.) @@ -49,16 +49,16 @@ abstract class StreamingQueryListener { * @note This method is asynchronous. 
The status in [[StreamingQuery]] will always be * latest no matter when this method is called. Therefore, the status of [[StreamingQuery]] * may be changed before/when you process the event. E.g., you may find [[StreamingQuery]] - * is terminated when you are processing [[QueryProgress]]. + * is terminated when you are processing [[QueryProgressEvent]]. * @since 2.0.0 */ - def onQueryProgress(queryProgress: QueryProgress): Unit + def onQueryProgress(event: QueryProgressEvent): Unit /** * Called when a query is stopped, with or without error. * @since 2.0.0 */ - def onQueryTerminated(queryTerminated: QueryTerminated): Unit + def onQueryTerminated(event: QueryTerminatedEvent): Unit } @@ -84,7 +84,7 @@ object StreamingQueryListener { * @since 2.0.0 */ @Experimental - class QueryStarted private[sql](val queryStatus: StreamingQueryStatus) extends Event + class QueryStartedEvent private[sql](val queryStatus: StreamingQueryStatus) extends Event /** * :: Experimental :: @@ -92,7 +92,7 @@ object StreamingQueryListener { * @since 2.0.0 */ @Experimental - class QueryProgress private[sql](val queryStatus: StreamingQueryStatus) extends Event + class QueryProgressEvent private[sql](val queryStatus: StreamingQueryStatus) extends Event /** * :: Experimental :: @@ -104,7 +104,7 @@ object StreamingQueryListener { * @since 2.0.0 */ @Experimental - class QueryTerminated private[sql]( + class QueryTerminatedEvent private[sql]( val queryStatus: StreamingQueryStatus, val exception: Option[String]) extends Event } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 8dfeb8da4b826..742833065144d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -684,20 +684,20 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { } - override def onQueryStarted(queryStarted: QueryStarted): Unit = { + override def onQueryStarted(queryStarted: QueryStartedEvent): Unit = { asyncTestWaiter { startStatus = queryStarted.queryStatus } } - override def onQueryProgress(queryProgress: QueryProgress): Unit = { + override def onQueryProgress(queryProgress: QueryProgressEvent): Unit = { asyncTestWaiter { assert(startStatus != null, "onQueryProgress called before onQueryStarted") synchronized { progressStatuses += queryProgress.queryStatus } } } - override def onQueryTerminated(queryTerminated: QueryTerminated): Unit = { + override def onQueryTerminated(queryTerminated: QueryTerminatedEvent): Unit = { asyncTestWaiter { assert(startStatus != null, "onQueryTerminated called before onQueryStarted") terminationStatus = queryTerminated.queryStatus diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index 623f66a778eac..ff843865a017e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -177,30 +177,31 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { } test("QueryStarted serialization") { - val queryStarted = new StreamingQueryListener.QueryStarted(StreamingQueryStatus.testStatus) + val queryStarted = new StreamingQueryListener.QueryStartedEvent(StreamingQueryStatus.testStatus) val json = 
JsonProtocol.sparkEventToJson(queryStarted) val newQueryStarted = JsonProtocol.sparkEventFromJson(json) - .asInstanceOf[StreamingQueryListener.QueryStarted] + .asInstanceOf[StreamingQueryListener.QueryStartedEvent] assertStreamingQueryInfoEquals(queryStarted.queryStatus, newQueryStarted.queryStatus) } test("QueryProgress serialization") { - val queryProcess = new StreamingQueryListener.QueryProgress(StreamingQueryStatus.testStatus) + val queryProcess = new StreamingQueryListener.QueryProgressEvent( + StreamingQueryStatus.testStatus) val json = JsonProtocol.sparkEventToJson(queryProcess) val newQueryProcess = JsonProtocol.sparkEventFromJson(json) - .asInstanceOf[StreamingQueryListener.QueryProgress] + .asInstanceOf[StreamingQueryListener.QueryProgressEvent] assertStreamingQueryInfoEquals(queryProcess.queryStatus, newQueryProcess.queryStatus) } test("QueryTerminated serialization") { val exception = new RuntimeException("exception") - val queryQueryTerminated = new StreamingQueryListener.QueryTerminated( + val queryQueryTerminated = new StreamingQueryListener.QueryTerminatedEvent( StreamingQueryStatus.testStatus, Some(exception.getMessage)) val json = JsonProtocol.sparkEventToJson(queryQueryTerminated) val newQueryTerminated = JsonProtocol.sparkEventFromJson(json) - .asInstanceOf[StreamingQueryListener.QueryTerminated] + .asInstanceOf[StreamingQueryListener.QueryTerminatedEvent] assertStreamingQueryInfoEquals(queryQueryTerminated.queryStatus, newQueryTerminated.queryStatus) assert(queryQueryTerminated.exception === newQueryTerminated.exception) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 9f8e2db966367..92020be9789fe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -290,11 +290,11 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging { // A StreamingQueryListener that gets the query status after the first completed trigger val listener = new StreamingQueryListener { @volatile var firstStatus: StreamingQueryStatus = null - override def onQueryStarted(queryStarted: QueryStarted): Unit = { } - override def onQueryProgress(queryProgress: QueryProgress): Unit = { + override def onQueryStarted(queryStarted: QueryStartedEvent): Unit = { } + override def onQueryProgress(queryProgress: QueryProgressEvent): Unit = { if (firstStatus == null) firstStatus = queryProgress.queryStatus } - override def onQueryTerminated(queryTerminated: QueryTerminated): Unit = { } + override def onQueryTerminated(queryTerminated: QueryTerminatedEvent): Unit = { } } try { From 5f20ae0394388574a3767daf7f499c89658f61e1 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 19 Oct 2016 10:20:12 +0800 Subject: [PATCH 054/162] [SPARK-17980][SQL] Fix refreshByPath for converted Hive tables ## What changes were proposed in this pull request? There was a bug introduced in https://github.com/apache/spark/pull/14690 which broke refreshByPath with converted hive tables (though, it turns out it was very difficult to refresh converted hive tables anyways, since you had to specify the exact path of one of the partitions). This changes refreshByPath to invalidate by prefix instead of exact match, and fixes the issue. cc sameeragarwal for refreshByPath changes mallman ## How was this patch tested? Extended unit test. 
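To make the prefix-matching behavior concrete, here is a minimal standalone sketch (an illustrative helper only, not the actual `CacheManager` code, which appears in the diff below):

```scala
import org.apache.hadoop.fs.{FileSystem, Path}

// Sketch: a relation rooted anywhere under the refreshed path gets invalidated.
// Both sides are qualified against the same FileSystem before comparing strings.
def shouldInvalidate(rootPaths: Seq[Path], refreshedPath: Path, fs: FileSystem): Boolean = {
  val prefix = refreshedPath.makeQualified(fs.getUri, fs.getWorkingDirectory).toString
  rootPaths
    .map(_.makeQualified(fs.getUri, fs.getWorkingDirectory).toString)
    .exists(_.startsWith(prefix))
}
```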
Author: Eric Liang Closes #15521 from ericl/fix-caching. --- .../apache/spark/sql/catalog/Catalog.scala | 3 ++- .../spark/sql/execution/CacheManager.scala | 5 +++-- .../datasources/TableFileCatalog.scala | 18 ++++++++++++---- .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- .../sql/hive/HiveMetadataCacheSuite.scala | 21 +++++++++++++++++-- 5 files changed, 39 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala index 889b8a02784d6..aecdda1c36498 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala @@ -343,7 +343,8 @@ abstract class Catalog { /** * Invalidate and refresh all the cached data (and the associated metadata) for any dataframe that - * contains the given data source path. + * contains the given data source path. Path matching is by prefix, i.e. "/" would invalidate + * everything that is cached. * * @since 2.0.0 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index 92fd366e101fd..fb72c679e3628 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -185,9 +185,10 @@ class CacheManager extends Logging { plan match { case lr: LogicalRelation => lr.relation match { case hr: HadoopFsRelation => + val prefixToInvalidate = qualifiedPath.toString val invalidate = hr.location.rootPaths - .map(_.makeQualified(fs.getUri, fs.getWorkingDirectory)) - .contains(qualifiedPath) + .map(_.makeQualified(fs.getUri, fs.getWorkingDirectory).toString) + .exists(_.startsWith(prefixToInvalidate)) if (invalidate) hr.location.refresh() invalidate case _ => false diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala index 5648ab480a98a..fc08c3798ee06 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala @@ -48,13 +48,18 @@ class TableFileCatalog( private val baseLocation = catalogTable.storage.locationUri + // Populated on-demand by calls to cachedAllPartitions + private var cachedAllPartitions: ListingFileCatalog = null + override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq override def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] = { filterPartitions(filters).listFiles(Nil) } - override def refresh(): Unit = {} + override def refresh(): Unit = synchronized { + cachedAllPartitions = null + } /** * Returns a [[ListingFileCatalog]] for this table restricted to the subset of partitions @@ -64,7 +69,7 @@ class TableFileCatalog( */ def filterPartitions(filters: Seq[Expression]): ListingFileCatalog = { if (filters.isEmpty) { - cachedAllPartitions + allPartitions } else { filterPartitions0(filters) } @@ -89,9 +94,14 @@ class TableFileCatalog( } // Not used in the hot path of queries when metastore partition pruning is enabled - lazy val cachedAllPartitions: ListingFileCatalog = filterPartitions0(Nil) + def allPartitions: ListingFileCatalog = synchronized { + if (cachedAllPartitions == null) { + cachedAllPartitions = filterPartitions0(Nil) + } + 
cachedAllPartitions + } - override def inputFiles: Array[String] = cachedAllPartitions.inputFiles + override def inputFiles: Array[String] = allPartitions.inputFiles } /** diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 16e1e37b2fb02..c909eb5d20bcd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -235,7 +235,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log if (lazyPruningEnabled) { catalog } else { - catalog.cachedAllPartitions + catalog.allPartitions } } val partitionSchemaColumnNames = partitionSchema.map(_.name.toLowerCase).toSet diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala index 7af81a3a90504..2ca1cd4c07fdb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala @@ -80,9 +80,13 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi val df = spark.sql("select * from test") assert(sql("select * from test").count() == 5) + def deleteRandomFile(): Unit = { + val p = new Path(spark.table("test").inputFiles.head) + assert(p.getFileSystem(hiveContext.sessionState.newHadoopConf()).delete(p, true)) + } + // Delete a file, then assert that we tried to read it. This means the table was cached. - val p = new Path(spark.table("test").inputFiles.head) - assert(p.getFileSystem(hiveContext.sessionState.newHadoopConf()).delete(p, true)) + deleteRandomFile() val e = intercept[SparkException] { sql("select * from test").count() } @@ -91,6 +95,19 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi // Test refreshing the cache. spark.catalog.refreshTable("test") assert(sql("select * from test").count() == 4) + assert(spark.table("test").inputFiles.length == 4) + + // Test refresh by path separately since it goes through different code paths than + // refreshTable does. + deleteRandomFile() + spark.catalog.cacheTable("test") + spark.catalog.refreshByPath("/some-invalid-path") // no-op + val e2 = intercept[SparkException] { + sql("select * from test").count() + } + assert(e2.getMessage.contains("FileNotFoundException")) + spark.catalog.refreshByPath(dir.getAbsolutePath) + assert(sql("select * from test").count() == 3) } } } From 2629cd74602cfe77188b76428fed62a7a7149315 Mon Sep 17 00:00:00 2001 From: Yu Peng Date: Tue, 18 Oct 2016 19:43:08 -0700 Subject: [PATCH 055/162] [SPARK-17711][TEST-HADOOP2.2] Fix hadoop2.2 compilation error ## What changes were proposed in this pull request? Fix hadoop2.2 compilation error. ## How was this patch tested? Existing tests. cc tdas zsxwing Author: Yu Peng Closes #15537 from loneknightpy/fix-17711. 
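The change below swaps commons-io's `IOUtils.read` for Guava's `ByteStreams.read` when sizing a gzipped file. A self-contained sketch of the same pattern (hypothetical helper name, not the actual `Utils` method):

```scala
import java.io.FileInputStream
import java.util.zip.GZIPInputStream

import com.google.common.io.ByteStreams

// Stream through the gzipped file and count decompressed bytes; ByteStreams.read
// fills the buffer until EOF and returns the number of bytes actually read.
def uncompressedSize(path: String): Long = {
  val in = new GZIPInputStream(new FileInputStream(path))
  try {
    val buf = new Array[Byte](1024)
    var total = 0L
    var read = ByteStreams.read(in, buf, 0, buf.length)
    while (read > 0) {
      total += read
      read = ByteStreams.read(in, buf, 0, buf.length)
    }
    total
  } finally {
    in.close()
  }
}
```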
--- core/src/main/scala/org/apache/spark/util/Utils.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index a4da138e71992..7fba901b85695 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -42,7 +42,6 @@ import scala.util.control.{ControlThrowable, NonFatal} import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} import com.google.common.io.{ByteStreams, Files => GFiles} import com.google.common.net.InetAddresses -import org.apache.commons.io.IOUtils import org.apache.commons.lang3.SystemUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} @@ -1486,10 +1485,10 @@ private[spark] object Utils extends Logging { val gzInputStream = new GZIPInputStream(new FileInputStream(file)) val bufSize = 1024 val buf = new Array[Byte](bufSize) - var numBytes = IOUtils.read(gzInputStream, buf) + var numBytes = ByteStreams.read(gzInputStream, buf, 0, bufSize) while (numBytes > 0) { fileSize += numBytes - numBytes = IOUtils.read(gzInputStream, buf) + numBytes = ByteStreams.read(gzInputStream, buf, 0, bufSize) } fileSize } catch { From 4329c5cea4d235dc582fdb7cbdb822f62e650f5d Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 18 Oct 2016 20:23:13 -0700 Subject: [PATCH 056/162] [SPARK-17873][SQL] ALTER TABLE RENAME TO should allow users to specify database in destination table name(but have to be same as source table) ## What changes were proposed in this pull request? Unlike Hive, in Spark SQL, ALTER TABLE RENAME TO cannot move a table from one database to another(e.g. `ALTER TABLE db1.tbl RENAME TO db2.tbl2`), and will report error if the database in source table and destination table is different. So in #14955 , we forbid users to specify database of destination table in ALTER TABLE RENAME TO, to be consistent with other database systems and also make it easier to rename tables in non-current database, e.g. users can write `ALTER TABLE db1.tbl RENAME TO tbl2`, instead of `ALTER TABLE db1.tbl RENAME TO db1.tbl2`. However, this is a breaking change. Users may already have queries that specify database of destination table in ALTER TABLE RENAME TO. This PR reverts most of #14955 , and simplify the usage of ALTER TABLE RENAME TO by making database of source table the default database of destination table, instead of current database, so that users can still write `ALTER TABLE db1.tbl RENAME TO tbl2`, which is consistent with other databases like MySQL, Postgres, etc. ## How was this patch tested? The added back tests and some new tests. Author: Wenchen Fan Closes #15434 from cloud-fan/revert. 
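To make the resulting behavior concrete, a short sketch (assumes a SparkSession `spark` with databases `db1` and `db2` and an existing table `db1.tbl`):

```scala
spark.sql("ALTER TABLE db1.tbl RENAME TO tbl2")       // ok: destination db defaults to db1
spark.sql("ALTER TABLE db1.tbl2 RENAME TO db1.tbl3")  // ok: explicit db matches the source
spark.sql("ALTER TABLE db1.tbl3 RENAME TO db2.tbl4")  // AnalysisException: databases differ
```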
--- .../sql/catalyst/catalog/SessionCatalog.scala | 18 +++++++- .../catalog/SessionCatalogSuite.scala | 23 ++++++---- .../spark/sql/execution/SparkSqlParser.scala | 10 +--- .../spark/sql/execution/command/tables.scala | 7 ++- .../execution/command/DDLCommandSuite.scala | 18 ++++---- .../sql/execution/command/DDLSuite.scala | 46 +++++++++++++++++-- 6 files changed, 87 insertions(+), 35 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index fe41c41a6eb20..9711131d88a05 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -462,11 +462,20 @@ class SessionCatalog( * If a database is specified in `oldName`, this will rename the table in that database. * If no database is specified, this will first attempt to rename a temporary table with * the same name, then, if that does not exist, rename the table in the current database. + * + * This assumes the database specified in `newName` matches the one in `oldName`. */ - def renameTable(oldName: TableIdentifier, newName: String): Unit = synchronized { + def renameTable(oldName: TableIdentifier, newName: TableIdentifier): Unit = synchronized { val db = formatDatabaseName(oldName.database.getOrElse(currentDb)) + newName.database.map(formatDatabaseName).foreach { newDb => + if (db != newDb) { + throw new AnalysisException( + s"RENAME TABLE source and destination databases do not match: '$db' != '$newDb'") + } + } + val oldTableName = formatTableName(oldName.table) - val newTableName = formatTableName(newName) + val newTableName = formatTableName(newName.table) if (db == globalTempViewManager.database) { globalTempViewManager.rename(oldTableName, newTableName) } else { @@ -476,6 +485,11 @@ class SessionCatalog( requireTableNotExists(TableIdentifier(newTableName, Some(db))) externalCatalog.renameTable(db, oldTableName, newTableName) } else { + if (newName.database.isDefined) { + throw new AnalysisException( + s"RENAME TEMPORARY TABLE from '$oldName' to '$newName': cannot specify database " + + s"name '${newName.database.get}' in the destination table") + } if (tempTables.contains(newTableName)) { throw new AnalysisException(s"RENAME TEMPORARY TABLE from '$oldName' to '$newName': " + "destination table already exists") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index 915ed8f8b1787..187611bc77460 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -273,27 +273,34 @@ class SessionCatalogSuite extends SparkFunSuite { val externalCatalog = newBasicCatalog() val sessionCatalog = new SessionCatalog(externalCatalog) assert(externalCatalog.listTables("db2").toSet == Set("tbl1", "tbl2")) - sessionCatalog.renameTable(TableIdentifier("tbl1", Some("db2")), "tblone") + sessionCatalog.renameTable(TableIdentifier("tbl1", Some("db2")), TableIdentifier("tblone")) assert(externalCatalog.listTables("db2").toSet == Set("tblone", "tbl2")) - sessionCatalog.renameTable(TableIdentifier("tbl2", Some("db2")), "tbltwo") + sessionCatalog.renameTable(TableIdentifier("tbl2", Some("db2")), 
TableIdentifier("tbltwo")) assert(externalCatalog.listTables("db2").toSet == Set("tblone", "tbltwo")) // Rename table without explicitly specifying database sessionCatalog.setCurrentDatabase("db2") - sessionCatalog.renameTable(TableIdentifier("tbltwo"), "table_two") + sessionCatalog.renameTable(TableIdentifier("tbltwo"), TableIdentifier("table_two")) assert(externalCatalog.listTables("db2").toSet == Set("tblone", "table_two")) + // Renaming "db2.tblone" to "db1.tblones" should fail because databases don't match + intercept[AnalysisException] { + sessionCatalog.renameTable( + TableIdentifier("tblone", Some("db2")), TableIdentifier("tblones", Some("db1"))) + } // The new table already exists intercept[TableAlreadyExistsException] { - sessionCatalog.renameTable(TableIdentifier("tblone", Some("db2")), "table_two") + sessionCatalog.renameTable( + TableIdentifier("tblone", Some("db2")), + TableIdentifier("table_two")) } } test("rename table when database/table does not exist") { val catalog = new SessionCatalog(newBasicCatalog()) intercept[NoSuchDatabaseException] { - catalog.renameTable(TableIdentifier("tbl1", Some("unknown_db")), "tbl2") + catalog.renameTable(TableIdentifier("tbl1", Some("unknown_db")), TableIdentifier("tbl2")) } intercept[NoSuchTableException] { - catalog.renameTable(TableIdentifier("unknown_table", Some("db2")), "tbl2") + catalog.renameTable(TableIdentifier("unknown_table", Some("db2")), TableIdentifier("tbl2")) } } @@ -306,12 +313,12 @@ class SessionCatalogSuite extends SparkFunSuite { assert(sessionCatalog.getTempView("tbl1") == Option(tempTable)) assert(externalCatalog.listTables("db2").toSet == Set("tbl1", "tbl2")) // If database is not specified, temp table should be renamed first - sessionCatalog.renameTable(TableIdentifier("tbl1"), "tbl3") + sessionCatalog.renameTable(TableIdentifier("tbl1"), TableIdentifier("tbl3")) assert(sessionCatalog.getTempView("tbl1").isEmpty) assert(sessionCatalog.getTempView("tbl3") == Option(tempTable)) assert(externalCatalog.listTables("db2").toSet == Set("tbl1", "tbl2")) // If database is specified, temp tables are never renamed - sessionCatalog.renameTable(TableIdentifier("tbl2", Some("db2")), "tbl4") + sessionCatalog.renameTable(TableIdentifier("tbl2", Some("db2")), TableIdentifier("tbl4")) assert(sessionCatalog.getTempView("tbl3") == Option(tempTable)) assert(sessionCatalog.getTempView("tbl4").isEmpty) assert(externalCatalog.listTables("db2").toSet == Set("tbl1", "tbl4")) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 8c68d1e3a2379..ea22b02d40b80 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -689,15 +689,9 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { * }}} */ override def visitRenameTable(ctx: RenameTableContext): LogicalPlan = withOrigin(ctx) { - val fromName = visitTableIdentifier(ctx.from) - val toName = visitTableIdentifier(ctx.to) - if (toName.database.isDefined) { - operationNotAllowed("Can not specify database in table/view name after RENAME TO", ctx) - } - AlterTableRenameCommand( - fromName, - toName.table, + visitTableIdentifier(ctx.from), + visitTableIdentifier(ctx.to), ctx.VIEW != null) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala 
index 424ef58d76c5e..403b479a0e1bc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -146,7 +146,7 @@ case class CreateTableCommand(table: CatalogTable, ifNotExists: Boolean) extends */ case class AlterTableRenameCommand( oldName: TableIdentifier, - newName: String, + newName: TableIdentifier, isView: Boolean) extends RunnableCommand { @@ -159,7 +159,6 @@ case class AlterTableRenameCommand( } else { val table = catalog.getTableMetadata(oldName) DDLUtils.verifyAlterTableType(catalog, table, isView) - val newTblName = TableIdentifier(newName, oldName.database) // If an exception is thrown here we can just assume the table is uncached; // this can happen with Hive tables when the underlying catalog is in-memory. val wasCached = Try(sparkSession.catalog.isCached(oldName.unquotedString)).getOrElse(false) @@ -172,7 +171,7 @@ case class AlterTableRenameCommand( } // For datasource tables, we also need to update the "path" serde property if (DDLUtils.isDatasourceTable(table) && table.tableType == CatalogTableType.MANAGED) { - val newPath = catalog.defaultTablePath(newTblName) + val newPath = catalog.defaultTablePath(newName) val newTable = table.withNewStorage( properties = table.storage.properties ++ Map("path" -> newPath)) catalog.alterTable(newTable) @@ -182,7 +181,7 @@ case class AlterTableRenameCommand( catalog.refreshTable(oldName) catalog.renameTable(oldName, newName) if (wasCached) { - sparkSession.catalog.cacheTable(newTblName.unquotedString) + sparkSession.catalog.cacheTable(newName.unquotedString) } } Seq.empty[Row] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index 547fb63813750..a3dbc9234f2f3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -387,20 +387,22 @@ class DDLCommandSuite extends PlanTest { val parsed_table = parser.parsePlan(sql_table) val parsed_view = parser.parsePlan(sql_view) val expected_table = AlterTableRenameCommand( - TableIdentifier("table_name", None), - "new_table_name", + TableIdentifier("table_name"), + TableIdentifier("new_table_name"), isView = false) val expected_view = AlterTableRenameCommand( - TableIdentifier("table_name", None), - "new_table_name", + TableIdentifier("table_name"), + TableIdentifier("new_table_name"), isView = true) comparePlans(parsed_table, expected_table) comparePlans(parsed_view, expected_view) + } - val e = intercept[ParseException]( - parser.parsePlan("ALTER TABLE db1.tbl RENAME TO db1.tbl2") - ) - assert(e.getMessage.contains("Can not specify database in table/view name after RENAME TO")) + test("alter table: rename table with database") { + val query = "ALTER TABLE db1.tbl RENAME TO db1.tbl2" + val plan = parseAs[AlterTableRenameCommand](query) + assert(plan.oldName == TableIdentifier("tbl", Some("db1"))) + assert(plan.newName == TableIdentifier("tbl2", Some("db1"))) } // ALTER TABLE table_name SET TBLPROPERTIES ('comment' = new_comment); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 097dc2441351f..c8b8e9ebabc75 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala 
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -665,16 +665,27 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { createDatabase(catalog, "dbx") createDatabase(catalog, "dby") createTable(catalog, tableIdent1) + assert(catalog.listTables("dbx") == Seq(tableIdent1)) - sql("ALTER TABLE dbx.tab1 RENAME TO tab2") + sql("ALTER TABLE dbx.tab1 RENAME TO dbx.tab2") assert(catalog.listTables("dbx") == Seq(tableIdent2)) + + // The database in destination table name can be omitted, and we will use the database of source + // table for it. + sql("ALTER TABLE dbx.tab2 RENAME TO tab1") + assert(catalog.listTables("dbx") == Seq(tableIdent1)) + catalog.setCurrentDatabase("dbx") // rename without explicitly specifying database - sql("ALTER TABLE tab2 RENAME TO tab1") - assert(catalog.listTables("dbx") == Seq(tableIdent1)) + sql("ALTER TABLE tab1 RENAME TO tab2") + assert(catalog.listTables("dbx") == Seq(tableIdent2)) // table to rename does not exist intercept[AnalysisException] { - sql("ALTER TABLE dbx.does_not_exist RENAME TO tab2") + sql("ALTER TABLE dbx.does_not_exist RENAME TO dbx.tab2") + } + // destination database is different + intercept[AnalysisException] { + sql("ALTER TABLE dbx.tab1 RENAME TO dby.tab2") } } @@ -696,6 +707,31 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { assert(spark.table("teachers").collect().toSeq == df.collect().toSeq) } + test("rename temporary table - destination table with database name") { + withTempView("tab1") { + sql( + """ + |CREATE TEMPORARY TABLE tab1 + |USING org.apache.spark.sql.sources.DDLScanSource + |OPTIONS ( + | From '1', + | To '10', + | Table 'test1' + |) + """.stripMargin) + + val e = intercept[AnalysisException] { + sql("ALTER TABLE tab1 RENAME TO default.tab2") + } + assert(e.getMessage.contains( + "RENAME TEMPORARY TABLE from '`tab1`' to '`default`.`tab2`': " + + "cannot specify database name 'default' in the destination table")) + + val catalog = spark.sessionState.catalog + assert(catalog.listTables("default") == Seq(TableIdentifier("tab1"))) + } + } + test("rename temporary table") { withTempView("tab1", "tab2") { spark.range(10).createOrReplaceTempView("tab1") @@ -736,7 +772,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { sql("ALTER TABLE tab1 RENAME TO tab2") } assert(e.getMessage.contains( - "RENAME TEMPORARY TABLE from '`tab1`' to 'tab2': destination table already exists")) + "RENAME TEMPORARY TABLE from '`tab1`' to '`tab2`': destination table already exists")) val catalog = spark.sessionState.catalog assert(catalog.listTables("default") == Seq(TableIdentifier("tab1"), TableIdentifier("tab2"))) From f39852e59883c214b0d007faffb406570ea3084b Mon Sep 17 00:00:00 2001 From: Tommy YU Date: Tue, 18 Oct 2016 21:15:32 -0700 Subject: [PATCH 057/162] [SPARK-18001][DOCUMENT] fix broke link to SparkDataFrame ## What changes were proposed in this pull request? In http://spark.apache.org/docs/latest/sql-programming-guide.html, Section "Untyped Dataset Operations (aka DataFrame Operations)" Link to R DataFrame doesn't work that return The requested URL /docs/latest/api/R/DataFrame.html was not found on this server. Correct link is SparkDataFrame.html for spark 2.0 ## How was this patch tested? Manual checked. Author: Tommy YU Closes #15543 from Wenpei/spark-18001. 
--- docs/sql-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 3f1b73a830eca..d334a86bc73d7 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -140,7 +140,7 @@ As an example, the following creates a DataFrame based on the content of a JSON ## Untyped Dataset Operations (aka DataFrame Operations) -DataFrames provide a domain-specific language for structured data manipulation in [Scala](api/scala/index.html#org.apache.spark.sql.Dataset), [Java](api/java/index.html?org/apache/spark/sql/Dataset.html), [Python](api/python/pyspark.sql.html#pyspark.sql.DataFrame) and [R](api/R/DataFrame.html). +DataFrames provide a domain-specific language for structured data manipulation in [Scala](api/scala/index.html#org.apache.spark.sql.Dataset), [Java](api/java/index.html?org/apache/spark/sql/Dataset.html), [Python](api/python/pyspark.sql.html#pyspark.sql.DataFrame) and [R](api/R/SparkDataFrame.html). As mentioned above, in Spark 2.0, DataFrames are just Dataset of `Row`s in Scala and Java API. These operations are also referred as "untyped transformations" in contrast to "typed transformations" come with strongly typed Scala/Java Datasets. From 9540357ada7df1acfefa7b775c82675cd475244c Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 19 Oct 2016 10:06:43 +0100 Subject: [PATCH 058/162] [SPARK-17985][CORE] Bump commons-lang3 version to 3.5. ## What changes were proposed in this pull request? `SerializationUtils.clone()` of commons-lang3 (<3.5) has a bug that breaks thread safety, which gets stack sometimes caused by race condition of initializing hash map. See https://issues.apache.org/jira/browse/LANG-1251. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #15548 from ueshin/issues/SPARK-17985. 
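For context, a sketch of the kind of concurrent cloning the upgrade protects against (the `Conf` class and thread count here are made up for illustration):

```scala
import org.apache.commons.lang3.SerializationUtils

// With commons-lang3 < 3.5 (LANG-1251), concurrent clone() calls could race on the
// lazy initialization of an internal hash map and occasionally hang; 3.5 fixes this.
case class Conf(entries: Map[String, String])

object CloneDemo {
  def main(args: Array[String]): Unit = {
    val conf = Conf(Map("spark.app.name" -> "demo"))
    val threads = (1 to 8).map { _ =>
      new Thread(new Runnable {
        override def run(): Unit = assert(SerializationUtils.clone(conf) == conf)
      })
    }
    threads.foreach(_.start())
    threads.foreach(_.join())
  }
}
```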
--- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- docs/streaming-flume-integration.md | 4 ++-- pom.xml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index b30f8c347c0af..525dcef5b7d99 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -33,7 +33,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math-2.1.jar commons-math3-3.4.1.jar diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 5b3a7651dd299..562fe6461e753 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index e323efe30f64b..747521aa2a566 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -36,7 +36,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 77d97e5365b9f..afd4502c59d33 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 572edfa0cc29e..687b855b649d8 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -40,7 +40,7 @@ commons-digester-1.8.jar commons-httpclient-3.1.jar commons-io-2.4.jar commons-lang-2.6.jar -commons-lang3-3.3.2.jar +commons-lang3-3.5.jar commons-logging-1.1.3.jar commons-math3-3.4.1.jar commons-net-2.2.jar diff --git a/docs/streaming-flume-integration.md b/docs/streaming-flume-integration.md index 767e1f9402e01..a5d36da5b6de9 100644 --- a/docs/streaming-flume-integration.md +++ b/docs/streaming-flume-integration.md @@ -115,11 +115,11 @@ Configuring Flume on the chosen machine requires the following two steps. artifactId = scala-library version = {{site.SCALA_VERSION}} - (iii) *Commons Lang 3 JAR*: Download the Commons Lang 3 JAR. It can be found with the following artifact detail (or, [direct link](http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-lang3/3.3.2/commons-lang3-3.3.2.jar)). + (iii) *Commons Lang 3 JAR*: Download the Commons Lang 3 JAR. It can be found with the following artifact detail (or, [direct link](http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-lang3/3.5/commons-lang3-3.5.jar)). groupId = org.apache.commons artifactId = commons-lang3 - version = 3.3.2 + version = 3.5 2. **Configuration file**: On that machine, configure Flume agent to send data to an Avro sink by having the following in the configuration file. 
diff --git a/pom.xml b/pom.xml index 7d13c51b2a596..aaf7cfa7eb2ad 100644 --- a/pom.xml +++ b/pom.xml @@ -168,7 +168,7 @@ 2.6 - 3.3.2 + 3.5 3.2.10 3.0.0 2.22.2 From 444c2d22e38a8a78135adf0d3a3774f0e9fc866c Mon Sep 17 00:00:00 2001 From: Alex Bozarth Date: Wed, 19 Oct 2016 13:01:33 -0700 Subject: [PATCH 059/162] [SPARK-10541][WEB UI] Allow ApplicationHistoryProviders to provide their own text when there aren't any complete apps ## What changes were proposed in this pull request? I've added a method to `ApplicationHistoryProvider` that returns the html paragraph to display when there are no applications. This allows providers other than `FsHistoryProvider` to determine what is printed. The current hard coded text is now moved into `FsHistoryProvider` since it assumed that's what was being used before. I chose to make the function return html rather than text because the current text block had inline html in it and it allows a new implementation of `ApplicationHistoryProvider` more versatility. I did not see any security issues with this since injecting html here requires implementing `ApplicationHistoryProvider` and can't be done outside of code. ## How was this patch tested? Manual testing and dev/run-tests No visible changes to the UI Author: Alex Bozarth Closes #15490 from ajbozarth/spark10541. --- .../deploy/history/ApplicationHistoryProvider.scala | 6 ++++++ .../spark/deploy/history/FsHistoryProvider.scala | 12 ++++++++++++ .../apache/spark/deploy/history/HistoryPage.scala | 8 +------- .../apache/spark/deploy/history/HistoryServer.scala | 8 ++++++++ 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala index ad7a0972ef9d1..06530ff836466 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala @@ -19,6 +19,8 @@ package org.apache.spark.deploy.history import java.util.zip.ZipOutputStream +import scala.xml.Node + import org.apache.spark.SparkException import org.apache.spark.ui.SparkUI @@ -114,4 +116,8 @@ private[history] abstract class ApplicationHistoryProvider { */ def getApplicationInfo(appId: String): Option[ApplicationHistoryInfo] + /** + * @return html text to display when the application list is empty + */ + def getEmptyListingHtml(): Seq[Node] = Seq.empty } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 3c2d169f3270e..530cc5252214b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -23,6 +23,7 @@ import java.util.concurrent.{Executors, ExecutorService, TimeUnit} import java.util.zip.{ZipEntry, ZipOutputStream} import scala.collection.mutable +import scala.xml.Node import com.google.common.io.ByteStreams import com.google.common.util.concurrent.{MoreExecutors, ThreadFactoryBuilder} @@ -262,6 +263,17 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } } + override def getEmptyListingHtml(): Seq[Node] = { +

+    <p>
+      Did you specify the correct logging directory? Please verify your setting of
+      <span style="font-style:italic">spark.history.fs.logDirectory</span>
+      listed above and whether you have the permissions to access it.
+      <br/>
+      It is also possible that your application did not run to
+      completion or did not stop the SparkContext.
+    </p>
    + } + override def getConfig(): Map[String, String] = { val safeMode = if (isFsInSafeMode()) { Map("HDFS State" -> "In safe mode, application logs not available.") diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index 95b72224e0f94..96b9ecf43b14c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -47,13 +47,7 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") } else if (requestedIncomplete) {

        <h4>No incomplete applications found!</h4>
      } else {
-       <h4>No completed applications found!</h4> ++
-       <p>Did you specify the correct logging directory?
-         Please verify your setting of <span style="font-style:italic">
-         spark.history.fs.logDirectory</span> and whether you have the permissions to
-         access it.<br/> It is also possible that your application did not run to
-         completion or did not stop the SparkContext.
-       </p>
+       <h4>No completed applications found!</h4>
    ++ parent.emptyListingHtml } } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 087c69e6489dd..3175b36b3e56f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -22,6 +22,7 @@ import java.util.zip.ZipOutputStream import javax.servlet.http.{HttpServlet, HttpServletRequest, HttpServletResponse} import scala.util.control.NonFatal +import scala.xml.Node import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder} @@ -193,6 +194,13 @@ class HistoryServer( provider.writeEventLogs(appId, attemptId, zipStream) } + /** + * @return html text to display when the application list is empty + */ + def emptyListingHtml(): Seq[Node] = { + provider.getEmptyListingHtml() + } + /** * Returns the provider configuration to show in the listing page. * From 4b2011ec9da1245923b5cbd883240fef0dbf3ef0 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Wed, 19 Oct 2016 19:36:21 -0700 Subject: [PATCH 060/162] [SPARK-17989][SQL] Check ascendingOrder type in sort_array function rather than throwing ClassCastException ## What changes were proposed in this pull request? This PR proposes to check the second argument, `ascendingOrder` rather than throwing `ClassCastException` exception message. ```sql select sort_array(array('b', 'd'), '1'); ``` **Before** ``` 16/10/19 13:16:08 ERROR SparkSQLDriver: Failed in [select sort_array(array('b', 'd'), '1')] java.lang.ClassCastException: org.apache.spark.unsafe.types.UTF8String cannot be cast to java.lang.Boolean at scala.runtime.BoxesRunTime.unboxToBoolean(BoxesRunTime.java:85) at org.apache.spark.sql.catalyst.expressions.SortArray.nullSafeEval(collectionOperations.scala:185) at org.apache.spark.sql.catalyst.expressions.BinaryExpression.eval(Expression.scala:416) at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:50) at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:43) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:292) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:292) at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:74) at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:291) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:297) ``` **After** ``` Error in query: cannot resolve 'sort_array(array('b', 'd'), '1')' due to data type mismatch: Sort order in second argument requires a boolean literal.; line 1 pos 7; ``` ## How was this patch tested? Unit test in `DataFrameFunctionsSuite`. Author: hyukjinkwon Closes #15532 from HyukjinKwon/SPARK-17989. 
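For completeness, the accepted form passes a boolean literal as the ascending-order flag (sketch, assuming a SparkSession `spark`):

```scala
spark.sql("SELECT sort_array(array('b', 'd'), true)").show()   // ascending: [b, d]
spark.sql("SELECT sort_array(array('b', 'd'), false)").show()  // descending: [d, b]
```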
--- .../expressions/collectionOperations.scala | 8 ++++++- .../test/resources/sql-tests/inputs/array.sql | 6 ++++++ .../resources/sql-tests/results/array.sql.out | 21 ++++++++++++++++--- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index c0200299376ca..f56bb39d10791 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -124,7 +124,13 @@ case class SortArray(base: Expression, ascendingOrder: Expression) override def checkInputDataTypes(): TypeCheckResult = base.dataType match { case ArrayType(dt, _) if RowOrdering.isOrderable(dt) => - TypeCheckResult.TypeCheckSuccess + ascendingOrder match { + case Literal(_: Boolean, BooleanType) => + TypeCheckResult.TypeCheckSuccess + case _ => + TypeCheckResult.TypeCheckFailure( + "Sort order in second argument requires a boolean literal.") + } case ArrayType(dt, _) => TypeCheckResult.TypeCheckFailure( s"$prettyName does not support sorting array of type ${dt.simpleString}") diff --git a/sql/core/src/test/resources/sql-tests/inputs/array.sql b/sql/core/src/test/resources/sql-tests/inputs/array.sql index 4038a0da41d2b..984321ab795fc 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/array.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/array.sql @@ -71,6 +71,12 @@ select sort_array(timestamp_array) from primitive_arrays; +-- sort_array with an invalid string literal for the argument of sort order. +select sort_array(array('b', 'd'), '1'); + +-- sort_array with an invalid null literal casted as boolean for the argument of sort order. 
+select sort_array(array('b', 'd'), cast(NULL as boolean)); + -- size select size(boolean_array), diff --git a/sql/core/src/test/resources/sql-tests/results/array.sql.out b/sql/core/src/test/resources/sql-tests/results/array.sql.out index 4a1d149c1f362..499a3d5fb72f6 100644 --- a/sql/core/src/test/resources/sql-tests/results/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/array.sql.out @@ -124,8 +124,23 @@ struct,sort_array(tinyint_array, -- !query 8 output [true] [1,2] [1,2] [1,2] [1,2] [9223372036854775808,9223372036854775809] [1.0,2.0] [1.0,2.0] [2016-03-13,2016-03-14] [2016-11-12 20:54:00.0,2016-11-15 20:54:00.0] - -- !query 9 +select sort_array(array('b', 'd'), '1') +-- !query 9 schema +struct<> +-- !query 9 output +org.apache.spark.sql.AnalysisException +cannot resolve 'sort_array(array('b', 'd'), '1')' due to data type mismatch: Sort order in second argument requires a boolean literal.; line 1 pos 7 + +-- !query 10 +select sort_array(array('b', 'd'), cast(NULL as boolean)) +-- !query 10 schema +struct<> +-- !query 10 output +org.apache.spark.sql.AnalysisException +cannot resolve 'sort_array(array('b', 'd'), CAST(NULL AS BOOLEAN))' due to data type mismatch: Sort order in second argument requires a boolean literal.; line 1 pos 7 + +-- !query 11 select size(boolean_array), size(tinyint_array), @@ -138,7 +153,7 @@ select size(date_array), size(timestamp_array) from primitive_arrays --- !query 9 schema +-- !query 11 schema struct --- !query 9 output +-- !query 11 output 1 2 2 2 2 2 2 2 2 2 From f313117bc93b0bf560528b316d3e6947caa96296 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 19 Oct 2016 22:22:35 -0700 Subject: [PATCH 061/162] [SPARK-18012][SQL] Simplify WriterContainer ## What changes were proposed in this pull request? This patch refactors WriterContainer to simplify the logic and make control flow more obvious.The previous code setup made it pretty difficult to track the actual dependencies on variables and setups because the driver side and the executor side were using the same set of variables. ## How was this patch tested? N/A - this should be covered by existing tests. Author: Reynold Xin Closes #15551 from rxin/writercontainer-refactor. 
--- .../InsertIntoHadoopFsRelationCommand.scala | 79 +-- .../execution/datasources/WriteOutput.scala | 480 ++++++++++++++++++ .../datasources/WriterContainer.scala | 445 ---------------- .../apache/spark/sql/internal/SQLConf.scala | 9 - 4 files changed, 492 insertions(+), 521 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteOutput.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 99ca3df673568..22dbe7149531c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -20,18 +20,12 @@ package org.apache.spark.sql.execution.datasources import java.io.IOException import org.apache.hadoop.fs.Path -import org.apache.hadoop.mapreduce._ -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat -import org.apache.spark._ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.BucketSpec -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet} +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.command.RunnableCommand -import org.apache.spark.sql.internal.SQLConf /** * A command for writing data to a [[HadoopFsRelation]]. Supports both overwriting and appending. @@ -40,20 +34,6 @@ import org.apache.spark.sql.internal.SQLConf * implementation of [[HadoopFsRelation]] should use this UUID together with task id to generate * unique file path for each task output file. This UUID is passed to executor side via a * property named `spark.sql.sources.writeJobUUID`. - * - * Different writer containers, [[DefaultWriterContainer]] and [[DynamicPartitionWriterContainer]] - * are used to write to normal tables and tables with dynamic partitions. - * - * Basic work flow of this command is: - * - * 1. Driver side setup, including output committer initialization and data source specific - * preparation work for the write job to be issued. - * 2. Issues a write job consists of one or more executor side tasks, each of which writes all - * rows within an RDD partition. - * 3. If no exception is thrown in a task, commits that task, otherwise aborts that task; If any - * exception is thrown during task commitment, also aborts that task. - * 4. If all tasks are committed, commit the job, otherwise aborts the job; If any exception is - * thrown during job commitment, also aborts the job. 
*/ case class InsertIntoHadoopFsRelationCommand( outputPath: Path, @@ -103,52 +83,17 @@ case class InsertIntoHadoopFsRelationCommand( val isAppend = pathExists && (mode == SaveMode.Append) if (doInsertion) { - val job = Job.getInstance(hadoopConf) - job.setOutputKeyClass(classOf[Void]) - job.setOutputValueClass(classOf[InternalRow]) - FileOutputFormat.setOutputPath(job, qualifiedOutputPath) - - val partitionSet = AttributeSet(partitionColumns) - val dataColumns = query.output.filterNot(partitionSet.contains) - - val queryExecution = Dataset.ofRows(sparkSession, query).queryExecution - SQLExecution.withNewExecutionId(sparkSession, queryExecution) { - val relation = - WriteRelation( - sparkSession, - dataColumns.toStructType, - qualifiedOutputPath.toString, - fileFormat.prepareWrite(sparkSession, _, options, dataColumns.toStructType), - bucketSpec) - - val writerContainer = if (partitionColumns.isEmpty && bucketSpec.isEmpty) { - new DefaultWriterContainer(relation, job, isAppend) - } else { - new DynamicPartitionWriterContainer( - relation, - job, - partitionColumns = partitionColumns, - dataColumns = dataColumns, - inputSchema = query.output, - PartitioningUtils.DEFAULT_PARTITION_NAME, - sparkSession.sessionState.conf.partitionMaxFiles, - isAppend) - } - - // This call shouldn't be put into the `try` block below because it only initializes and - // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called. - writerContainer.driverSideSetup() - - try { - sparkSession.sparkContext.runJob(queryExecution.toRdd, writerContainer.writeRows _) - writerContainer.commitJob() - refreshFunction() - } catch { case cause: Throwable => - logError("Aborting job.", cause) - writerContainer.abortJob() - throw new SparkException("Job aborted.", cause) - } - } + WriteOutput.write( + sparkSession, + query, + fileFormat, + qualifiedOutputPath, + hadoopConf, + partitionColumns, + bucketSpec, + refreshFunction, + options, + isAppend) } else { logInfo("Skipping insertion into a relation that already exists.") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteOutput.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteOutput.scala new file mode 100644 index 0000000000000..54d0f3bd6291a --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteOutput.scala @@ -0,0 +1,480 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources + +import java.util.{Date, UUID} + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce._ +import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, FileOutputFormat} +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl + +import org.apache.spark._ +import org.apache.spark.internal.Logging +import org.apache.spark.mapred.SparkHadoopMapRedUtil +import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.{SQLExecution, UnsafeKVExternalSorter} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} +import org.apache.spark.util.{SerializableConfiguration, Utils} +import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter + + +/** A helper object for writing data out to a location. */ +object WriteOutput extends Logging { + + /** A shared job description for all the write tasks. */ + private class WriteJobDescription( + val serializableHadoopConf: SerializableConfiguration, + val outputWriterFactory: OutputWriterFactory, + val allColumns: Seq[Attribute], + val partitionColumns: Seq[Attribute], + val nonPartitionColumns: Seq[Attribute], + val bucketSpec: Option[BucketSpec], + val isAppend: Boolean, + val path: String, + val outputFormatClass: Class[_ <: OutputFormat[_, _]]) + extends Serializable { + + assert(AttributeSet(allColumns) == AttributeSet(partitionColumns ++ nonPartitionColumns), + s""" + |All columns: ${allColumns.mkString(", ")} + |Partition columns: ${partitionColumns.mkString(", ")} + |Non-partition columns: ${nonPartitionColumns.mkString(", ")} + """.stripMargin) + } + + /** + * Basic work flow of this command is: + * 1. Driver side setup, including output committer initialization and data source specific + * preparation work for the write job to be issued. + * 2. Issues a write job consists of one or more executor side tasks, each of which writes all + * rows within an RDD partition. + * 3. If no exception is thrown in a task, commits that task, otherwise aborts that task; If any + * exception is thrown during task commitment, also aborts that task. + * 4. If all tasks are committed, commit the job, otherwise aborts the job; If any exception is + * thrown during job commitment, also aborts the job. + */ + def write( + sparkSession: SparkSession, + plan: LogicalPlan, + fileFormat: FileFormat, + outputPath: Path, + hadoopConf: Configuration, + partitionColumns: Seq[Attribute], + bucketSpec: Option[BucketSpec], + refreshFunction: () => Unit, + options: Map[String, String], + isAppend: Boolean): Unit = { + + val job = Job.getInstance(hadoopConf) + job.setOutputKeyClass(classOf[Void]) + job.setOutputValueClass(classOf[InternalRow]) + FileOutputFormat.setOutputPath(job, outputPath) + + val partitionSet = AttributeSet(partitionColumns) + val dataColumns = plan.output.filterNot(partitionSet.contains) + val queryExecution = Dataset.ofRows(sparkSession, plan).queryExecution + + // Note: prepareWrite has side effect. It sets "job". 
+ val outputWriterFactory = + fileFormat.prepareWrite(sparkSession, job, options, dataColumns.toStructType) + + val description = new WriteJobDescription( + serializableHadoopConf = new SerializableConfiguration(job.getConfiguration), + outputWriterFactory = outputWriterFactory, + allColumns = plan.output, + partitionColumns = partitionColumns, + nonPartitionColumns = dataColumns, + bucketSpec = bucketSpec, + isAppend = isAppend, + path = outputPath.toString, + outputFormatClass = job.getOutputFormatClass) + + SQLExecution.withNewExecutionId(sparkSession, queryExecution) { + // This call shouldn't be put into the `try` block below because it only initializes and + // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called. + val committer = setupDriverCommitter(job, outputPath.toString, isAppend) + + try { + sparkSession.sparkContext.runJob(queryExecution.toRdd, + (taskContext: TaskContext, iter: Iterator[InternalRow]) => { + executeTask( + description = description, + sparkStageId = taskContext.stageId(), + sparkPartitionId = taskContext.partitionId(), + sparkAttemptNumber = taskContext.attemptNumber(), + iterator = iter) + }) + + committer.commitJob(job) + logInfo(s"Job ${job.getJobID} committed.") + refreshFunction() + } catch { case cause: Throwable => + logError(s"Aborting job ${job.getJobID}.", cause) + committer.abortJob(job, JobStatus.State.FAILED) + throw new SparkException("Job aborted.", cause) + } + } + } + + /** Writes data out in a single Spark task. */ + private def executeTask( + description: WriteJobDescription, + sparkStageId: Int, + sparkPartitionId: Int, + sparkAttemptNumber: Int, + iterator: Iterator[InternalRow]): Unit = { + + val jobId = SparkHadoopWriter.createJobID(new Date, sparkStageId) + val taskId = new TaskID(jobId, TaskType.MAP, sparkPartitionId) + val taskAttemptId = new TaskAttemptID(taskId, sparkAttemptNumber) + + // Set up the attempt context required to use in the output committer. + val taskAttemptContext: TaskAttemptContext = { + // Set up the configuration object + val hadoopConf = description.serializableHadoopConf.value + hadoopConf.set("mapred.job.id", jobId.toString) + hadoopConf.set("mapred.tip.id", taskAttemptId.getTaskID.toString) + hadoopConf.set("mapred.task.id", taskAttemptId.toString) + hadoopConf.setBoolean("mapred.task.is.map", true) + hadoopConf.setInt("mapred.task.partition", 0) + + new TaskAttemptContextImpl(hadoopConf, taskAttemptId) + } + + val committer = newOutputCommitter( + description.outputFormatClass, taskAttemptContext, description.path, description.isAppend) + committer.setupTask(taskAttemptContext) + + // Figure out where we need to write data to for staging. + // For FileOutputCommitter it has its own staging path called "work path". 
+ val stagingPath = committer match { + case f: FileOutputCommitter => f.getWorkPath.toString + case _ => description.path + } + + val writeTask = + if (description.partitionColumns.isEmpty && description.bucketSpec.isEmpty) { + new SingleDirectoryWriteTask(description, taskAttemptContext, stagingPath) + } else { + new DynamicPartitionWriteTask(description, taskAttemptContext, stagingPath) + } + + try { + Utils.tryWithSafeFinallyAndFailureCallbacks(block = { + // Execute the task to write rows out + writeTask.execute(iterator) + writeTask.releaseResources() + + // Commit the task + SparkHadoopMapRedUtil.commitTask(committer, taskAttemptContext, jobId.getId, taskId.getId) + })(catchBlock = { + // If there is an error, release resource and then abort the task + try { + writeTask.releaseResources() + } finally { + committer.abortTask(taskAttemptContext) + logError(s"Job $jobId aborted.") + } + }) + } catch { + case t: Throwable => + throw new SparkException("Task failed while writing rows", t) + } + } + + /** + * A simple trait for writing out data in a single Spark task, without any concerns about how + * to commit or abort tasks. Exceptions thrown by the implementation of this trait will + * automatically trigger task aborts. + */ + private trait ExecuteWriteTask { + def execute(iterator: Iterator[InternalRow]): Unit + def releaseResources(): Unit + } + + /** Writes data to a single directory (used for non-dynamic-partition writes). */ + private class SingleDirectoryWriteTask( + description: WriteJobDescription, + taskAttemptContext: TaskAttemptContext, + stagingPath: String) extends ExecuteWriteTask { + + private[this] var outputWriter: OutputWriter = { + val outputWriter = description.outputWriterFactory.newInstance( + path = stagingPath, + bucketId = None, + dataSchema = description.nonPartitionColumns.toStructType, + context = taskAttemptContext) + outputWriter.initConverter(dataSchema = description.nonPartitionColumns.toStructType) + outputWriter + } + + override def execute(iter: Iterator[InternalRow]): Unit = { + while (iter.hasNext) { + val internalRow = iter.next() + outputWriter.writeInternal(internalRow) + } + } + + override def releaseResources(): Unit = { + if (outputWriter != null) { + outputWriter.close() + outputWriter = null + } + } + } + + /** + * Writes data to using dynamic partition writes, meaning this single function can write to + * multiple directories (partitions) or files (bucketing). + */ + private class DynamicPartitionWriteTask( + description: WriteJobDescription, + taskAttemptContext: TaskAttemptContext, + stagingPath: String) extends ExecuteWriteTask { + + // currentWriter is initialized whenever we see a new key + private var currentWriter: OutputWriter = _ + + private val bucketColumns: Seq[Attribute] = description.bucketSpec.toSeq.flatMap { + spec => spec.bucketColumnNames.map(c => description.allColumns.find(_.name == c).get) + } + + private val sortColumns: Seq[Attribute] = description.bucketSpec.toSeq.flatMap { + spec => spec.sortColumnNames.map(c => description.allColumns.find(_.name == c).get) + } + + private def bucketIdExpression: Option[Expression] = description.bucketSpec.map { spec => + // Use `HashPartitioning.partitionIdExpression` as our bucket id expression, so that we can + // guarantee the data distribution is same between shuffle and bucketed data source, which + // enables us to only shuffle one side when join a bucketed table and a normal one. 
+ HashPartitioning(bucketColumns, spec.numBuckets).partitionIdExpression + } + + /** Expressions that given a partition key build a string like: col1=val/col2=val/... */ + private def partitionStringExpression: Seq[Expression] = { + description.partitionColumns.zipWithIndex.flatMap { case (c, i) => + val escaped = ScalaUDF( + PartitioningUtils.escapePathName _, + StringType, + Seq(Cast(c, StringType)), + Seq(StringType)) + val str = If(IsNull(c), Literal(PartitioningUtils.DEFAULT_PARTITION_NAME), escaped) + val partitionName = Literal(c.name + "=") :: str :: Nil + if (i == 0) partitionName else Literal(Path.SEPARATOR) :: partitionName + } + } + + private def getBucketIdFromKey(key: InternalRow): Option[Int] = + description.bucketSpec.map { _ => key.getInt(description.partitionColumns.length) } + + /** + * Open and returns a new OutputWriter given a partition key and optional bucket id. + * If bucket id is specified, we will append it to the end of the file name, but before the + * file extension, e.g. part-r-00009-ea518ad4-455a-4431-b471-d24e03814677-00002.gz.parquet + */ + private def newOutputWriter( + key: InternalRow, + getPartitionString: UnsafeProjection): OutputWriter = { + val path = + if (description.partitionColumns.nonEmpty) { + val partitionPath = getPartitionString(key).getString(0) + new Path(stagingPath, partitionPath).toString + } else { + stagingPath + } + val bucketId = getBucketIdFromKey(key) + + val newWriter = description.outputWriterFactory.newInstance( + path = path, + bucketId = bucketId, + dataSchema = description.nonPartitionColumns.toStructType, + context = taskAttemptContext) + newWriter.initConverter(description.nonPartitionColumns.toStructType) + newWriter + } + + override def execute(iter: Iterator[InternalRow]): Unit = { + // We should first sort by partition columns, then bucket id, and finally sorting columns. + val sortingExpressions: Seq[Expression] = + description.partitionColumns ++ bucketIdExpression ++ sortColumns + val getSortingKey = UnsafeProjection.create(sortingExpressions, description.allColumns) + + val sortingKeySchema = StructType(sortingExpressions.map { + case a: Attribute => StructField(a.name, a.dataType, a.nullable) + // The sorting expressions are all `Attribute` except bucket id. + case _ => StructField("bucketId", IntegerType, nullable = false) + }) + + // Returns the data columns to be written given an input row + val getOutputRow = UnsafeProjection.create( + description.nonPartitionColumns, description.allColumns) + + // Returns the partition path given a partition key. + val getPartitionString = + UnsafeProjection.create(Seq(Concat(partitionStringExpression)), description.partitionColumns) + + // Sorts the data before write, so that we only need one writer at the same time. + val sorter = new UnsafeKVExternalSorter( + sortingKeySchema, + StructType.fromAttributes(description.nonPartitionColumns), + SparkEnv.get.blockManager, + SparkEnv.get.serializerManager, + TaskContext.get().taskMemoryManager().pageSizeBytes, + SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", + UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD)) + + while (iter.hasNext) { + val currentRow = iter.next() + sorter.insertKV(getSortingKey(currentRow), getOutputRow(currentRow)) + } + logInfo(s"Sorting complete. 
Writing out partition files one at a time.") + + val getBucketingKey: InternalRow => InternalRow = if (sortColumns.isEmpty) { + identity + } else { + UnsafeProjection.create(sortingExpressions.dropRight(sortColumns.length).zipWithIndex.map { + case (expr, ordinal) => BoundReference(ordinal, expr.dataType, expr.nullable) + }) + } + + val sortedIterator = sorter.sortedIterator() + + // If anything below fails, we should abort the task. + var currentKey: UnsafeRow = null + while (sortedIterator.next()) { + val nextKey = getBucketingKey(sortedIterator.getKey).asInstanceOf[UnsafeRow] + if (currentKey != nextKey) { + if (currentWriter != null) { + currentWriter.close() + currentWriter = null + } + currentKey = nextKey.copy() + logDebug(s"Writing partition: $currentKey") + + currentWriter = newOutputWriter(currentKey, getPartitionString) + } + currentWriter.writeInternal(sortedIterator.getValue) + } + if (currentWriter != null) { + currentWriter.close() + currentWriter = null + } + } + + override def releaseResources(): Unit = { + if (currentWriter != null) { + currentWriter.close() + currentWriter = null + } + } + } + + private def setupDriverCommitter(job: Job, path: String, isAppend: Boolean): OutputCommitter = { + // Setup IDs + val jobId = SparkHadoopWriter.createJobID(new Date, 0) + val taskId = new TaskID(jobId, TaskType.MAP, 0) + val taskAttemptId = new TaskAttemptID(taskId, 0) + + // Set up the configuration object + job.getConfiguration.set("mapred.job.id", jobId.toString) + job.getConfiguration.set("mapred.tip.id", taskAttemptId.getTaskID.toString) + job.getConfiguration.set("mapred.task.id", taskAttemptId.toString) + job.getConfiguration.setBoolean("mapred.task.is.map", true) + job.getConfiguration.setInt("mapred.task.partition", 0) + + // This UUID is sent to executor side together with the serialized `Configuration` object within + // the `Job` instance. `OutputWriters` on the executor side should use this UUID to generate + // unique task output files. + // This UUID is used to avoid output file name collision between different appending write jobs. + // These jobs may belong to different SparkContext instances. Concrete data source + // implementations may use this UUID to generate unique file names (e.g., + // `part-r--.parquet`). The reason why this ID is used to identify a job + // rather than a single task output file is that, speculative tasks must generate the same + // output file name as the original task. + job.getConfiguration.set(WriterContainer.DATASOURCE_WRITEJOBUUID, UUID.randomUUID().toString) + + val taskAttemptContext = new TaskAttemptContextImpl(job.getConfiguration, taskAttemptId) + val outputCommitter = newOutputCommitter( + job.getOutputFormatClass, taskAttemptContext, path, isAppend) + outputCommitter.setupJob(job) + outputCommitter + } + + private def newOutputCommitter( + outputFormatClass: Class[_ <: OutputFormat[_, _]], + context: TaskAttemptContext, + path: String, + isAppend: Boolean): OutputCommitter = { + val defaultOutputCommitter = outputFormatClass.newInstance().getOutputCommitter(context) + + if (isAppend) { + // If we are appending data to an existing dir, we will only use the output committer + // associated with the file output format since it is not safe to use a custom + // committer for appending. For example, in S3, direct parquet output committer may + // leave partial data in the destination dir when the appending job fails. 
+ // See SPARK-8578 for more details + logInfo( + s"Using default output committer ${defaultOutputCommitter.getClass.getCanonicalName} " + + "for appending.") + defaultOutputCommitter + } else { + val configuration = context.getConfiguration + val clazz = + configuration.getClass(SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter]) + + if (clazz != null) { + logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}") + + // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat + // has an associated output committer. To override this output committer, + // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS. + // If a data source needs to override the output committer, it needs to set the + // output committer in prepareForWrite method. + if (classOf[FileOutputCommitter].isAssignableFrom(clazz)) { + // The specified output committer is a FileOutputCommitter. + // So, we will use the FileOutputCommitter-specified constructor. + val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext]) + ctor.newInstance(new Path(path), context) + } else { + // The specified output committer is just an OutputCommitter. + // So, we will use the no-argument constructor. + val ctor = clazz.getDeclaredConstructor() + ctor.newInstance() + } + } else { + // If output committer class is not set, we will use the one associated with the + // file output format. + logInfo( + s"Using output committer class ${defaultOutputCommitter.getClass.getCanonicalName}") + defaultOutputCommitter + } + } + } +} + +object WriterContainer { + val DATASOURCE_WRITEJOBUUID = "spark.sql.sources.writeJobUUID" +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala deleted file mode 100644 index 253aa4405defa..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ /dev/null @@ -1,445 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.execution.datasources - -import java.util.{Date, UUID} - -import org.apache.hadoop.fs.Path -import org.apache.hadoop.mapreduce._ -import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceFileOutputCommitter} -import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl - -import org.apache.spark._ -import org.apache.spark.internal.Logging -import org.apache.spark.mapred.SparkHadoopMapRedUtil -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.catalog.BucketSpec -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.UnsafeKVExternalSorter -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} -import org.apache.spark.util.{SerializableConfiguration, Utils} -import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter - - -/** A container for all the details required when writing to a table. */ -private[datasources] case class WriteRelation( - sparkSession: SparkSession, - dataSchema: StructType, - path: String, - prepareJobForWrite: Job => OutputWriterFactory, - bucketSpec: Option[BucketSpec]) - -object WriterContainer { - val DATASOURCE_WRITEJOBUUID = "spark.sql.sources.writeJobUUID" -} - -private[datasources] abstract class BaseWriterContainer( - @transient val relation: WriteRelation, - @transient private val job: Job, - isAppend: Boolean) - extends Logging with Serializable { - - protected val dataSchema = relation.dataSchema - - protected val serializableConf = - new SerializableConfiguration(job.getConfiguration) - - // This UUID is used to avoid output file name collision between different appending write jobs. - // These jobs may belong to different SparkContext instances. Concrete data source implementations - // may use this UUID to generate unique file names (e.g., `part-r--.parquet`). - // The reason why this ID is used to identify a job rather than a single task output file is - // that, speculative tasks must generate the same output file name as the original task. - private val uniqueWriteJobId = UUID.randomUUID() - - // This is only used on driver side. - @transient private val jobContext: JobContext = job - - // The following fields are initialized and used on both driver and executor side. - @transient protected var outputCommitter: OutputCommitter = _ - @transient private var jobId: JobID = _ - @transient private var taskId: TaskID = _ - @transient private var taskAttemptId: TaskAttemptID = _ - @transient protected var taskAttemptContext: TaskAttemptContext = _ - - protected val outputPath: String = relation.path - - protected var outputWriterFactory: OutputWriterFactory = _ - - private var outputFormatClass: Class[_ <: OutputFormat[_, _]] = _ - - def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit - - def driverSideSetup(): Unit = { - setupIDs(0, 0, 0) - setupConf() - - // This UUID is sent to executor side together with the serialized `Configuration` object within - // the `Job` instance. `OutputWriters` on the executor side should use this UUID to generate - // unique task output files. - job.getConfiguration.set(WriterContainer.DATASOURCE_WRITEJOBUUID, uniqueWriteJobId.toString) - - // Order of the following two lines is important. 
For Hadoop 1, TaskAttemptContext constructor - // clones the Configuration object passed in. If we initialize the TaskAttemptContext first, - // configurations made in prepareJobForWrite(job) are not populated into the TaskAttemptContext. - // - // Also, the `prepareJobForWrite` call must happen before initializing output format and output - // committer, since their initialization involve the job configuration, which can be potentially - // decorated in `prepareJobForWrite`. - outputWriterFactory = relation.prepareJobForWrite(job) - taskAttemptContext = new TaskAttemptContextImpl(serializableConf.value, taskAttemptId) - - outputFormatClass = job.getOutputFormatClass - outputCommitter = newOutputCommitter(taskAttemptContext) - outputCommitter.setupJob(jobContext) - } - - def executorSideSetup(taskContext: TaskContext): Unit = { - setupIDs(taskContext.stageId(), taskContext.partitionId(), taskContext.attemptNumber()) - setupConf() - taskAttemptContext = new TaskAttemptContextImpl(serializableConf.value, taskAttemptId) - outputCommitter = newOutputCommitter(taskAttemptContext) - outputCommitter.setupTask(taskAttemptContext) - } - - protected def getWorkPath: String = { - outputCommitter match { - // FileOutputCommitter writes to a temporary location returned by `getWorkPath`. - case f: MapReduceFileOutputCommitter => f.getWorkPath.toString - case _ => outputPath - } - } - - protected def newOutputWriter(path: String, bucketId: Option[Int] = None): OutputWriter = { - try { - outputWriterFactory.newInstance(path, bucketId, dataSchema, taskAttemptContext) - } catch { - case e: org.apache.hadoop.fs.FileAlreadyExistsException => - if (outputCommitter.getClass.getName.contains("Direct")) { - // SPARK-11382: DirectParquetOutputCommitter is not idempotent, meaning on retry - // attempts, the task will fail because the output file is created from a prior attempt. - // This often means the most visible error to the user is misleading. Augment the error - // to tell the user to look for the actual error. - throw new SparkException("The output file already exists but this could be due to a " + - "failure from an earlier attempt. Look through the earlier logs or stage page for " + - "the first error.\n File exists error: " + e, e) - } else { - throw e - } - } - } - - private def newOutputCommitter(context: TaskAttemptContext): OutputCommitter = { - val defaultOutputCommitter = outputFormatClass.newInstance().getOutputCommitter(context) - - if (isAppend) { - // If we are appending data to an existing dir, we will only use the output committer - // associated with the file output format since it is not safe to use a custom - // committer for appending. For example, in S3, direct parquet output committer may - // leave partial data in the destination dir when the appending job fails. - // - // See SPARK-8578 for more details - logInfo( - s"Using default output committer ${defaultOutputCommitter.getClass.getCanonicalName} " + - "for appending.") - defaultOutputCommitter - } else { - val configuration = context.getConfiguration - val committerClass = configuration.getClass( - SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter]) - - Option(committerClass).map { clazz => - logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}") - - // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat - // has an associated output committer. 
To override this output committer, - // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS. - // If a data source needs to override the output committer, it needs to set the - // output committer in prepareForWrite method. - if (classOf[MapReduceFileOutputCommitter].isAssignableFrom(clazz)) { - // The specified output committer is a FileOutputCommitter. - // So, we will use the FileOutputCommitter-specified constructor. - val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext]) - ctor.newInstance(new Path(outputPath), context) - } else { - // The specified output committer is just an OutputCommitter. - // So, we will use the no-argument constructor. - val ctor = clazz.getDeclaredConstructor() - ctor.newInstance() - } - }.getOrElse { - // If output committer class is not set, we will use the one associated with the - // file output format. - logInfo( - s"Using output committer class ${defaultOutputCommitter.getClass.getCanonicalName}") - defaultOutputCommitter - } - } - } - - private def setupIDs(jobId: Int, splitId: Int, attemptId: Int): Unit = { - this.jobId = SparkHadoopWriter.createJobID(new Date, jobId) - this.taskId = new TaskID(this.jobId, TaskType.MAP, splitId) - this.taskAttemptId = new TaskAttemptID(taskId, attemptId) - } - - private def setupConf(): Unit = { - serializableConf.value.set("mapred.job.id", jobId.toString) - serializableConf.value.set("mapred.tip.id", taskAttemptId.getTaskID.toString) - serializableConf.value.set("mapred.task.id", taskAttemptId.toString) - serializableConf.value.setBoolean("mapred.task.is.map", true) - serializableConf.value.setInt("mapred.task.partition", 0) - } - - def commitTask(): Unit = { - SparkHadoopMapRedUtil.commitTask(outputCommitter, taskAttemptContext, jobId.getId, taskId.getId) - } - - def abortTask(): Unit = { - if (outputCommitter != null) { - outputCommitter.abortTask(taskAttemptContext) - } - logError(s"Task attempt $taskAttemptId aborted.") - } - - def commitJob(): Unit = { - outputCommitter.commitJob(jobContext) - logInfo(s"Job $jobId committed.") - } - - def abortJob(): Unit = { - if (outputCommitter != null) { - outputCommitter.abortJob(jobContext, JobStatus.State.FAILED) - } - logError(s"Job $jobId aborted.") - } -} - -/** - * A writer that writes all of the rows in a partition to a single file. - */ -private[datasources] class DefaultWriterContainer( - relation: WriteRelation, - job: Job, - isAppend: Boolean) - extends BaseWriterContainer(relation, job, isAppend) { - - def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { - executorSideSetup(taskContext) - var writer = newOutputWriter(getWorkPath) - writer.initConverter(dataSchema) - - // If anything below fails, we should abort the task. - try { - Utils.tryWithSafeFinallyAndFailureCallbacks { - while (iterator.hasNext) { - val internalRow = iterator.next() - writer.writeInternal(internalRow) - } - commitTask() - }(catchBlock = abortTask()) - } catch { - case t: Throwable => - throw new SparkException("Task failed while writing rows", t) - } - - def commitTask(): Unit = { - try { - if (writer != null) { - writer.close() - writer = null - } - super.commitTask() - } catch { - case cause: Throwable => - // This exception will be handled in `InsertIntoHadoopFsRelation.insert$writeRows`, and - // will cause `abortTask()` to be invoked. 
- throw new RuntimeException("Failed to commit task", cause) - } - } - - def abortTask(): Unit = { - try { - if (writer != null) { - writer.close() - } - } finally { - super.abortTask() - } - } - } -} - -/** - * A writer that dynamically opens files based on the given partition columns. Internally this is - * done by maintaining a HashMap of open files until `maxFiles` is reached. If this occurs, the - * writer externally sorts the remaining rows and then writes out them out one file at a time. - */ -private[datasources] class DynamicPartitionWriterContainer( - relation: WriteRelation, - job: Job, - partitionColumns: Seq[Attribute], - dataColumns: Seq[Attribute], - inputSchema: Seq[Attribute], - defaultPartitionName: String, - maxOpenFiles: Int, - isAppend: Boolean) - extends BaseWriterContainer(relation, job, isAppend) { - - private val bucketSpec = relation.bucketSpec - - private val bucketColumns: Seq[Attribute] = bucketSpec.toSeq.flatMap { - spec => spec.bucketColumnNames.map(c => inputSchema.find(_.name == c).get) - } - - private val sortColumns: Seq[Attribute] = bucketSpec.toSeq.flatMap { - spec => spec.sortColumnNames.map(c => inputSchema.find(_.name == c).get) - } - - private def bucketIdExpression: Option[Expression] = bucketSpec.map { spec => - // Use `HashPartitioning.partitionIdExpression` as our bucket id expression, so that we can - // guarantee the data distribution is same between shuffle and bucketed data source, which - // enables us to only shuffle one side when join a bucketed table and a normal one. - HashPartitioning(bucketColumns, spec.numBuckets).partitionIdExpression - } - - // Expressions that given a partition key build a string like: col1=val/col2=val/... - private def partitionStringExpression: Seq[Expression] = { - partitionColumns.zipWithIndex.flatMap { case (c, i) => - val escaped = - ScalaUDF( - PartitioningUtils.escapePathName _, - StringType, - Seq(Cast(c, StringType)), - Seq(StringType)) - val str = If(IsNull(c), Literal(defaultPartitionName), escaped) - val partitionName = Literal(c.name + "=") :: str :: Nil - if (i == 0) partitionName else Literal(Path.SEPARATOR) :: partitionName - } - } - - private def getBucketIdFromKey(key: InternalRow): Option[Int] = bucketSpec.map { _ => - key.getInt(partitionColumns.length) - } - - /** - * Open and returns a new OutputWriter given a partition key and optional bucket id. - * If bucket id is specified, we will append it to the end of the file name, but before the - * file extension, e.g. part-r-00009-ea518ad4-455a-4431-b471-d24e03814677-00002.gz.parquet - */ - private def newOutputWriter( - key: InternalRow, - getPartitionString: UnsafeProjection): OutputWriter = { - val path = if (partitionColumns.nonEmpty) { - val partitionPath = getPartitionString(key).getString(0) - new Path(getWorkPath, partitionPath).toString - } else { - getWorkPath - } - val bucketId = getBucketIdFromKey(key) - val newWriter = super.newOutputWriter(path, bucketId) - newWriter.initConverter(dataSchema) - newWriter - } - - def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { - executorSideSetup(taskContext) - - // We should first sort by partition columns, then bucket id, and finally sorting columns. 
- val sortingExpressions: Seq[Expression] = partitionColumns ++ bucketIdExpression ++ sortColumns - val getSortingKey = UnsafeProjection.create(sortingExpressions, inputSchema) - - val sortingKeySchema = StructType(sortingExpressions.map { - case a: Attribute => StructField(a.name, a.dataType, a.nullable) - // The sorting expressions are all `Attribute` except bucket id. - case _ => StructField("bucketId", IntegerType, nullable = false) - }) - - // Returns the data columns to be written given an input row - val getOutputRow = UnsafeProjection.create(dataColumns, inputSchema) - - // Returns the partition path given a partition key. - val getPartitionString = - UnsafeProjection.create(Concat(partitionStringExpression) :: Nil, partitionColumns) - - // Sorts the data before write, so that we only need one writer at the same time. - // TODO: inject a local sort operator in planning. - val sorter = new UnsafeKVExternalSorter( - sortingKeySchema, - StructType.fromAttributes(dataColumns), - SparkEnv.get.blockManager, - SparkEnv.get.serializerManager, - TaskContext.get().taskMemoryManager().pageSizeBytes, - SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", - UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD)) - - while (iterator.hasNext) { - val currentRow = iterator.next() - sorter.insertKV(getSortingKey(currentRow), getOutputRow(currentRow)) - } - logInfo(s"Sorting complete. Writing out partition files one at a time.") - - val getBucketingKey: InternalRow => InternalRow = if (sortColumns.isEmpty) { - identity - } else { - UnsafeProjection.create(sortingExpressions.dropRight(sortColumns.length).zipWithIndex.map { - case (expr, ordinal) => BoundReference(ordinal, expr.dataType, expr.nullable) - }) - } - - val sortedIterator = sorter.sortedIterator() - - // If anything below fails, we should abort the task. 
- var currentWriter: OutputWriter = null - try { - Utils.tryWithSafeFinallyAndFailureCallbacks { - var currentKey: UnsafeRow = null - while (sortedIterator.next()) { - val nextKey = getBucketingKey(sortedIterator.getKey).asInstanceOf[UnsafeRow] - if (currentKey != nextKey) { - if (currentWriter != null) { - currentWriter.close() - currentWriter = null - } - currentKey = nextKey.copy() - logDebug(s"Writing partition: $currentKey") - - currentWriter = newOutputWriter(currentKey, getPartitionString) - } - currentWriter.writeInternal(sortedIterator.getValue) - } - if (currentWriter != null) { - currentWriter.close() - currentWriter = null - } - - commitTask() - }(catchBlock = { - if (currentWriter != null) { - currentWriter.close() - } - abortTask() - }) - } catch { - case t: Throwable => - throw new SparkException("Task failed while writing rows", t) - } - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 8afd39d657865..9061b1b9a2638 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -339,13 +339,6 @@ object SQLConf { .booleanConf .createWithDefault(true) - val PARTITION_MAX_FILES = - SQLConfigBuilder("spark.sql.sources.maxConcurrentWrites") - .doc("The maximum number of concurrent files to open before falling back on sorting when " + - "writing out files using dynamic partitioning.") - .intConf - .createWithDefault(1) - val BUCKETING_ENABLED = SQLConfigBuilder("spark.sql.sources.bucketing.enabled") .doc("When false, we will treat bucketed table as normal table") .booleanConf @@ -733,8 +726,6 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { def partitionColumnTypeInferenceEnabled: Boolean = getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE) - def partitionMaxFiles: Int = getConf(PARTITION_MAX_FILES) - def parallelPartitionDiscoveryThreshold: Int = getConf(SQLConf.PARALLEL_PARTITION_DISCOVERY_THRESHOLD) From 39755169fb5bb07332eef263b4c18ede1528812d Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Wed, 19 Oct 2016 23:41:38 -0700 Subject: [PATCH 062/162] [SPARK-18003][SPARK CORE] Fix bug of RDD zipWithIndex & zipWithUniqueId index value overflowing ## What changes were proposed in this pull request? - Fix bug of RDD `zipWithIndex` generating wrong result when one partition contains more than 2147483647 records. - Fix bug of RDD `zipWithUniqueId` generating wrong result when one partition contains more than 2147483647 records. ## How was this patch tested? test added. Author: WeichenXu Closes #15550 from WeichenXu123/fix_rdd_zipWithIndex_overflow. 
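For context, an illustrative sketch (not part of the patch; the object and method names below are made up) of why a Long-based index is needed: Scala's built-in `Iterator.zipWithIndex` counts with an `Int`, so the index wraps once a single partition holds more than `Int.MaxValue` records, which is exactly the overflow the new `Utils.getIteratorZipWithIndex` helper avoids.

```scala
object ZipWithLongIndexSketch {
  // Long-indexed variant of zipWithIndex, mirroring the idea of the new
  // Utils.getIteratorZipWithIndex helper: the counter is a Long, so it does
  // not overflow after Int.MaxValue elements.
  def zipWithLongIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] =
    new Iterator[(T, Long)] {
      private var index: Long = startIndex - 1L
      override def hasNext: Boolean = iterator.hasNext
      override def next(): (T, Long) = {
        index += 1L
        (iterator.next(), index)
      }
    }

  def main(args: Array[String]): Unit = {
    // Starting just below Int.MaxValue, the indices keep increasing past 2147483647
    // instead of wrapping to a negative Int.
    val zipped = zipWithLongIndex(Iterator(0, 1, 2), Int.MaxValue - 1L).toArray
    println(zipped.mkString(", ")) // (0,2147483646), (1,2147483647), (2,2147483648)
  }
}
```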
--- .../src/main/scala/org/apache/spark/rdd/RDD.scala | 2 +- .../org/apache/spark/rdd/ZippedWithIndexRDD.scala | 5 ++--- .../main/scala/org/apache/spark/util/Utils.scala | 15 +++++++++++++++ .../scala/org/apache/spark/util/UtilsSuite.scala | 7 +++++++ 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 6dc334ceb52ea..be119578d2c35 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1278,7 +1278,7 @@ abstract class RDD[T: ClassTag]( def zipWithUniqueId(): RDD[(T, Long)] = withScope { val n = this.partitions.length.toLong this.mapPartitionsWithIndex { case (k, iter) => - iter.zipWithIndex.map { case (item, i) => + Utils.getIteratorZipWithIndex(iter, 0L).map { case (item, i) => (item, i * n + k) } } diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala index b5738b9a95c36..b0e5ba0865c63 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala @@ -64,8 +64,7 @@ class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = { val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition] - firstParent[T].iterator(split.prev, context).zipWithIndex.map { x => - (x._1, split.startIndex + x._2) - } + val parentIter = firstParent[T].iterator(split.prev, context) + Utils.getIteratorZipWithIndex(parentIter, split.startIndex) } } diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 7fba901b85695..bfc609419ccdb 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1759,6 +1759,21 @@ private[spark] object Utils extends Logging { count } + /** + * Generate a zipWithIndex iterator, avoid index value overflowing problem + * in scala's zipWithIndex + */ + def getIteratorZipWithIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] = { + new Iterator[(T, Long)] { + var index: Long = startIndex - 1L + def hasNext: Boolean = iterator.hasNext + def next(): (T, Long) = { + index += 1L + (iterator.next(), index) + } + } + } + /** * Creates a symlink. 
* diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index b427f7fb50158..4dda80f10a08a 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -396,6 +396,13 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(Utils.getIteratorSize(iterator) === 5L) } + test("getIteratorZipWithIndex") { + val iterator = Utils.getIteratorZipWithIndex(Iterator(0, 1, 2), -1L + Int.MaxValue) + assert(iterator.toArray === Array( + (0, -1L + Int.MaxValue), (1, 0L + Int.MaxValue), (2, 1L + Int.MaxValue) + )) + } + test("doesDirectoryContainFilesNewerThan") { // create some temporary directories and files val parent: File = Utils.createTempDir() From 4bd17c4606764242bc29888b8eedc8e4b5a00f46 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 19 Oct 2016 23:55:05 -0700 Subject: [PATCH 063/162] [SPARK-17991][SQL] Enable metastore partition pruning by default. ## What changes were proposed in this pull request? This should apply to non-converted metastore relations. WIP to see if this causes any test failures. ## How was this patch tested? Existing tests. Author: Eric Liang Closes #15475 from ericl/try-enabling-pruning. --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 9061b1b9a2638..ebf4fad5cbcff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -267,7 +267,7 @@ object SQLConf { .doc("When true, some predicates will be pushed down into the Hive metastore so that " + "unmatching partitions can be eliminated earlier.") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val HIVE_FILESOURCE_PARTITION_PRUNING = SQLConfigBuilder("spark.sql.hive.filesourcePartitionPruning") From c2c107abad8b462218d33c70b946e840663228a1 Mon Sep 17 00:00:00 2001 From: Mike Ihbe Date: Thu, 20 Oct 2016 09:49:58 +0100 Subject: [PATCH 064/162] [SPARK-11653][DEPLOY] Allow spark-daemon.sh to run in the foreground ## What changes were proposed in this pull request? Add a SPARK_NO_DAEMONIZE environment variable flag to spark-daemon.sh that causes the process it would run to be run in the foreground. It looks like there has been some prior work in https://github.com/apache/spark/pull/3881, but there was some talk about these being refactored. I'm not sure if that happened or not, but that PR is almost 2 years old at this point so it was worth revisiting. ## How was this patch tested? ./dev/run-tests still seems to work. It doesn't look like these scripts have tests, but if I missed them just let me know. Author: Mike Ihbe Closes #15338 from mikejihbe/SPARK-11653. --- conf/spark-env.sh.template | 1 + sbin/spark-daemon.sh | 54 ++++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template index c750c72d19880..5c1e876ef9afc 100755 --- a/conf/spark-env.sh.template +++ b/conf/spark-env.sh.template @@ -63,3 +63,4 @@ # - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) # - SPARK_IDENT_STRING A string representing this instance of spark. 
(Default: $USER) # - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) +# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. diff --git a/sbin/spark-daemon.sh b/sbin/spark-daemon.sh index 59823571124f1..061019a55e997 100755 --- a/sbin/spark-daemon.sh +++ b/sbin/spark-daemon.sh @@ -27,6 +27,7 @@ # SPARK_PID_DIR The pid files are stored. /tmp by default. # SPARK_IDENT_STRING A string representing this instance of spark. $USER by default # SPARK_NICENESS The scheduling priority for daemons. Defaults to 0. +# SPARK_NO_DAEMONIZE If set, will run the proposed command in the foreground. It will not output a PID file. ## usage="Usage: spark-daemon.sh [--config ] (start|stop|submit|status) " @@ -122,6 +123,35 @@ if [ "$SPARK_NICENESS" = "" ]; then export SPARK_NICENESS=0 fi +execute_command() { + local command="$@" + if [ -z ${SPARK_NO_DAEMONIZE+set} ]; then + nohup -- $command >> $log 2>&1 < /dev/null & + newpid="$!" + + echo "$newpid" > "$pid" + + # Poll for up to 5 seconds for the java process to start + for i in {1..10} + do + if [[ $(ps -p "$newpid" -o comm=) =~ "java" ]]; then + break + fi + sleep 0.5 + done + + sleep 2 + # Check if the process has died; in that case we'll tail the log so the user can see + if [[ ! $(ps -p "$newpid" -o comm=) =~ "java" ]]; then + echo "failed to launch $command:" + tail -2 "$log" | sed 's/^/ /' + echo "full log in $log" + fi + else + $command + fi +} + run_command() { mode="$1" shift @@ -146,13 +176,11 @@ run_command() { case "$mode" in (class) - nohup nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-class $command "$@" >> "$log" 2>&1 < /dev/null & - newpid="$!" + execute_command nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-class $command $@ ;; (submit) - nohup nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-submit --class $command "$@" >> "$log" 2>&1 < /dev/null & - newpid="$!" + execute_command nice -n "$SPARK_NICENESS" bash "${SPARK_HOME}"/bin/spark-submit --class $command $@ ;; (*) @@ -161,24 +189,6 @@ run_command() { ;; esac - echo "$newpid" > "$pid" - - #Poll for up to 5 seconds for the java process to start - for i in {1..10} - do - if [[ $(ps -p "$newpid" -o comm=) =~ "java" ]]; then - break - fi - sleep 0.5 - done - - sleep 2 - # Check if the process has died; in that case we'll tail the log so the user can see - if [[ ! $(ps -p "$newpid" -o comm=) =~ "java" ]]; then - echo "failed to launch $command:" - tail -2 "$log" | sed 's/^/ /' - echo "full log in $log" - fi } case $option in From 986a3b8b5bedb1d64e2cf7c95bfdf5505f3e8c69 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 20 Oct 2016 09:53:12 +0100 Subject: [PATCH 065/162] [SPARK-17796][SQL] Support wildcard character in filename for LOAD DATA LOCAL INPATH ## What changes were proposed in this pull request? Currently, Spark 2.0 raises an `input path does not exist` AnalysisException if the file name contains '*'. It is misleading since it occurs when there exist some matched files. Also, it was a supported feature in Spark 1.6.2. This PR aims to support wildcard characters in filename for `LOAD DATA LOCAL INPATH` SQL command like Spark 1.6.2. **Reported Error Scenario** ```scala scala> sql("CREATE TABLE t(a string)") res0: org.apache.spark.sql.DataFrame = [] scala> sql("LOAD DATA LOCAL INPATH '/tmp/x*' INTO TABLE t") org.apache.spark.sql.AnalysisException: LOAD DATA input path does not exist: /tmp/x*; ``` ## How was this patch tested? Pass the Jenkins test with a new test case. 
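As an aside, the matching logic added to `LoadDataCommand` boils down to globbing the file name against the children of its parent directory via the JDK's `PathMatcher`. A minimal standalone sketch of that check (illustrative only; the object and method names are hypothetical, and the error handling for wildcards in the directory part is omitted):

```scala
import java.io.File
import java.nio.file.FileSystems

object LocalGlobSketch {
  // Returns true if at least one file in the parent directory of `pathWithWildcard`
  // matches the glob pattern, e.g. matchesLocalPath("/tmp/x*").
  def matchesLocalPath(pathWithWildcard: String): Boolean = {
    val fileSystem = FileSystems.getDefault
    val pathPattern = fileSystem.getPath(pathWithWildcard)
    val dir = pathPattern.getParent.toString
    val files = new File(dir).listFiles()
    if (files == null) {
      false
    } else {
      val matcher = fileSystem.getPathMatcher("glob:" + pathPattern.toAbsolutePath)
      files.exists(f => matcher.matches(fileSystem.getPath(f.getAbsolutePath)))
    }
  }

  def main(args: Array[String]): Unit = {
    println(matchesLocalPath("/tmp/part-r*"))
  }
}
```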
Author: Dongjoon Hyun Closes #15376 from dongjoon-hyun/SPARK-17796. --- .../spark/sql/execution/command/tables.scala | 23 +++++++++++++- .../sql/hive/execution/SQLQuerySuite.scala | 30 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 403b479a0e1bc..4c0675adb4973 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.command import java.io.File import java.net.URI +import java.nio.file.FileSystems import java.util.Date import scala.collection.mutable.ArrayBuffer @@ -245,7 +246,27 @@ case class LoadDataCommand( val loadPath = if (isLocal) { val uri = Utils.resolveURI(path) - if (!new File(uri.getPath()).exists()) { + val filePath = uri.getPath() + val exists = if (filePath.contains("*")) { + val fileSystem = FileSystems.getDefault + val pathPattern = fileSystem.getPath(filePath) + val dir = pathPattern.getParent.toString + if (dir.contains("*")) { + throw new AnalysisException( + s"LOAD DATA input path allows only filename wildcard: $path") + } + + val files = new File(dir).listFiles() + if (files == null) { + false + } else { + val matcher = fileSystem.getPathMatcher("glob:" + pathPattern.toAbsolutePath) + files.exists(f => matcher.matches(fileSystem.getPath(f.getAbsolutePath))) + } + } else { + new File(filePath).exists() + } + if (!exists) { throw new AnalysisException(s"LOAD DATA input path does not exist: $path") } uri diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index e26b6b57ef569..495b4f874a1d6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -17,11 +17,14 @@ package org.apache.spark.sql.hive.execution +import java.io.{File, PrintWriter} +import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import scala.sys.process.{Process, ProcessLogger} import scala.util.Try +import com.google.common.io.Files import org.apache.hadoop.fs.Path import org.apache.spark.sql._ @@ -1917,6 +1920,33 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } + test("SPARK-17796 Support wildcard character in filename for LOAD DATA LOCAL INPATH") { + withTempDir { dir => + for (i <- 1 to 3) { + Files.write(s"$i", new File(s"$dir/part-r-0000$i"), StandardCharsets.UTF_8) + } + for (i <- 5 to 7) { + Files.write(s"$i", new File(s"$dir/part-s-0000$i"), StandardCharsets.UTF_8) + } + + withTable("load_t") { + sql("CREATE TABLE load_t (a STRING)") + sql(s"LOAD DATA LOCAL INPATH '$dir/*part-r*' INTO TABLE load_t") + checkAnswer(sql("SELECT * FROM load_t"), Seq(Row("1"), Row("2"), Row("3"))) + + val m = intercept[AnalysisException] { + sql("LOAD DATA LOCAL INPATH '/non-exist-folder/*part*' INTO TABLE load_t") + }.getMessage + assert(m.contains("LOAD DATA input path does not exist")) + + val m2 = intercept[AnalysisException] { + sql(s"LOAD DATA LOCAL INPATH '$dir*/*part*' INTO TABLE load_t") + }.getMessage + assert(m2.contains("LOAD DATA input path allows only filename wildcard")) + } + } + } + def testCommandAvailable(command: String): Boolean = { val 
attempt = Try(Process(command).run(ProcessLogger(_ => ())).exitValue()) attempt.isSuccess && attempt.get == 0 From e895bc25481f73b433a3cc3ad46df066ec602862 Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Thu, 20 Oct 2016 19:39:25 +0800 Subject: [PATCH 066/162] [SPARK-17860][SQL] SHOW COLUMN's database conflict check should respect case sensitivity configuration ## What changes were proposed in this pull request? SHOW COLUMNS command validates the user supplied database name with database name from qualified table name name to make sure both of them are consistent. This comparison should respect case sensitivity. ## How was this patch tested? Added tests in DDLSuite and existing tests were moved to use new sql based test infrastructure. Author: Dilip Biswal Closes #15423 from dilipbiswal/dkb_show_column_fix. --- .../spark/sql/execution/SparkSqlParser.scala | 12 +- .../spark/sql/execution/command/tables.scala | 14 +- .../sql-tests/inputs/show_columns.sql | 58 +++++ .../sql-tests/results/show_columns.sql.out | 217 ++++++++++++++++++ .../apache/spark/sql/SQLQueryTestSuite.scala | 1 + .../execution/command/DDLCommandSuite.scala | 18 +- .../sql/execution/command/DDLSuite.scala | 17 ++ .../sql/hive/execution/HiveCommandSuite.scala | 23 +- .../hive/execution/HiveComparisonTest.scala | 2 +- 9 files changed, 318 insertions(+), 44 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/show_columns.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/show_columns.sql.out diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index ea22b02d40b80..1cc166d5a7a9d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -168,17 +168,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { * }}} */ override def visitShowColumns(ctx: ShowColumnsContext): LogicalPlan = withOrigin(ctx) { - val table = visitTableIdentifier(ctx.tableIdentifier) - - val lookupTable = Option(ctx.db) match { - case None => table - case Some(db) if table.database.exists(_ != db) => - operationNotAllowed( - s"SHOW COLUMNS with conflicting databases: '$db' != '${table.database.get}'", - ctx) - case Some(db) => TableIdentifier(table.identifier, Some(db.getText)) - } - ShowColumnsCommand(lookupTable) + ShowColumnsCommand(Option(ctx.db).map(_.getText), visitTableIdentifier(ctx.tableIdentifier)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 4c0675adb4973..aec25430b719d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -671,14 +671,24 @@ case class ShowTablePropertiesCommand(table: TableIdentifier, propertyKey: Optio * SHOW COLUMNS (FROM | IN) table_identifier [(FROM | IN) database]; * }}} */ -case class ShowColumnsCommand(tableName: TableIdentifier) extends RunnableCommand { +case class ShowColumnsCommand( + databaseName: Option[String], + tableName: TableIdentifier) extends RunnableCommand { override val output: Seq[Attribute] = { AttributeReference("col_name", StringType, nullable = false)() :: Nil } override def run(sparkSession: SparkSession): Seq[Row] = { val catalog = sparkSession.sessionState.catalog - val 
table = catalog.getTempViewOrPermanentTableMetadata(tableName) + val resolver = sparkSession.sessionState.conf.resolver + val lookupTable = databaseName match { + case None => tableName + case Some(db) if tableName.database.exists(!resolver(_, db)) => + throw new AnalysisException( + s"SHOW COLUMNS with conflicting databases: '$db' != '${tableName.database.get}'") + case Some(db) => TableIdentifier(tableName.identifier, Some(db)) + } + val table = catalog.getTempViewOrPermanentTableMetadata(lookupTable) table.schema.map { c => Row(c.name) } diff --git a/sql/core/src/test/resources/sql-tests/inputs/show_columns.sql b/sql/core/src/test/resources/sql-tests/inputs/show_columns.sql new file mode 100644 index 0000000000000..3894082255088 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/show_columns.sql @@ -0,0 +1,58 @@ +CREATE DATABASE showdb; + +USE showdb; + +CREATE TABLE showcolumn1 (col1 int, `col 2` int); +CREATE TABLE showcolumn2 (price int, qty int) partitioned by (year int, month int); +CREATE TEMPORARY VIEW showColumn3 (col3 int, `col 4` int) USING parquet; +CREATE GLOBAL TEMP VIEW showColumn4 AS SELECT 1 as col1, 'abc' as `col 5`; + + +-- only table name +SHOW COLUMNS IN showcolumn1; + +-- qualified table name +SHOW COLUMNS IN showdb.showcolumn1; + +-- table name and database name +SHOW COLUMNS IN showcolumn1 FROM showdb; + +-- partitioned table +SHOW COLUMNS IN showcolumn2 IN showdb; + +-- Non-existent table. Raise an error in this case +SHOW COLUMNS IN badtable FROM showdb; + +-- database in table identifier and database name in different case +SHOW COLUMNS IN showdb.showcolumn1 from SHOWDB; + +-- different database name in table identifier and database name. +-- Raise an error in this case. +SHOW COLUMNS IN showdb.showcolumn1 FROM baddb; + +-- show column on temporary view +SHOW COLUMNS IN showcolumn3; + +-- error temp view can't be qualified with a database +SHOW COLUMNS IN showdb.showcolumn3; + +-- error temp view can't be qualified with a database +SHOW COLUMNS IN showcolumn3 FROM showdb; + +-- error global temp view needs to be qualified +SHOW COLUMNS IN showcolumn4; + +-- global temp view qualified with database +SHOW COLUMNS IN global_temp.showcolumn4; + +-- global temp view qualified with database +SHOW COLUMNS IN showcolumn4 FROM global_temp; + +DROP TABLE showcolumn1; +DROP TABLE showColumn2; +DROP VIEW showcolumn3; +DROP VIEW global_temp.showcolumn4; + +use default; + +DROP DATABASE showdb; diff --git a/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out b/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out new file mode 100644 index 0000000000000..832e6e25bb2bd --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out @@ -0,0 +1,217 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 25 + + +-- !query 0 +CREATE DATABASE showdb +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +USE showdb +-- !query 1 schema +struct<> +-- !query 1 output + + + +-- !query 2 +CREATE TABLE showcolumn1 (col1 int, `col 2` int) +-- !query 2 schema +struct<> +-- !query 2 output + + + +-- !query 3 +CREATE TABLE showcolumn2 (price int, qty int) partitioned by (year int, month int) +-- !query 3 schema +struct<> +-- !query 3 output + + + +-- !query 4 +CREATE TEMPORARY VIEW showColumn3 (col3 int, `col 4` int) USING parquet +-- !query 4 schema +struct<> +-- !query 4 output + + + +-- !query 5 +CREATE GLOBAL TEMP VIEW showColumn4 AS SELECT 1 as col1, 'abc' as `col 5` +-- !query 5 schema 
+struct<> +-- !query 5 output + + + +-- !query 6 +SHOW COLUMNS IN showcolumn1 +-- !query 6 schema +struct +-- !query 6 output +col 2 +col1 + + +-- !query 7 +SHOW COLUMNS IN showdb.showcolumn1 +-- !query 7 schema +struct +-- !query 7 output +col 2 +col1 + + +-- !query 8 +SHOW COLUMNS IN showcolumn1 FROM showdb +-- !query 8 schema +struct +-- !query 8 output +col 2 +col1 + + +-- !query 9 +SHOW COLUMNS IN showcolumn2 IN showdb +-- !query 9 schema +struct +-- !query 9 output +month +price +qty +year + + +-- !query 10 +SHOW COLUMNS IN badtable FROM showdb +-- !query 10 schema +struct<> +-- !query 10 output +org.apache.spark.sql.catalyst.analysis.NoSuchTableException +Table or view 'badtable' not found in database 'showdb'; + + +-- !query 11 +SHOW COLUMNS IN showdb.showcolumn1 from SHOWDB +-- !query 11 schema +struct +-- !query 11 output +col 2 +col1 + + +-- !query 12 +SHOW COLUMNS IN showdb.showcolumn1 FROM baddb +-- !query 12 schema +struct<> +-- !query 12 output +org.apache.spark.sql.AnalysisException +SHOW COLUMNS with conflicting databases: 'baddb' != 'showdb'; + + +-- !query 13 +SHOW COLUMNS IN showcolumn3 +-- !query 13 schema +struct +-- !query 13 output +col 4 +col3 + + +-- !query 14 +SHOW COLUMNS IN showdb.showcolumn3 +-- !query 14 schema +struct<> +-- !query 14 output +org.apache.spark.sql.catalyst.analysis.NoSuchTableException +Table or view 'showcolumn3' not found in database 'showdb'; + + +-- !query 15 +SHOW COLUMNS IN showcolumn3 FROM showdb +-- !query 15 schema +struct<> +-- !query 15 output +org.apache.spark.sql.catalyst.analysis.NoSuchTableException +Table or view 'showcolumn3' not found in database 'showdb'; + + +-- !query 16 +SHOW COLUMNS IN showcolumn4 +-- !query 16 schema +struct<> +-- !query 16 output +org.apache.spark.sql.catalyst.analysis.NoSuchTableException +Table or view 'showcolumn4' not found in database 'showdb'; + + +-- !query 17 +SHOW COLUMNS IN global_temp.showcolumn4 +-- !query 17 schema +struct +-- !query 17 output +col 5 +col1 + + +-- !query 18 +SHOW COLUMNS IN showcolumn4 FROM global_temp +-- !query 18 schema +struct +-- !query 18 output +col 5 +col1 + + +-- !query 19 +DROP TABLE showcolumn1 +-- !query 19 schema +struct<> +-- !query 19 output + + + +-- !query 20 +DROP TABLE showColumn2 +-- !query 20 schema +struct<> +-- !query 20 output + + + +-- !query 21 +DROP VIEW showcolumn3 +-- !query 21 schema +struct<> +-- !query 21 output + + + +-- !query 22 +DROP VIEW global_temp.showcolumn4 +-- !query 22 schema +struct<> +-- !query 22 output + + + +-- !query 23 +use default +-- !query 23 schema +struct<> +-- !query 23 output + + + +-- !query 24 +DROP DATABASE showdb +-- !query 24 schema +struct<> +-- !query 24 output + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 02841d7bb03ff..6857dd37286dd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.util.{fileToString, stringToFile} +import org.apache.spark.sql.execution.command.ShowColumnsCommand import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.StructType diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index a3dbc9234f2f3..d31e7aeb3a78a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -824,22 +824,24 @@ class DDLCommandSuite extends PlanTest { val sql1 = "SHOW COLUMNS FROM t1" val sql2 = "SHOW COLUMNS IN db1.t1" val sql3 = "SHOW COLUMNS FROM t1 IN db1" - val sql4 = "SHOW COLUMNS FROM db1.t1 IN db1" - val sql5 = "SHOW COLUMNS FROM db1.t1 IN db2" + val sql4 = "SHOW COLUMNS FROM db1.t1 IN db2" val parsed1 = parser.parsePlan(sql1) - val expected1 = ShowColumnsCommand(TableIdentifier("t1", None)) + val expected1 = ShowColumnsCommand(None, TableIdentifier("t1", None)) val parsed2 = parser.parsePlan(sql2) - val expected2 = ShowColumnsCommand(TableIdentifier("t1", Some("db1"))) + val expected2 = ShowColumnsCommand(None, TableIdentifier("t1", Some("db1"))) val parsed3 = parser.parsePlan(sql3) - val parsed4 = parser.parsePlan(sql3) + val expected3 = ShowColumnsCommand(Some("db1"), TableIdentifier("t1", None)) + val parsed4 = parser.parsePlan(sql4) + val expected4 = ShowColumnsCommand(Some("db2"), TableIdentifier("t1", Some("db1"))) + comparePlans(parsed1, expected1) comparePlans(parsed2, expected2) - comparePlans(parsed3, expected2) - comparePlans(parsed4, expected2) - assertUnsupported(sql5) + comparePlans(parsed3, expected3) + comparePlans(parsed4, expected4) } + test("show partitions") { val sql1 = "SHOW PARTITIONS t1" val sql2 = "SHOW PARTITIONS db1.t1" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index c8b8e9ebabc75..a6da8a86c1623 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -1749,4 +1749,21 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { assert(sql("show user functions").count() === 1L) } } + + test("show columns - negative test") { + // When case sensitivity is true, the user supplied database name in table identifier + // should match the supplied database name in case sensitive way. 
+ withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + withTempDatabase { db => + val tabName = s"$db.showcolumn" + withTable(tabName) { + sql(s"CREATE TABLE $tabName(col1 int, col2 string) USING parquet ") + val message = intercept[AnalysisException] { + sql(s"SHOW COLUMNS IN $db.showcolumn FROM ${db.toUpperCase}") + }.getMessage + assert(message.contains("SHOW COLUMNS with conflicting databases")) + } + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala index 2c772ce2155ef..ad1e9b17a9f71 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.StructType @@ -336,28 +337,6 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } - test("show columns") { - checkAnswer( - sql("SHOW COLUMNS IN parquet_tab3"), - Row("col1") :: Row("col 2") :: Nil) - - checkAnswer( - sql("SHOW COLUMNS IN default.parquet_tab3"), - Row("col1") :: Row("col 2") :: Nil) - - checkAnswer( - sql("SHOW COLUMNS IN parquet_tab3 FROM default"), - Row("col1") :: Row("col 2") :: Nil) - - checkAnswer( - sql("SHOW COLUMNS IN parquet_tab4 IN default"), - Row("price") :: Row("qty") :: Row("year") :: Row("month") :: Nil) - - val message = intercept[NoSuchTableException] { - sql("SHOW COLUMNS IN badtable FROM default") - }.getMessage - assert(message.contains("'badtable' not found in database")) - } test("show partitions - show everything") { checkAnswer( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index 80e75aa898c38..13ceed7c79e35 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -167,7 +167,7 @@ abstract class HiveComparisonTest // and does not return it as a query answer. case _: SetCommand => Seq("0") case _: ExplainCommand => answer - case _: DescribeTableCommand | ShowColumnsCommand(_) => + case _: DescribeTableCommand | ShowColumnsCommand(_, _) => // Filter out non-deterministic lines and lines which do not have actual results but // can introduce problems because of the way Hive formats these lines. // Then, remove empty lines. Do not sort the results. From fb0894b3a87331a731129ad3fc7ebe598d90a6ee Mon Sep 17 00:00:00 2001 From: Tejas Patil Date: Thu, 20 Oct 2016 09:50:55 -0700 Subject: [PATCH 067/162] [SPARK-17698][SQL] Join predicates should not contain filter clauses ## What changes were proposed in this pull request? Jira : https://issues.apache.org/jira/browse/SPARK-17698 `ExtractEquiJoinKeys` is incorrectly using filter predicates as the join condition for joins. `canEvaluate` [0] tries to see if the an `Expression` can be evaluated using output of a given `Plan`. In case of filter predicates (eg. 
`a.id='1'`), the `Expression` passed for the right hand side (i.e. '1') is a `Literal`, which does not have any attribute references. Thus `expr.references` is an empty set, which is trivially a subset of any set. This leads to `canEvaluate` returning `true`, so `a.id='1'` is treated as a join predicate. While this does not lead to incorrect results, in the case of bucketed + sorted tables we might miss out on avoiding an unnecessary shuffle + sort. See the example below.

[0] : https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala#L91

eg.
```
val df = (1 until 10).toDF("id").coalesce(1)
hc.sql("DROP TABLE IF EXISTS table1").collect
df.write.bucketBy(8, "id").sortBy("id").saveAsTable("table1")
hc.sql("DROP TABLE IF EXISTS table2").collect
df.write.bucketBy(8, "id").sortBy("id").saveAsTable("table2")
sqlContext.sql("""
SELECT a.id, b.id
FROM table1 a
FULL OUTER JOIN table2 b
ON a.id = b.id AND a.id='1' AND b.id='1'
""").explain(true)
```

BEFORE: This is doing shuffle + sort over table scan outputs, which is not needed as both tables are bucketed and sorted on the same columns and have the same number of buckets. This should be a single stage job.
```
SortMergeJoin [id#38, cast(id#38 as double), 1.0], [id#39, 1.0, cast(id#39 as double)], FullOuter
:- *Sort [id#38 ASC NULLS FIRST, cast(id#38 as double) ASC NULLS FIRST, 1.0 ASC NULLS FIRST], false, 0
: +- Exchange hashpartitioning(id#38, cast(id#38 as double), 1.0, 200)
: +- *FileScan parquet default.table1[id#38] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table1, PartitionFilters: [], PushedFilters: [], ReadSchema: struct
+- *Sort [id#39 ASC NULLS FIRST, 1.0 ASC NULLS FIRST, cast(id#39 as double) ASC NULLS FIRST], false, 0
+- Exchange hashpartitioning(id#39, 1.0, cast(id#39 as double), 200)
+- *FileScan parquet default.table2[id#39] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table2, PartitionFilters: [], PushedFilters: [], ReadSchema: struct
```

AFTER:
```
SortMergeJoin [id#32], [id#33], FullOuter, ((cast(id#32 as double) = 1.0) && (cast(id#33 as double) = 1.0))
:- *FileScan parquet default.table1[id#32] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table1, PartitionFilters: [], PushedFilters: [], ReadSchema: struct
+- *FileScan parquet default.table2[id#33] Batched: true, Format: ParquetFormat, InputPaths: file:spark-warehouse/table2, PartitionFilters: [], PushedFilters: [], ReadSchema: struct
```

## How was this patch tested?

- Added a new test case for this scenario: `SPARK-17698 Join predicates should not contain filter clauses`
- Ran all the tests in `BucketedReadSuite`

Author: Tejas Patil

Closes #15272 from tejasapatil/SPARK-17698_join_predicate_filter_clause.
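For readers who want to see the `canEvaluate` reasoning in isolation, here is a minimal REPL-style sketch with simplified stand-in types (these are not the real Catalyst `Expression`/`LogicalPlan` classes). It shows why a reference-free literal always passes the subset check and how a guard on non-empty references keeps such predicates out of the join keys:

```
// Simplified stand-ins for Catalyst classes -- illustrative only, not the real API.
case class Attr(name: String)

sealed trait Expr { def references: Set[Attr] }
case class AttrRef(attr: Attr) extends Expr { def references: Set[Attr] = Set(attr) }
case class Lit(value: Any) extends Expr { def references: Set[Attr] = Set.empty }
case class Eq(l: Expr, r: Expr) extends Expr { def references: Set[Attr] = l.references ++ r.references }

case class Plan(outputSet: Set[Attr])

// Mirrors PredicateHelper.canEvaluate: true iff every referenced attribute comes from `plan`.
def canEvaluate(expr: Expr, plan: Plan): Boolean = expr.references.subsetOf(plan.outputSet)

val aId = Attr("a.id")
val bId = Attr("b.id")
val left = Plan(Set(aId))
val right = Plan(Set(bId))

// The literal side of a.id = '1' has no references, so the old extraction saw
// canEvaluate(Lit("1"), right) == true and kept the predicate as an equi-join key.
assert(canEvaluate(Lit("1"), right))   // the empty set is a subset of anything

// The fix: skip equality predicates where either side is reference-free.
def isJoinKey(e: Expr): Boolean = e match {
  case Eq(l, r) if l.references.isEmpty || r.references.isEmpty => false
  case Eq(l, r) =>
    (canEvaluate(l, left) && canEvaluate(r, right)) || (canEvaluate(l, right) && canEvaluate(r, left))
  case _ => false
}

assert(isJoinKey(Eq(AttrRef(aId), AttrRef(bId))))   // a.id = b.id  -> join key
assert(!isJoinKey(Eq(AttrRef(aId), Lit("1"))))      // a.id = '1'   -> stays a filter clause
```

The actual change below applies the same `references.isEmpty` / `references.nonEmpty` guards inside `ExtractEquiJoinKeys` and `ReorderJoin`.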
--- .../sql/catalyst/expressions/predicates.scala | 5 +- .../spark/sql/catalyst/optimizer/joins.scala | 4 +- .../sql/catalyst/planning/patterns.scala | 2 + .../spark/sql/sources/BucketedReadSuite.scala | 124 ++++++++++++++---- 4 files changed, 109 insertions(+), 26 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 799858a6865e5..9394e39aadd9d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -84,8 +84,9 @@ trait PredicateHelper { * * For example consider a join between two relations R(a, b) and S(c, d). * - * `canEvaluate(EqualTo(a,b), R)` returns `true` where as `canEvaluate(EqualTo(a,c), R)` returns - * `false`. + * - `canEvaluate(EqualTo(a,b), R)` returns `true` + * - `canEvaluate(EqualTo(a,c), R)` returns `false` + * - `canEvaluate(Literal(1), R)` returns `true` as literals CAN be evaluated on any plan */ protected def canEvaluate(expr: Expression, plan: LogicalPlan): Boolean = expr.references.subsetOf(plan.outputSet) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala index 2626057e492ef..180ad2e0ad1fa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala @@ -65,7 +65,9 @@ object ReorderJoin extends Rule[LogicalPlan] with PredicateHelper { val conditionalJoin = rest.find { planJoinPair => val plan = planJoinPair._1 val refs = left.outputSet ++ plan.outputSet - conditions.filterNot(canEvaluate(_, left)).filterNot(canEvaluate(_, plan)) + conditions + .filterNot(l => l.references.nonEmpty && canEvaluate(l, left)) + .filterNot(r => r.references.nonEmpty && canEvaluate(r, plan)) .exists(_.references.subsetOf(refs)) } // pick the next one if no condition left diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala index bdae56881bf46..c5f92c59c88f4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala @@ -112,6 +112,7 @@ object ExtractEquiJoinKeys extends Logging with PredicateHelper { // as join keys. 
val predicates = condition.map(splitConjunctivePredicates).getOrElse(Nil) val joinKeys = predicates.flatMap { + case EqualTo(l, r) if l.references.isEmpty || r.references.isEmpty => None case EqualTo(l, r) if canEvaluate(l, left) && canEvaluate(r, right) => Some((l, r)) case EqualTo(l, r) if canEvaluate(l, right) && canEvaluate(r, left) => Some((r, l)) // Replace null with default value for joining key, then those rows with null in it could @@ -125,6 +126,7 @@ object ExtractEquiJoinKeys extends Logging with PredicateHelper { case other => None } val otherPredicates = predicates.filterNot { + case EqualTo(l, r) if l.references.isEmpty || r.references.isEmpty => false case EqualTo(l, r) => canEvaluate(l, left) && canEvaluate(r, right) || canEvaluate(l, right) && canEvaluate(r, left) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index 3ff85176de10e..9ed454e578d69 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -235,7 +235,8 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet private def testBucketing( bucketSpecLeft: Option[BucketSpec], bucketSpecRight: Option[BucketSpec], - joinColumns: Seq[String], + joinType: String = "inner", + joinCondition: (DataFrame, DataFrame) => Column, shuffleLeft: Boolean, shuffleRight: Boolean, sortLeft: Boolean = true, @@ -268,12 +269,12 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { val t1 = spark.table("bucketed_table1") val t2 = spark.table("bucketed_table2") - val joined = t1.join(t2, joinCondition(t1, t2, joinColumns)) + val joined = t1.join(t2, joinCondition(t1, t2), joinType) // First check the result is corrected. 
checkAnswer( joined.sort("bucketed_table1.k", "bucketed_table2.k"), - df1.join(df2, joinCondition(df1, df2, joinColumns)).sort("df1.k", "df2.k")) + df1.join(df2, joinCondition(df1, df2), joinType).sort("df1.k", "df2.k")) assert(joined.queryExecution.executedPlan.isInstanceOf[SortMergeJoinExec]) val joinOperator = joined.queryExecution.executedPlan.asInstanceOf[SortMergeJoinExec] @@ -297,56 +298,102 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet } } - private def joinCondition(left: DataFrame, right: DataFrame, joinCols: Seq[String]): Column = { + private def joinCondition(joinCols: Seq[String]) (left: DataFrame, right: DataFrame): Column = { joinCols.map(col => left(col) === right(col)).reduce(_ && _) } test("avoid shuffle when join 2 bucketed tables") { val bucketSpec = Some(BucketSpec(8, Seq("i", "j"), Nil)) - testBucketing(bucketSpec, bucketSpec, Seq("i", "j"), shuffleLeft = false, shuffleRight = false) + testBucketing( + bucketSpecLeft = bucketSpec, + bucketSpecRight = bucketSpec, + joinCondition = joinCondition(Seq("i", "j")), + shuffleLeft = false, + shuffleRight = false + ) } // Enable it after fix https://issues.apache.org/jira/browse/SPARK-12704 ignore("avoid shuffle when join keys are a super-set of bucket keys") { val bucketSpec = Some(BucketSpec(8, Seq("i"), Nil)) - testBucketing(bucketSpec, bucketSpec, Seq("i", "j"), shuffleLeft = false, shuffleRight = false) + testBucketing( + bucketSpecLeft = bucketSpec, + bucketSpecRight = bucketSpec, + joinCondition = joinCondition(Seq("i", "j")), + shuffleLeft = false, + shuffleRight = false + ) } test("only shuffle one side when join bucketed table and non-bucketed table") { val bucketSpec = Some(BucketSpec(8, Seq("i", "j"), Nil)) - testBucketing(bucketSpec, None, Seq("i", "j"), shuffleLeft = false, shuffleRight = true) + testBucketing( + bucketSpecLeft = bucketSpec, + bucketSpecRight = None, + joinCondition = joinCondition(Seq("i", "j")), + shuffleLeft = false, + shuffleRight = true + ) } test("only shuffle one side when 2 bucketed tables have different bucket number") { val bucketSpec1 = Some(BucketSpec(8, Seq("i", "j"), Nil)) val bucketSpec2 = Some(BucketSpec(5, Seq("i", "j"), Nil)) - testBucketing(bucketSpec1, bucketSpec2, Seq("i", "j"), shuffleLeft = false, shuffleRight = true) + testBucketing( + bucketSpecLeft = bucketSpec1, + bucketSpecRight = bucketSpec2, + joinCondition = joinCondition(Seq("i", "j")), + shuffleLeft = false, + shuffleRight = true + ) } test("only shuffle one side when 2 bucketed tables have different bucket keys") { val bucketSpec1 = Some(BucketSpec(8, Seq("i"), Nil)) val bucketSpec2 = Some(BucketSpec(8, Seq("j"), Nil)) - testBucketing(bucketSpec1, bucketSpec2, Seq("i"), shuffleLeft = false, shuffleRight = true) + testBucketing( + bucketSpecLeft = bucketSpec1, + bucketSpecRight = bucketSpec2, + joinCondition = joinCondition(Seq("i")), + shuffleLeft = false, + shuffleRight = true + ) } test("shuffle when join keys are not equal to bucket keys") { val bucketSpec = Some(BucketSpec(8, Seq("i"), Nil)) - testBucketing(bucketSpec, bucketSpec, Seq("j"), shuffleLeft = true, shuffleRight = true) + testBucketing( + bucketSpecLeft = bucketSpec, + bucketSpecRight = bucketSpec, + joinCondition = joinCondition(Seq("j")), + shuffleLeft = true, + shuffleRight = true + ) } test("shuffle when join 2 bucketed tables with bucketing disabled") { val bucketSpec = Some(BucketSpec(8, Seq("i", "j"), Nil)) withSQLConf(SQLConf.BUCKETING_ENABLED.key -> "false") { - testBucketing(bucketSpec, bucketSpec, 
Seq("i", "j"), shuffleLeft = true, shuffleRight = true) + testBucketing( + bucketSpecLeft = bucketSpec, + bucketSpecRight = bucketSpec, + joinCondition = joinCondition(Seq("i", "j")), + shuffleLeft = true, + shuffleRight = true + ) } } test("avoid shuffle and sort when bucket and sort columns are join keys") { val bucketSpec = Some(BucketSpec(8, Seq("i", "j"), Seq("i", "j"))) testBucketing( - bucketSpec, bucketSpec, Seq("i", "j"), - shuffleLeft = false, shuffleRight = false, - sortLeft = false, sortRight = false + bucketSpecLeft = bucketSpec, + bucketSpecRight = bucketSpec, + joinCondition = joinCondition(Seq("i", "j")), + shuffleLeft = false, + shuffleRight = false, + sortLeft = false, + sortRight = false ) } @@ -354,9 +401,13 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet val bucketSpec1 = Some(BucketSpec(8, Seq("i"), Seq("i", "j"))) val bucketSpec2 = Some(BucketSpec(8, Seq("i"), Seq("i", "k"))) testBucketing( - bucketSpec1, bucketSpec2, Seq("i"), - shuffleLeft = false, shuffleRight = false, - sortLeft = false, sortRight = false + bucketSpecLeft = bucketSpec1, + bucketSpecRight = bucketSpec2, + joinCondition = joinCondition(Seq("i")), + shuffleLeft = false, + shuffleRight = false, + sortLeft = false, + sortRight = false ) } @@ -364,9 +415,13 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet val bucketSpec1 = Some(BucketSpec(8, Seq("i", "j"), Seq("i", "j"))) val bucketSpec2 = Some(BucketSpec(8, Seq("i", "j"), Seq("k"))) testBucketing( - bucketSpec1, bucketSpec2, Seq("i", "j"), - shuffleLeft = false, shuffleRight = false, - sortLeft = false, sortRight = true + bucketSpecLeft = bucketSpec1, + bucketSpecRight = bucketSpec2, + joinCondition = joinCondition(Seq("i", "j")), + shuffleLeft = false, + shuffleRight = false, + sortLeft = false, + sortRight = true ) } @@ -374,9 +429,13 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet val bucketSpec1 = Some(BucketSpec(8, Seq("i", "j"), Seq("i", "j"))) val bucketSpec2 = Some(BucketSpec(8, Seq("i", "j"), Seq("j", "i"))) testBucketing( - bucketSpec1, bucketSpec2, Seq("i", "j"), - shuffleLeft = false, shuffleRight = false, - sortLeft = false, sortRight = true + bucketSpecLeft = bucketSpec1, + bucketSpecRight = bucketSpec2, + joinCondition = joinCondition(Seq("i", "j")), + shuffleLeft = false, + shuffleRight = false, + sortLeft = false, + sortRight = true ) } @@ -408,6 +467,25 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet } } + test("SPARK-17698 Join predicates should not contain filter clauses") { + val bucketSpec = Some(BucketSpec(8, Seq("i"), Seq("i"))) + testBucketing( + bucketSpecLeft = bucketSpec, + bucketSpecRight = bucketSpec, + joinType = "fullouter", + joinCondition = (left: DataFrame, right: DataFrame) => { + val joinPredicates = Seq("i").map(col => left(col) === right(col)).reduce(_ && _) + val filterLeft = left("i") === Literal("1") + val filterRight = right("i") === Literal("1") + joinPredicates && filterLeft && filterRight + }, + shuffleLeft = false, + shuffleRight = false, + sortLeft = false, + sortRight = false + ) + } + test("error if there exists any malformed bucket files") { withTable("bucketed_table") { df1.write.format("parquet").bucketBy(8, "i").saveAsTable("bucketed_table") From 84b245f2dd31c1cebbf12458bf11f67e287e93f4 Mon Sep 17 00:00:00 2001 From: Koert Kuipers Date: Thu, 20 Oct 2016 10:08:12 -0700 Subject: [PATCH 068/162] [SPARK-15780][SQL] Support mapValues on KeyValueGroupedDataset ## 
What changes were proposed in this pull request? Add mapValues to KeyValueGroupedDataset ## How was this patch tested? New test in DatasetSuite for groupBy function, mapValues, flatMap Author: Koert Kuipers Closes #13526 from koertkuipers/feat-keyvaluegroupeddataset-mapvalues. --- .../sql/catalyst/plans/logical/object.scala | 13 ++++++ .../spark/sql/KeyValueGroupedDataset.scala | 42 +++++++++++++++++++ .../org/apache/spark/sql/DatasetSuite.scala | 11 +++++ 3 files changed, 66 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala index fefe5a3953a6e..0ab4c9016623e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala @@ -230,6 +230,19 @@ object AppendColumns { encoderFor[U].namedExpressions, child) } + + def apply[T : Encoder, U : Encoder]( + func: T => U, + inputAttributes: Seq[Attribute], + child: LogicalPlan): AppendColumns = { + new AppendColumns( + func.asInstanceOf[Any => Any], + implicitly[Encoder[T]].clsTag.runtimeClass, + implicitly[Encoder[T]].schema, + UnresolvedDeserializer(encoderFor[T].deserializer, inputAttributes), + encoderFor[U].namedExpressions, + child) + } } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 828eb94efe598..4cb0313aa9037 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -66,6 +66,48 @@ class KeyValueGroupedDataset[K, V] private[sql]( dataAttributes, groupingAttributes) + /** + * Returns a new [[KeyValueGroupedDataset]] where the given function `func` has been applied + * to the data. The grouping key is unchanged by this. + * + * {{{ + * // Create values grouped by key from a Dataset[(K, V)] + * ds.groupByKey(_._1).mapValues(_._2) // Scala + * }}} + * + * @since 2.1.0 + */ + def mapValues[W : Encoder](func: V => W): KeyValueGroupedDataset[K, W] = { + val withNewData = AppendColumns(func, dataAttributes, logicalPlan) + val projected = Project(withNewData.newColumns ++ groupingAttributes, withNewData) + val executed = sparkSession.sessionState.executePlan(projected) + + new KeyValueGroupedDataset( + encoderFor[K], + encoderFor[W], + executed, + withNewData.newColumns, + groupingAttributes) + } + + /** + * Returns a new [[KeyValueGroupedDataset]] where the given function `func` has been applied + * to the data. The grouping key is unchanged by this. + * + * {{{ + * // Create Integer values grouped by String key from a Dataset> + * Dataset> ds = ...; + * KeyValueGroupedDataset grouped = + * ds.groupByKey(t -> t._1, Encoders.STRING()).mapValues(t -> t._2, Encoders.INT()); // Java 8 + * }}} + * + * @since 2.1.0 + */ + def mapValues[W](func: MapFunction[V, W], encoder: Encoder[W]): KeyValueGroupedDataset[K, W] = { + implicit val uEnc = encoder + mapValues { (v: V) => func.call(v) } + } + /** * Returns a [[Dataset]] that contains each unique key. This is equivalent to doing mapping * over the Dataset to extract the keys and then running a distinct operation on those. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 5fce9b4fe97ea..cc367acae2ba4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -336,6 +336,17 @@ class DatasetSuite extends QueryTest with SharedSQLContext { "a", "30", "b", "3", "c", "1") } + test("groupBy function, mapValues, flatMap") { + val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() + val keyValue = ds.groupByKey(_._1).mapValues(_._2) + val agged = keyValue.mapGroups { case (g, iter) => (g, iter.sum) } + checkDataset(agged, ("a", 30), ("b", 3), ("c", 1)) + + val keyValue1 = ds.groupByKey(t => (t._1, "key")).mapValues(t => (t._2, "value")) + val agged1 = keyValue1.mapGroups { case (g, iter) => (g._1, iter.map(_._1).sum) } + checkDataset(agged, ("a", 30), ("b", 3), ("c", 1)) + } + test("groupBy function, reduce") { val ds = Seq("abc", "xyz", "hello").toDS() val agged = ds.groupByKey(_.length).reduceGroups(_ + _) From 947f4f25273161dc4719419a35613a71c2e2a150 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Thu, 20 Oct 2016 10:50:34 -0700 Subject: [PATCH 069/162] [SPARK-17999][KAFKA][SQL] Add getPreferredLocations for KafkaSourceRDD ## What changes were proposed in this pull request? The newly implemented Structured Streaming `KafkaSource` did calculate the preferred locations for each topic partition, but didn't offer this information through RDD's `getPreferredLocations` method. So here propose to add this method in `KafkaSourceRDD`. ## How was this patch tested? Manual verification. Author: jerryshao Closes #15545 from jerryshao/SPARK-17999. --- .../scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala index 496af7e39abab..802dd040aed93 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala @@ -112,6 +112,11 @@ private[kafka010] class KafkaSourceRDD( buf.toArray } + override def getPreferredLocations(split: Partition): Seq[String] = { + val part = split.asInstanceOf[KafkaSourceRDDPartition] + part.offsetRange.preferredLoc.map(Seq(_)).getOrElse(Seq.empty) + } + override def compute( thePart: Partition, context: TaskContext): Iterator[ConsumerRecord[Array[Byte], Array[Byte]]] = { From 7f9ec19eae60abe589ffd22259a9065e7e353a57 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 20 Oct 2016 12:18:56 -0700 Subject: [PATCH 070/162] [SPARK-18021][SQL] Refactor file name specification for data sources ## What changes were proposed in this pull request? Currently each data source OutputWriter is responsible for specifying the entire file name for each file output. This, however, does not make any sense because we rely on file naming schemes for certain behaviors in Spark SQL, e.g. bucket id. The current approach allows individual data sources to break the implementation of bucketing. On the flip side, we also don't want to move file naming entirely out of data sources, because different data sources do want to specify different extensions. 
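In rough terms (a simplified sketch follows; the next paragraph states the design precisely), the write path should dictate a mandatory file-name prefix covering the task split, the job UUID and an optional bucket id, while each writer contributes only its format-specific extension. The sketch below inlines the bucket-id formatting for illustration; in the patch itself the prefix is built by `filePrefix` in `WriteOutput` using `BucketingUtils.bucketIdToString`:

```
// Simplified sketch of the naming contract introduced here -- not the verbatim patch code.
object FileNamingSketch extends App {
  // The write path owns the prefix; centralising it is what gives every
  // file-based source bucketing support automatically.
  def filePrefix(split: Int, uuid: String, bucketId: Option[Int]): String = {
    val bucketString = bucketId.map(id => f"_$id%05d").getOrElse("")  // assumed bucket-id format
    f"part-r-$split%05d-$uuid$bucketString"
  }

  // Each OutputWriter appends only its own suffix, e.g. ".json", ".csv" or a compression + ".orc".
  val prefix = filePrefix(split = 5, uuid = "ea518ad4-455a-4431-b471-d24e03814677", bucketId = Some(2))
  println(prefix + ".snappy.parquet")
  // part-r-00005-ea518ad4-455a-4431-b471-d24e03814677_00002.snappy.parquet
}
```
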
This patch divides file name specification into two parts: the first part is a prefix specified by the caller of OutputWriter (in WriteOutput), and the second part is the suffix that can be specified by the OutputWriter itself. Note that a side effect of this change is that now all file based data sources also support bucketing automatically. There are also some other minor cleanups: - Removed the UUID passed through generic Configuration string - Some minor rewrites for better clarity - Renamed "path" in multiple places to "stagingDir", to more accurately reflect its meaning ## How was this patch tested? This should be covered by existing data source tests. Author: Reynold Xin Closes #15562 from rxin/SPARK-18021. --- .../ml/source/libsvm/LibSVMRelation.scala | 16 ++---- .../execution/datasources/OutputWriter.scala | 17 ++++-- .../execution/datasources/WriteOutput.scala | 56 +++++++++---------- .../datasources/csv/CSVRelation.scala | 18 +++--- .../datasources/json/JsonFileFormat.scala | 17 ++---- .../parquet/ParquetFileFormat.scala | 7 +-- .../parquet/ParquetOutputWriter.scala | 32 +++-------- .../datasources/text/TextFileFormat.scala | 21 +++---- .../spark/sql/hive/orc/OrcFileFormat.scala | 21 +++---- .../sql/sources/BucketedWriteSuite.scala | 5 -- .../sql/sources/CommitFailureTestSource.scala | 6 +- .../sql/sources/SimpleTextRelation.scala | 26 ++++----- 12 files changed, 99 insertions(+), 143 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index 8577803743c8e..fff86686b550c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -40,7 +40,8 @@ import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration private[libsvm] class LibSVMOutputWriter( - path: String, + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext) extends OutputWriter { @@ -50,11 +51,7 @@ private[libsvm] class LibSVMOutputWriter( private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) - val taskAttemptId = context.getTaskAttemptID - val split = taskAttemptId.getTaskID.getId - new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension") + new Path(stagingDir, fileNamePrefix + extension) } }.getRecordWriter(context) } @@ -132,12 +129,11 @@ private[libsvm] class LibSVMFileFormat extends TextBasedFileFormat with DataSour dataSchema: StructType): OutputWriterFactory = { new OutputWriterFactory { override def newInstance( - path: String, - bucketId: Option[Int], + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { - if (bucketId.isDefined) { sys.error("LibSVM doesn't support bucketing") } - new LibSVMOutputWriter(path, dataSchema, context) + new LibSVMOutputWriter(stagingDir, fileNamePrefix, dataSchema, context) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala index d2eec7b1413f8..f4cefdab077e9 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala @@ -34,18 +34,23 @@ abstract class OutputWriterFactory extends Serializable { * When writing to a [[HadoopFsRelation]], this method gets called by each task on executor side * to instantiate new [[OutputWriter]]s. * - * @param path Path of the file to which this [[OutputWriter]] is supposed to write. Note that - * this may not point to the final output file. For example, `FileOutputFormat` writes to - * temporary directories and then merge written files back to the final destination. In - * this case, `path` points to a temporary output file under the temporary directory. + * @param stagingDir Base path (directory) of the file to which this [[OutputWriter]] is supposed + * to write. Note that this may not point to the final output file. For + * example, `FileOutputFormat` writes to temporary directories and then merge + * written files back to the final destination. In this case, `path` points to + * a temporary output file under the temporary directory. + * @param fileNamePrefix Prefix of the file name. The returned OutputWriter must make sure this + * prefix is used in the actual file name. For example, if the prefix is + * "part-1-2-3", then the file name must start with "part_1_2_3" but can + * end in arbitrary extension. * @param dataSchema Schema of the rows to be written. Partition columns are not included in the * schema if the relation being written is partitioned. * @param context The Hadoop MapReduce task context. * @since 1.4.0 */ def newInstance( - path: String, - bucketId: Option[Int], // TODO: This doesn't belong here... + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteOutput.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteOutput.scala index 54d0f3bd6291a..bd56e511d0ccf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteOutput.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriteOutput.scala @@ -46,6 +46,7 @@ object WriteOutput extends Logging { /** A shared job description for all the write tasks. */ private class WriteJobDescription( + val uuid: String, // prevent collision between different (appending) write jobs val serializableHadoopConf: SerializableConfiguration, val outputWriterFactory: OutputWriterFactory, val allColumns: Seq[Attribute], @@ -102,6 +103,7 @@ object WriteOutput extends Logging { fileFormat.prepareWrite(sparkSession, job, options, dataColumns.toStructType) val description = new WriteJobDescription( + uuid = UUID.randomUUID().toString, serializableHadoopConf = new SerializableConfiguration(job.getConfiguration), outputWriterFactory = outputWriterFactory, allColumns = plan.output, @@ -213,6 +215,11 @@ object WriteOutput extends Logging { private trait ExecuteWriteTask { def execute(iterator: Iterator[InternalRow]): Unit def releaseResources(): Unit + + final def filePrefix(split: Int, uuid: String, bucketId: Option[Int]): String = { + val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") + f"part-r-$split%05d-$uuid$bucketString" + } } /** Writes data to a single directory (used for non-dynamic-partition writes). 
*/ @@ -222,9 +229,11 @@ object WriteOutput extends Logging { stagingPath: String) extends ExecuteWriteTask { private[this] var outputWriter: OutputWriter = { + val split = taskAttemptContext.getTaskAttemptID.getTaskID.getId + val outputWriter = description.outputWriterFactory.newInstance( - path = stagingPath, - bucketId = None, + stagingDir = stagingPath, + fileNamePrefix = filePrefix(split, description.uuid, None), dataSchema = description.nonPartitionColumns.toStructType, context = taskAttemptContext) outputWriter.initConverter(dataSchema = description.nonPartitionColumns.toStructType) @@ -287,29 +296,31 @@ object WriteOutput extends Logging { } } - private def getBucketIdFromKey(key: InternalRow): Option[Int] = - description.bucketSpec.map { _ => key.getInt(description.partitionColumns.length) } - /** * Open and returns a new OutputWriter given a partition key and optional bucket id. * If bucket id is specified, we will append it to the end of the file name, but before the * file extension, e.g. part-r-00009-ea518ad4-455a-4431-b471-d24e03814677-00002.gz.parquet */ - private def newOutputWriter( - key: InternalRow, - getPartitionString: UnsafeProjection): OutputWriter = { + private def newOutputWriter(key: InternalRow, partString: UnsafeProjection): OutputWriter = { val path = if (description.partitionColumns.nonEmpty) { - val partitionPath = getPartitionString(key).getString(0) + val partitionPath = partString(key).getString(0) new Path(stagingPath, partitionPath).toString } else { stagingPath } - val bucketId = getBucketIdFromKey(key) + // If the bucket spec is defined, the bucket column is right after the partition columns + val bucketId = if (description.bucketSpec.isDefined) { + Some(key.getInt(description.partitionColumns.length)) + } else { + None + } + + val split = taskAttemptContext.getTaskAttemptID.getTaskID.getId val newWriter = description.outputWriterFactory.newInstance( - path = path, - bucketId = bucketId, + stagingDir = path, + fileNamePrefix = filePrefix(split, description.uuid, bucketId), dataSchema = description.nonPartitionColumns.toStructType, context = taskAttemptContext) newWriter.initConverter(description.nonPartitionColumns.toStructType) @@ -319,7 +330,7 @@ object WriteOutput extends Logging { override def execute(iter: Iterator[InternalRow]): Unit = { // We should first sort by partition columns, then bucket id, and finally sorting columns. val sortingExpressions: Seq[Expression] = - description.partitionColumns ++ bucketIdExpression ++ sortColumns + description.partitionColumns ++ bucketIdExpression ++ sortColumns val getSortingKey = UnsafeProjection.create(sortingExpressions, description.allColumns) val sortingKeySchema = StructType(sortingExpressions.map { @@ -333,8 +344,8 @@ object WriteOutput extends Logging { description.nonPartitionColumns, description.allColumns) // Returns the partition path given a partition key. - val getPartitionString = - UnsafeProjection.create(Seq(Concat(partitionStringExpression)), description.partitionColumns) + val getPartitionString = UnsafeProjection.create( + Seq(Concat(partitionStringExpression)), description.partitionColumns) // Sorts the data before write, so that we only need one writer at the same time. 
val sorter = new UnsafeKVExternalSorter( @@ -405,17 +416,6 @@ object WriteOutput extends Logging { job.getConfiguration.setBoolean("mapred.task.is.map", true) job.getConfiguration.setInt("mapred.task.partition", 0) - // This UUID is sent to executor side together with the serialized `Configuration` object within - // the `Job` instance. `OutputWriters` on the executor side should use this UUID to generate - // unique task output files. - // This UUID is used to avoid output file name collision between different appending write jobs. - // These jobs may belong to different SparkContext instances. Concrete data source - // implementations may use this UUID to generate unique file names (e.g., - // `part-r--.parquet`). The reason why this ID is used to identify a job - // rather than a single task output file is that, speculative tasks must generate the same - // output file name as the original task. - job.getConfiguration.set(WriterContainer.DATASOURCE_WRITEJOBUUID, UUID.randomUUID().toString) - val taskAttemptContext = new TaskAttemptContextImpl(job.getConfiguration, taskAttemptId) val outputCommitter = newOutputCommitter( job.getOutputFormatClass, taskAttemptContext, path, isAppend) @@ -474,7 +474,3 @@ object WriteOutput extends Logging { } } } - -object WriterContainer { - val DATASOURCE_WRITEJOBUUID = "spark.sql.sources.writeJobUUID" -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala index 55cb26d6513af..eefacbf05ba0d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory, PartitionedFile, WriterContainer} +import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory, PartitionedFile} import org.apache.spark.sql.types._ object CSVRelation extends Logging { @@ -170,17 +170,17 @@ object CSVRelation extends Logging { private[csv] class CSVOutputWriterFactory(params: CSVOptions) extends OutputWriterFactory { override def newInstance( - path: String, - bucketId: Option[Int], + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { - if (bucketId.isDefined) sys.error("csv doesn't support bucketing") - new CsvOutputWriter(path, dataSchema, context, params) + new CsvOutputWriter(stagingDir, fileNamePrefix, dataSchema, context, params) } } private[csv] class CsvOutputWriter( - path: String, + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext, params: CSVOptions) extends OutputWriter with Logging { @@ -199,11 +199,7 @@ private[csv] class CsvOutputWriter( private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) - val taskAttemptId = context.getTaskAttemptID - val split = taskAttemptId.getTaskID.getId - new Path(path, 
f"part-r-$split%05d-$uniqueWriteJobId.csv$extension") + new Path(stagingDir, s"$fileNamePrefix.csv$extension") } }.getRecordWriter(context) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index 9fe38ccc9fdc6..cdbb2f7292613 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala @@ -82,11 +82,11 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister { new OutputWriterFactory { override def newInstance( - path: String, - bucketId: Option[Int], + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { - new JsonOutputWriter(path, parsedOptions, bucketId, dataSchema, context) + new JsonOutputWriter(stagingDir, parsedOptions, fileNamePrefix, dataSchema, context) } } } @@ -153,9 +153,9 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister { } private[json] class JsonOutputWriter( - path: String, + stagingDir: String, options: JSONOptions, - bucketId: Option[Int], + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext) extends OutputWriter with Logging { @@ -168,12 +168,7 @@ private[json] class JsonOutputWriter( private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) - val taskAttemptId = context.getTaskAttemptID - val split = taskAttemptId.getTaskID.getId - val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") - new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$bucketString.json$extension") + new Path(stagingDir, s"$fileNamePrefix.json$extension") } }.getRecordWriter(context) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 6faafed1e6290..87b944ba523ca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -27,7 +27,7 @@ import scala.util.{Failure, Try} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.mapreduce._ -import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit} +import org.apache.hadoop.mapreduce.lib.input.FileSplit import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.parquet.{Log => ApacheParquetLog} import org.apache.parquet.filter2.compat.FilterCompat @@ -45,7 +45,6 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.parser.LegacyTypeStringParser import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ @@ -134,10 +133,10 @@ class 
ParquetFileFormat new OutputWriterFactory { override def newInstance( path: String, - bucketId: Option[Int], + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { - new ParquetOutputWriter(path, bucketId, context) + new ParquetOutputWriter(path, fileNamePrefix, context) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala index f89ce05d82d90..39c199784cd6d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala @@ -26,7 +26,7 @@ import org.apache.parquet.hadoop.util.ContextUtil import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.datasources.{BucketingUtils, OutputWriter, OutputWriterFactory, WriterContainer} +import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration @@ -122,13 +122,12 @@ private[parquet] class ParquetOutputWriterFactory( } /** Disable the use of the older API. */ - def newInstance( + override def newInstance( path: String, - bucketId: Option[Int], + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { - throw new UnsupportedOperationException( - "this version of newInstance not supported for " + + throw new UnsupportedOperationException("this version of newInstance not supported for " + "ParquetOutputWriterFactory") } } @@ -136,33 +135,16 @@ private[parquet] class ParquetOutputWriterFactory( // NOTE: This class is instantiated and used on executor side only, no need to be serializable. private[parquet] class ParquetOutputWriter( - path: String, - bucketId: Option[Int], + stagingDir: String, + fileNamePrefix: String, context: TaskAttemptContext) extends OutputWriter { private val recordWriter: RecordWriter[Void, InternalRow] = { val outputFormat = { new ParquetOutputFormat[InternalRow]() { - // Here we override `getDefaultWorkFile` for two reasons: - // - // 1. To allow appending. We need to generate unique output file names to avoid - // overwriting existing files (either exist before the write job, or are just written - // by other tasks within the same write job). - // - // 2. To allow dynamic partitioning. Default `getDefaultWorkFile` uses - // `FileOutputCommitter.getWorkPath()`, which points to the base directory of all - // partitions in the case of dynamic partitioning. override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) - val taskAttemptId = context.getTaskAttemptID - val split = taskAttemptId.getTaskID.getId - val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") - // It has the `.parquet` extension at the end because (de)compression tools - // such as gunzip would not be able to decompress this as the compression - // is not applied on this whole file but on each "page" in Parquet format. 
- new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$bucketString$extension") + new Path(stagingDir, fileNamePrefix + extension) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala index 9f96667311015..6cd2351c5749a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala @@ -73,14 +73,11 @@ class TextFileFormat extends TextBasedFileFormat with DataSourceRegister { new OutputWriterFactory { override def newInstance( - path: String, - bucketId: Option[Int], + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { - if (bucketId.isDefined) { - throw new AnalysisException("Text doesn't support bucketing") - } - new TextOutputWriter(path, dataSchema, context) + new TextOutputWriter(stagingDir, fileNamePrefix, dataSchema, context) } } } @@ -124,7 +121,11 @@ class TextFileFormat extends TextBasedFileFormat with DataSourceRegister { } } -class TextOutputWriter(path: String, dataSchema: StructType, context: TaskAttemptContext) +class TextOutputWriter( + stagingDir: String, + fileNamePrefix: String, + dataSchema: StructType, + context: TaskAttemptContext) extends OutputWriter { private[this] val buffer = new Text() @@ -132,11 +133,7 @@ class TextOutputWriter(path: String, dataSchema: StructType, context: TaskAttemp private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) - val taskAttemptId = context.getTaskAttemptID - val split = taskAttemptId.getTaskID.getId - new Path(path, f"part-r-$split%05d-$uniqueWriteJobId.txt$extension") + new Path(stagingDir, s"$fileNamePrefix.txt$extension") } }.getRecordWriter(context) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index 1af3280e18a89..1ceacb458ae6e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -83,11 +83,11 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable new OutputWriterFactory { override def newInstance( - path: String, - bucketId: Option[Int], + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { - new OrcOutputWriter(path, bucketId, dataSchema, context) + new OrcOutputWriter(stagingDir, fileNamePrefix, dataSchema, context) } } } @@ -210,8 +210,8 @@ private[orc] class OrcSerializer(dataSchema: StructType, conf: Configuration) } private[orc] class OrcOutputWriter( - path: String, - bucketId: Option[Int], + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext) extends OutputWriter { @@ -226,10 +226,7 @@ private[orc] class OrcOutputWriter( private lazy val recordWriter: RecordWriter[NullWritable, Writable] = { recordWriterInstantiated = true - val uniqueWriteJobId = conf.get(WriterContainer.DATASOURCE_WRITEJOBUUID) - val taskAttemptId = 
context.getTaskAttemptID - val partition = taskAttemptId.getTaskID.getId - val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") + val compressionExtension = { val name = conf.get(OrcRelation.ORC_COMPRESSION) OrcRelation.extensionsForCompressionCodecNames.getOrElse(name, "") @@ -237,12 +234,12 @@ private[orc] class OrcOutputWriter( // It has the `.orc` extension at the end because (de)compression tools // such as gunzip would not be able to decompress this as the compression // is not applied on this whole file but on each "stream" in ORC format. - val filename = f"part-r-$partition%05d-$uniqueWriteJobId$bucketString$compressionExtension.orc" + val filename = s"$fileNamePrefix$compressionExtension.orc" new OrcOutputFormat().getRecordWriter( - new Path(path, filename).getFileSystem(conf), + new Path(stagingDir, filename).getFileSystem(conf), conf.asInstanceOf[JobConf], - new Path(path, filename).toString, + new Path(stagingDir, filename).toString, Reporter.NULL ).asInstanceOf[RecordWriter[NullWritable, Writable]] } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala index 997445114ba58..2eafe18b85844 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala @@ -54,11 +54,6 @@ class BucketedWriteSuite extends QueryTest with SQLTestUtils with TestHiveSingle intercept[AnalysisException](df.write.bucketBy(2, "i").sortBy("j").saveAsTable("tt")) } - test("write bucketed data to unsupported data source") { - val df = Seq(Tuple1("a"), Tuple1("b")).toDF("i") - intercept[SparkException](df.write.bucketBy(3, "i").format("text").saveAsTable("tt")) - } - test("write bucketed data using save()") { val df = Seq(1 -> "a", 2 -> "b").toDF("i", "j") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestSource.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestSource.scala index 5a8a7f0ab5d7b..d5044684020e2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestSource.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestSource.scala @@ -39,11 +39,11 @@ class CommitFailureTestSource extends SimpleTextSource { dataSchema: StructType): OutputWriterFactory = new OutputWriterFactory { override def newInstance( - path: String, - bucketId: Option[Int], + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { - new SimpleTextOutputWriter(path, context) { + new SimpleTextOutputWriter(stagingDir, fileNamePrefix, context) { var failed = false TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) => failed = true diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala index 906de6bbcbee5..9e13b217ec305 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala @@ -23,7 +23,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext} -import 
org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat} +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat import org.apache.spark.sql.{sources, Row, SparkSession} import org.apache.spark.sql.catalyst.{expressions, InternalRow} @@ -51,11 +51,11 @@ class SimpleTextSource extends TextBasedFileFormat with DataSourceRegister { SimpleTextRelation.lastHadoopConf = Option(job.getConfiguration) new OutputWriterFactory { override def newInstance( - path: String, - bucketId: Option[Int], + stagingDir: String, + fileNamePrefix: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { - new SimpleTextOutputWriter(path, context) + new SimpleTextOutputWriter(stagingDir, fileNamePrefix, context) } } } @@ -120,9 +120,11 @@ class SimpleTextSource extends TextBasedFileFormat with DataSourceRegister { } } -class SimpleTextOutputWriter(path: String, context: TaskAttemptContext) extends OutputWriter { +class SimpleTextOutputWriter( + stagingDir: String, fileNamePrefix: String, context: TaskAttemptContext) + extends OutputWriter { private val recordWriter: RecordWriter[NullWritable, Text] = - new AppendingTextOutputFormat(new Path(path)).getRecordWriter(context) + new AppendingTextOutputFormat(new Path(stagingDir), fileNamePrefix).getRecordWriter(context) override def write(row: Row): Unit = { val serialized = row.toSeq.map { v => @@ -136,19 +138,15 @@ class SimpleTextOutputWriter(path: String, context: TaskAttemptContext) extends } } -class AppendingTextOutputFormat(outputFile: Path) extends TextOutputFormat[NullWritable, Text] { - val numberFormat = NumberFormat.getInstance() +class AppendingTextOutputFormat(stagingDir: Path, fileNamePrefix: String) + extends TextOutputFormat[NullWritable, Text] { + val numberFormat = NumberFormat.getInstance() numberFormat.setMinimumIntegerDigits(5) numberFormat.setGroupingUsed(false) override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) - val taskAttemptId = context.getTaskAttemptID - val split = taskAttemptId.getTaskID.getId - val name = FileOutputFormat.getOutputName(context) - new Path(outputFile, s"$name-${numberFormat.format(split)}-$uniqueWriteJobId") + new Path(stagingDir, fileNamePrefix) } } From 2d14ab7e644b64ff911772e71f42653ba949cb07 Mon Sep 17 00:00:00 2001 From: Mark Grover Date: Thu, 20 Oct 2016 15:30:01 -0700 Subject: [PATCH 071/162] [DOCS] Update docs to not suggest to package Spark before running tests. ## What changes were proposed in this pull request? Update docs to not suggest to package Spark before running tests. ## How was this patch tested? Not creating a JIRA since this pretty small. We haven't had the need to run mvn package before mvn test since 1.6 at least, or so I am told. So, updating the docs to not be misguiding. Author: Mark Grover Closes #15572 from markgrover/doc_update. --- docs/building-spark.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/building-spark.md b/docs/building-spark.md index f5acee6b90059..ebe46a42a15c6 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -217,9 +217,8 @@ For help in setting up IntelliJ IDEA or Eclipse for Spark development, and troub Tests are run by default via the [ScalaTest Maven plugin](http://www.scalatest.org/user_guide/using_the_scalatest_maven_plugin). Note that tests should not be run as root or an admin user. 
-Some of the tests require Spark to be packaged first, so always run `mvn package` with `-DskipTests` the first time. The following is an example of a correct (build, test) sequence: +The following is an example of a command to run the tests: - ./build/mvn -Pyarn -Phadoop-2.3 -DskipTests -Phive -Phive-thriftserver clean package ./build/mvn -Pyarn -Phadoop-2.3 -Phive -Phive-thriftserver test The ScalaTest plugin also supports running only a specific Scala test suite as follows: @@ -233,9 +232,8 @@ or a Java test: ## Testing with SBT -Some of the tests require Spark to be packaged first, so always run `build/sbt package` the first time. The following is an example of a correct (build, test) sequence: +The following is an example of a command to run the tests: - ./build/sbt -Pyarn -Phadoop-2.3 -Phive -Phive-thriftserver package ./build/sbt -Pyarn -Phadoop-2.3 -Phive -Phive-thriftserver test To run only a specific test suite as follows: From 1bb99c4887e97ae5f55c8c2b392ba5ca72d6168b Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Thu, 20 Oct 2016 20:44:32 -0700 Subject: [PATCH 072/162] [SPARK-18030][TESTS] Adds more checks to collect more info about FileStreamSourceSuite failure ## What changes were proposed in this pull request? My hunch is `mkdirs` fails. Just add more checks to collect more info. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #15577 from zsxwing/SPARK-18030-debug. --- .../apache/spark/sql/streaming/FileStreamSourceSuite.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index aabdccaaf319d..b9e9da9a1ec53 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -664,7 +664,9 @@ class FileStreamSourceSuite extends FileStreamSourceTest { def createFile(content: String, src: File, tmp: File): Unit = { val tempFile = Utils.tempFileWith(new File(tmp, "text")) val finalFile = new File(src, tempFile.getName) - src.mkdirs() + require(!src.exists(), s"$src exists, dir: ${src.isDirectory}, file: ${src.isFile}") + require(src.mkdirs(), s"Cannot create $src") + require(src.isDirectory(), s"$src is not a directory") require(stringToFile(tempFile, content).renameTo(finalFile)) } From 3180272d2d49e440516085c0e4aebd5bad18bcad Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Thu, 20 Oct 2016 21:12:55 -0700 Subject: [PATCH 073/162] [SPARKR] fix warnings ## What changes were proposed in this pull request? Fix for a bunch of test warnings that were added recently. We need to investigate why warnings are not turning into errors. ``` Warnings ----------------------------------------------------------------------- 1. createDataFrame uses files for large objects (test_sparkSQL.R#215) - Use Sepal_Length instead of Sepal.Length as column name 2. createDataFrame uses files for large objects (test_sparkSQL.R#215) - Use Sepal_Width instead of Sepal.Width as column name 3. createDataFrame uses files for large objects (test_sparkSQL.R#215) - Use Petal_Length instead of Petal.Length as column name 4. createDataFrame uses files for large objects (test_sparkSQL.R#215) - Use Petal_Width instead of Petal.Width as column name Consider adding importFrom("utils", "object.size") to your NAMESPACE file. ``` ## How was this patch tested? 
unit tests Author: Felix Cheung Closes #15560 from felixcheung/rwarnings. --- R/pkg/NAMESPACE | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 267a38c21530b..5960c6206a6f1 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -3,7 +3,7 @@ importFrom("methods", "setGeneric", "setMethod", "setOldClass") importFrom("methods", "is", "new", "signature", "show") importFrom("stats", "gaussian", "setNames") -importFrom("utils", "download.file", "packageVersion", "untar") +importFrom("utils", "download.file", "object.size", "packageVersion", "untar") # Disable native libraries till we figure out how to package it # See SPARKR-7839 diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index af81d0586e0a6..1c806869e9fbe 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -212,7 +212,7 @@ test_that("createDataFrame uses files for large objects", { # To simulate a large file scenario, we set spark.r.maxAllocationLimit to a smaller value conf <- callJMethod(sparkSession, "conf") callJMethod(conf, "set", "spark.r.maxAllocationLimit", "100") - df <- createDataFrame(iris) + df <- suppressWarnings(createDataFrame(iris)) # Resetting the conf back to default value callJMethod(conf, "set", "spark.r.maxAllocationLimit", toString(.Machine$integer.max / 10)) From 57e97fcbd6fe62af4acd60896feeacfa21efc222 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 21 Oct 2016 12:27:53 +0800 Subject: [PATCH 074/162] [SPARK-18029][SQL] PruneFileSourcePartitions should not change the output of LogicalRelation ## What changes were proposed in this pull request? In `PruneFileSourcePartitions`, we will replace the `LogicalRelation` with a pruned one. However, this replacement may change the output of the `LogicalRelation` if it doesn't have `expectedOutputAttributes`. This PR fixes it. ## How was this patch tested? the new `PruneFileSourcePartitionsSuite` Author: Wenchen Fan Closes #15569 from cloud-fan/partition-bug. --- .../sql/catalyst/catalog/interface.scala | 4 +- .../PruneFileSourcePartitions.scala | 4 +- .../spark/sql/hive/HiveDataFrameSuite.scala | 7 +- .../sql/hive/HiveMetadataCacheSuite.scala | 3 +- .../PruneFileSourcePartitionsSuite.scala | 74 +++++++++++++++++++ 5 files changed, 85 insertions(+), 7 deletions(-) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 1a57a7707caa1..a97ed701c4207 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -102,8 +102,8 @@ case class CatalogTablePartition( * Given the partition schema, returns a row with that schema holding the partition values. 
*/ def toRow(partitionSchema: StructType): InternalRow = { - InternalRow.fromSeq(partitionSchema.map { case StructField(name, dataType, _, _) => - Cast(Literal(spec(name)), dataType).eval() + InternalRow.fromSeq(partitionSchema.map { field => + Cast(Literal(spec(field.name)), field.dataType).eval() }) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala index 29121a47d92d1..8689017c3ed75 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala @@ -59,7 +59,9 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] { val prunedFileCatalog = tableFileCatalog.filterPartitions(partitionKeyFilters.toSeq) val prunedFsRelation = fsRelation.copy(location = prunedFileCatalog)(sparkSession) - val prunedLogicalRelation = logicalRelation.copy(relation = prunedFsRelation) + val prunedLogicalRelation = logicalRelation.copy( + relation = prunedFsRelation, + expectedOutputAttributes = Some(logicalRelation.output)) // Keep partition-pruning predicates so that they are visible in physical planning val filterExpression = filters.reduceLeft(And) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala index f65e74de87a57..15523437a3404 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala @@ -20,9 +20,10 @@ package org.apache.spark.sql.hive import java.io.File import org.apache.spark.metrics.source.HiveCatalogMetrics +import org.apache.spark.sql.QueryTest import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.sql.QueryTest class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { test("table name with schema") { @@ -78,7 +79,7 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt } test("lazy partition pruning reads only necessary partition data") { - withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "true") { + withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_PRUNING.key -> "true") { withTable("test") { withTempDir { dir => setupPartitionedTable("test", dir) @@ -114,7 +115,7 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt } test("all partitions read and cached when filesource partition pruning is off") { - withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> "false") { + withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_PRUNING.key -> "false") { withTable("test") { withTempDir { dir => setupPartitionedTable("test", dir) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala index 2ca1cd4c07fdb..d290fe9962db2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala @@ -22,6 +22,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.sql.QueryTest import 
org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils /** @@ -62,7 +63,7 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi def testCaching(pruningEnabled: Boolean): Unit = { test(s"partitioned table is cached when partition pruning is $pruningEnabled") { - withSQLConf("spark.sql.hive.filesourcePartitionPruning" -> pruningEnabled.toString) { + withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_PRUNING.key -> pruningEnabled.toString) { withTable("test") { withTempDir { dir => spark.range(5).selectExpr("id", "id as f1", "id as f2").write diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala new file mode 100644 index 0000000000000..346ea0ca4367e --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.execution + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project} +import org.apache.spark.sql.catalyst.rules.RuleExecutor +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions, TableFileCatalog} +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types.StructType + +class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions) :: Nil + } + + test("PruneFileSourcePartitions should not change the output of LogicalRelation") { + withTable("test") { + withTempDir { dir => + sql( + s""" + |CREATE EXTERNAL TABLE test(i int) + |PARTITIONED BY (p int) + |STORED AS parquet + |LOCATION '${dir.getAbsolutePath}'""".stripMargin) + + val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test") + val tableFileCatalog = new TableFileCatalog( + spark, + tableMeta.database, + tableMeta.identifier.table, + Some(tableMeta.partitionSchema), + 0) + + val dataSchema = StructType(tableMeta.schema.filterNot { f => + tableMeta.partitionColumnNames.contains(f.name) + }) + val relation = HadoopFsRelation( + location = tableFileCatalog, + partitionSchema = tableMeta.partitionSchema, + dataSchema = dataSchema, + bucketSpec = None, + fileFormat = new ParquetFileFormat(), + options = Map.empty)(sparkSession = spark) + + val logicalRelation = LogicalRelation(relation, catalogTable = Some(tableMeta)) + val query = Project(Seq('i, 'p), Filter('p === 1, logicalRelation)).analyze + + val optimized = Optimize.execute(query) + assert(optimized.missingInput.isEmpty) + } + } + } +} From 595893d33a26c838c8c5c0c599fbee7fa61cbdff Mon Sep 17 00:00:00 2001 From: Jagadeesan Date: Fri, 21 Oct 2016 09:48:24 +0100 Subject: [PATCH 075/162] [SPARK-17960][PYSPARK][UPGRADE TO PY4J 0.10.4] ## What changes were proposed in this pull request? 1) Upgrade the Py4J version on the Java side 2) Update the py4j src zip file we bundle with Spark ## How was this patch tested? Existing doctests & unit tests pass Author: Jagadeesan Closes #15514 from jagadeesanas2/SPARK-17960. --- LICENSE | 2 +- bin/pyspark | 2 +- bin/pyspark2.cmd | 2 +- core/pom.xml | 2 +- .../apache/spark/api/python/PythonUtils.scala | 2 +- dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- python/docs/Makefile | 2 +- python/lib/py4j-0.10.3-src.zip | Bin 91275 -> 0 bytes python/lib/py4j-0.10.4-src.zip | Bin 0 -> 74096 bytes sbin/spark-config.sh | 2 +- .../org/apache/spark/deploy/yarn/Client.scala | 2 +- .../spark/deploy/yarn/YarnClusterSuite.scala | 2 +- 16 files changed, 14 insertions(+), 14 deletions(-) delete mode 100644 python/lib/py4j-0.10.3-src.zip create mode 100644 python/lib/py4j-0.10.4-src.zip diff --git a/LICENSE b/LICENSE index d68609cc28733..7950dd6ceb6db 100644 --- a/LICENSE +++ b/LICENSE @@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt. 
(New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf) (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net) (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net) - (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.3 - http://py4j.sourceforge.net/) + (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.4 - http://py4j.sourceforge.net/) (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/) (BSD licence) sbt and sbt-launch-lib.bash (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE) diff --git a/bin/pyspark b/bin/pyspark index 7590309b442ed..d6b3ab0a44321 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -57,7 +57,7 @@ export PYSPARK_PYTHON # Add the PySpark classes to the Python path: export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH" -export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.3-src.zip:$PYTHONPATH" +export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.4-src.zip:$PYTHONPATH" # Load the PySpark shell.py script when ./pyspark is used interactively: export OLD_PYTHONSTARTUP="$PYTHONSTARTUP" diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd index 1217a4f2f97a2..f211c0873ad2f 100644 --- a/bin/pyspark2.cmd +++ b/bin/pyspark2.cmd @@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" ( ) set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH% -set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.3-src.zip;%PYTHONPATH% +set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.4-src.zip;%PYTHONPATH% set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py diff --git a/core/pom.xml b/core/pom.xml index 205bbc588be09..eac99ab82a2e4 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -331,7 +331,7 @@ net.sf.py4j py4j - 0.10.3 + 0.10.4 org.apache.spark diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala index 701097ace8974..c4e55b5e89027 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala @@ -32,7 +32,7 @@ private[spark] object PythonUtils { val pythonPath = new ArrayBuffer[String] for (sparkHome <- sys.env.get("SPARK_HOME")) { pythonPath += Seq(sparkHome, "python", "lib", "pyspark.zip").mkString(File.separator) - pythonPath += Seq(sparkHome, "python", "lib", "py4j-0.10.3-src.zip").mkString(File.separator) + pythonPath += Seq(sparkHome, "python", "lib", "py4j-0.10.4-src.zip").mkString(File.separator) } pythonPath ++= SparkContext.jarOfObject(this) pythonPath.mkString(File.pathSeparator) diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index 525dcef5b7d99..99279a4ca8be9 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -140,7 +140,7 @@ parquet-jackson-1.8.1.jar pmml-model-1.2.15.jar pmml-schema-1.2.15.jar protobuf-java-2.5.0.jar -py4j-0.10.3.jar +py4j-0.10.4.jar pyrolite-4.13.jar scala-compiler-2.11.8.jar scala-library-2.11.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 562fe6461e753..f094b4a7e167a 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -147,7 +147,7 @@ parquet-jackson-1.8.1.jar pmml-model-1.2.15.jar pmml-schema-1.2.15.jar protobuf-java-2.5.0.jar -py4j-0.10.3.jar +py4j-0.10.4.jar 
pyrolite-4.13.jar scala-compiler-2.11.8.jar scala-library-2.11.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index 747521aa2a566..7f0ef98680a15 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -147,7 +147,7 @@ parquet-jackson-1.8.1.jar pmml-model-1.2.15.jar pmml-schema-1.2.15.jar protobuf-java-2.5.0.jar -py4j-0.10.3.jar +py4j-0.10.4.jar pyrolite-4.13.jar scala-compiler-2.11.8.jar scala-library-2.11.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index afd4502c59d33..4a27bf3deecb6 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -155,7 +155,7 @@ parquet-jackson-1.8.1.jar pmml-model-1.2.15.jar pmml-schema-1.2.15.jar protobuf-java-2.5.0.jar -py4j-0.10.3.jar +py4j-0.10.4.jar pyrolite-4.13.jar scala-compiler-2.11.8.jar scala-library-2.11.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 687b855b649d8..151670a8e23e4 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -156,7 +156,7 @@ parquet-jackson-1.8.1.jar pmml-model-1.2.15.jar pmml-schema-1.2.15.jar protobuf-java-2.5.0.jar -py4j-0.10.3.jar +py4j-0.10.4.jar pyrolite-4.13.jar scala-compiler-2.11.8.jar scala-library-2.11.8.jar diff --git a/python/docs/Makefile b/python/docs/Makefile index de86e97d862f0..5e4cfb8ab6fe3 100644 --- a/python/docs/Makefile +++ b/python/docs/Makefile @@ -7,7 +7,7 @@ SPHINXBUILD ?= sphinx-build PAPER ?= BUILDDIR ?= _build -export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.3-src.zip) +export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.4-src.zip) # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) diff --git a/python/lib/py4j-0.10.3-src.zip b/python/lib/py4j-0.10.3-src.zip deleted file mode 100644 index bc54f33af1515c0676bd831bc5a02f112b28e0a3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 91275 [base85-encoded binary patch data for the deleted py4j-0.10.3-src.zip omitted]
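For readers tracing why this upgrade touches so many files: the versioned zip name is hard-coded everywhere the bundled Py4J sources are put on PYTHONPATH (bin/pyspark, pyspark2.cmd, the docs Makefile, PythonUtils and the YARN client). Below is a minimal, self-contained Scala sketch of the same path assembly as the PythonUtils.sparkPythonPath change above; the object and main method are illustrative only, not Spark code.

```scala
import java.io.File

object Py4jPathSketch {
  // Mirrors the path assembly in PythonUtils.sparkPythonPath from the diff above:
  // the bundled py4j source zip is referenced by its exact versioned file name,
  // which is why every launcher script has to be updated along with the jar.
  def sparkPythonPath(sparkHome: String): String = {
    val entries = Seq(
      Seq(sparkHome, "python", "lib", "pyspark.zip").mkString(File.separator),
      Seq(sparkHome, "python", "lib", "py4j-0.10.4-src.zip").mkString(File.separator))
    entries.mkString(File.pathSeparator)
  }

  def main(args: Array[String]): Unit = {
    // On a Unix layout this prints:
    // /opt/spark/python/lib/pyspark.zip:/opt/spark/python/lib/py4j-0.10.4-src.zip
    println(sparkPythonPath("/opt/spark"))
  }
}
```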
zUDmFMLy423I&iz0Q@F=Yo6|l2{cZB?@@ekKiOZ_nLMIQ&u+!v+A1ZiEbacODzDmP) zkQ;?3=*^=05+}UH*!Z5*u?ftR48>897R*J?s#E7ebf!YK`V5K(f99&}m$}M=W+6uN z{lVpR#9GJxy;g?<6IK)8k-EUfarrXGby418N+VhggO&DbmJmF;ctJe!+g;WA(fH zSXpd2s{Z2Y;X(w=*+@yPqa~&v9hwuxj4l9L|&iicCAs>m`l!wyc+8p1j;}FNd z^gA2HG`MR(V!6a(36+P)GgdU)s_f3P4ri!wjD|1T%I3g9?6%4+ECbP2&4k-!k4Tn< z!>C*!SCXk>N`f_3^lj|?nTi!x74DO;2f%PN3sof1^gQD#2j_Eh_0`KH(J33QIul&t zSL!ab&8E>NdnAzEWMZtyWBFw;xoA!i1*A)kNoPIbR!tyZ=QVOk{G-W`Cm{<>A!4$F z^n5FP1g3Y26GiEZX3xh4B|ZItdusFJBPJ#9;u0YI->XOUQ49H2fXk1fwm;2UkDBtO zLZAvTSjx+&x5?C*7~G{W5o&yU%_Y5aV)6W&qi+S3e<;~U5iqu$`KBaXjVW>=OUIRq zZS|AvTcon&2G>5O#=`D5UJfoUJbj%O1l;{xd?xfdCbXL>ukbZeUhpH_ho=G)8Y(=q-@w5uZsh+c2*medGx_iE$!P)4f+>R8k zj^EDBA=5-u4%IEEQ?v1un9uz>ijS{!HzmD@!;o5tJ@GUY&2}GV#X)g?FW~{kSPXoR zI|qVjlF|?C3;b!*&VMFRYs-`h-w5FZ+)adkQa7mqzyYMWRr1q4b)04q{QAlnBTrS* zKeEMR-FPU88$y|+oahx{8_5>$_^CuLgd(Q`2vhg9ieaok*+?17kG<8duJFVR`p6Y{({%c!hLaxxf!Fe? zt*%2+OqQFr;FE?tz)os*5VeT8)9l_kv*6`98gGtR++|`EwMP4<+Nl$+*YpWNzQ}fB zD5%KEd0f)j`GaU-TPN#!-YboN@L6pkITxt0{|BCCn=fCE=<7d7=(-W88dWJAh)RN3!9g0vZhGZm}xyd)#S&9 zLHGBUmtUp+h8b4cmE^EnsW20u=>@$soJ@{9W5`F(R^xe)+8Xb&fUv?547J*g>omWd zpcz+a5Er@c!c7lg6whsuRXT1&iNO)UG@Qpa=cf40{QE7skbREqhfysAl3H|^D_i9x zo#O_)HcnIv>NEE~7&I~ZFVzVKeONS#p-q>&*_Kg(A=A7x zTNXko2RDeCs`*COeoUnZ2?+167yhKevLaVWHOmC-f|n2Oltk76JfrI70i+qk^_?@=!d#ZXlwpo$jL-wzMn zLu5rGM%R4>yFDXh_P5-!P#y`1F-Tf~hJerzoV2>nweWJA@F-HDa}bHAvMM1>z>f`- zs*csyPG(R(=951|EvTYvt%xc!%upQu;4%&sV<@#r?9PM(iL@e#>#G$fPfEhJc(N4Q zz4^w|7#Do?KX-!_SKM*LO=$XApICO62OTyzs#%bih>~$al~5zR_I zvrn^KsbB7`?t4JSBc0uJ3|}cE!yUn;cy@Q)h}5GrGpkF5zz>ua8>8(OK+j>~cZ z-rUu|LHGGU$CCGj=)mSS#Go_kTq5CzhK@{oD3$eXhj?c{PCHESHd5`J1M$c=}L@f>Y$K8}> zVej)3V{pJ8^0uE+$AuB$9*ydThyqmf{U-_VQtI`MS(u;;WHzJ-;9yr@-9yXBOBWfz z5IxuSh*E9IXLc`VW!cs51d-&hJcV^lr!tUy;wp)R?~sDhARc(Nu0Ou=Q`n3l7KX7} z+mfZm`j+y+1`I20QCENTWn-z9kop_6kA=Y98bAz>vX!@kX5l1wIDATZ9|@xEWtfp0-4Sjop|ZLN&) z=SA8CC4S@Avvz3>vBKoOsIal5GdWu^2?Unv>58T{E8kRrKWlVtKwb*`(~l|MW{jL4jouc83BuqnQ)E z!gldopyX^&YGaujh*9d;f`fiD(hU(S*i+|+3Y{ znkLT0&9{WC6r}sN^d^%w9r@c;y1bAiE2r93;gOX57HW)IinFqb9b5^FRT*Jeqe=wr z5xpwhuRqBv#O=se9SUOoW@r1rD_#l^nGb{4r=j5Jjv)7Q>R6y}`L|r2;_^h$UN^Cu z&$r>6Q10S-*NN{XJ(SC?#wya7i=8MMzdA~pY5c<P(nAUV3n`)=o= zrxY@|nQ);&TlbopI^b=rPQ2k}_TyDakb`6{j4QpFYSzcV1-#ycMzK0s7QDwoiPW?` zd#?bDn!JF7UXiJb1=eY8t4diNHSI*6;)l8A>-Q+kE6(ig)e7y>t{=bT)X=ww0pSV* zZ+$kr94_>!M@o5pJLh#Hr#0Q*4lB@nn-cGvC60)Zm=sVYCiP!_n5`wvr92bn0eFbR zuJZ``xFe_)1_za%22g)+5lEiEx~vx^Fd?3~W=x+jJ=PjE0|mPlP4pR2YU7~IG$Yx@ zHIFTSm|Kf_x_H~!j%=|k*2IR@g;;Xi>1g*HTl!;DTzs&tt(R&i=5qUv}uC68;h~STp}am(OH60w2Ve{Zmk7B zDxLjw(0(&FXK@$hnGJ}@`7G)-!Lxhl>_bz@+~rk=r;}x84Sn!}fMB0L5m8}J=koEG zIVUrJp>(|I%R%^^6Fpf@GW#oggMOdHUi6;%HWaMM0B|mH~_Hvv{$fBd%U8dx8nhfQ-bwZVNH|7J~6mq|XGmnivO72e~@RSLMK8n&Mi|$nk+(=)mug=gC zQZA0P14EuQ?Bc8%fC^ts>wXdn{9x;}Rwdd;yF?aQlbG|!_2d5VkHPkDaU7F#L$BX} z&j32spssTpo^>~d8(sPw5+6X#wb#k?+`~~AVvD(kfZnjZg59w|)(2jWzMgKTZuU?L zCFVV~{|OD~{h#KE6+irt4~Ut;i91BBJKCS>r@DU#-^7U~yP!=M$OHG!e)J`f4a(Hv3G3TVYIrN>|q1?cr%7T?&xC zc#!M3&O9!Uy0)mO8FK!*3k*RM_aj(Og`IUX-^7wy&A49+aBEBtCTx4bSNHFXPr<8O(@nw7wlaKUpQ=yZnTiHlp}?z5I?Y zcBamYdLSDfaP4DK9$-o?e(o?vX-~CQoX|5JuJD@Pjnf7{LpmW}?@$+0Q&nHG`VMou z6i3zAm(}jtXR3L7O~%E6q*h3jo2jX(sot-ccro+!Xr4)90?+(~XdPegLFR0u&&H;Q zuv<*R*!Sj_*7bFDZ5$I!zl%yy$}tcf?9o`Ku%Gk-d|u8BHR}Uj5UqNbW+Fba+~Ta* zYX^2so+Mn&dC4HCKrfqwgX5w(T!nB@<2U_Vt!fdQJJvqGj*CC^W#`^5hKU{btpKP! 
zGTrdPau#;-6Xpl$MFK{2eo=*oKY=cRAJNfPD5m40K1N?=T!;ZB-UFs-EvaL1I zPb1}CwMtDd?9&YcY$0lgRFYxSKbPD`q>~gBrf1Xw+Obpn8!k0)Yk;5jE-SADF)#32 zigR3%72HT53QAr*#;e?I@iC1R%>8(m2-jykGTpr#KM9FU00XbD9}iDjvd|MowH_1^ zr5o$t{U0!?&nbIz4$c_((ytDlFwz7YFn&}&!WwY}#wDaUrV$TCEb})%g+@O+Q3S^9 zBXOm%2wyc(^V1omi)ozPIpC!AmGm6uah4Eu^S1t=*bnAc6PTR7Q+;|67jQe{jUUbY zjxivm((dELBCa_uxpxzoo=TK}tg6ERc1TL;t4nr4rpyZ+I62Q@JIQz<$c<(f+ zf^uweHUpR`Uc|9fcT@jOSj9}1f~3_D8zII@-Sbc2JcJ$)>Za3nuHZPbq*_#^sKpL9Li z)|&OX3wG~AUHXC%*Mm|%sf(w?7zOvj3}p5onA7JMb|d2bKhr2NLheZ^b~5r`uPAtp zl5FRjo35KEtn~L!Vq_g+6DPQH<)Tja!!L@D`1E-f!Kv_CAKi*!Uu z<HXyXP78qfbLJmHyDD*2l`asYoAa7;Ms4Ao z9n+orOWXE%uV?SheFOia!Rq;hS3rse1mvdi?;5Oxe=}J8fAk;!qnoV%7{dN@_G)Zz z>tOhIqs1omExQFKwC))dJQvPWRgy*GU>Z+m;vEp^0Kq*FxwwSP%5CKz4O#Ra%=-SA z`>WIT3odZK^%tkzy6^Y*YqRLyjC?Q>aPBRKnv>Y%pAe8%7u-23D}J;;>@ie_VF zz>{{kN8nI4>%8Oz1CM}}dQ*lk3jJMFK~b@FMP1OqaMh8^IUKYA@Ptd(21b+-T@&%N zZDCx~v?|n*#yjuIG{&fkVpuA6s+F^}fZxetrc6(fL>6hfCnzcld}reCsb(dyYpgEB zJwJ$`u9Pf!Bqzlf46X6=nMDJfus{Vk&uLffh;?gr9vxAEd~`$WJzP-9>AD_WUg!j3 zw|qrF9Uz3a>wsXRNOK@rXeSAcXB>Mv16v$+K~0utcJ#xvbV5I=3a6n*|Jo5k7+3R5 zX|NIoG(n$pcSUCwnbJ^H$9ZbJ%NcYvp~q`?KTZtBGDWRZT}>8*L6>#8898@YMh+9H zgtJ@Y7I-}-IhI;Ho)=l_pc0(OXCQTBBTP;KR9fA5q zC^_k#I+3+^YQyJ}xL_piXxMLZ#h@mBL%gQ$8k`BoYxHuJxzhymrEM63;PB(DNQX2g z=D<(63w(v*g8YLH9`UUSrs%^pZG5o`)kV1{`d7V1Uv57^I<*2}X>>PY5l8-7!$aT? z*M(^lMvp${@mHf)1yAQebDFy{sBe|rF$1fGK3c)ukww?-LZSMS^mj>r^$9mt`ae-k zHU`r_iqrCp*2_dzT%v|am|gmAoa^uww;fmVWbG_`=bj`@l`-^cEpV(HbY5E^J{dxD zR4_BcZCX?T+Duh_h;{;3!aa?@EqCNg+FJteBfJ?IB82%*d}9@;$P59N;Y9(FyV(~l zh_Z9RR`3z$Y^gnIjf8tru-JxfqW>oKfY`#X>mq)dP%qRn5aF*Z_p{F2!No$3| zBF>w-(pBn3<?2Q+6N*_GNeInK7QsxT-2~ zdMDjdC$HBNvZo$idN@j?}}F)@&nb7mucZ@#Ky0LT*%} zsk?WNba%oQbTPPq))=*BkY;r0{>;eKMl-a`$2!5JMJ}i$4|9I;X_sx7;`dbwm!CJm zhpSe)sR8NsG_WXt5|(g$-oPLk%fztoy{#d#%YJ#;&{ew04|uUwUa1<-af%9j^qz_D z^uLDx)(z?H-m#t;I}bRp5`rmKuA&m8kuj_V#yBGoksIH{K{JJOt6vB(lM!L`<_i2X z1ub0x?T$3aE#YJ0afn|Qs%hO8U$)k{Fe`p zAQRuiqAA)3{=n88D?UZ%H$D`F?3w?mwPQ@e6VlgEL=SW;!#e1wW#y6=FT^zxO9?wf zq<2~7d)3i>y&ceu@C^No?a=)=Y*-aipqYuPflx=BFdu44qtetQ^FF6G3?lOQrNdT% z5eS`-bb^G}IC0xCC_*Vo zI(!RDRGZvx2sN+H?&2!e3bce&;^%2}6FghE%z5>%4JZ_W);)`A5yGjDi|QLk#rKmb z(V0$)sBcAZ=r3at391#n@yvO+5gU#t4uR%y=(stXtK_u(;l^cAN66stu=_qIkFU5) z-q^8R1A~}@?-R-B_wz`^8RLiZlH~CRz7iBTxzt|M+327(i>5p^LwUk+#xEM*xsvO1 zAFqq~FT~M(=myArkHJ#`l`SU@OP+pNKwNexpTnVL`u9K2E~&f~1TXx>h4V=NjtlYr zh7124S^P)SA=ba>(8|!&(7@Q<#>Ujx+0x$5>2HVDw)PubC|_3weNH`+IxCXx@4HAg z7^7xe9S`wrzqY_JwuFmpdDLlT_?6`MXrFriOx}nU>f=$$Z_p`%irG4vab;rjXA&3j z8Z-L3_ff2|$=4vPZp>9+eh((gw`^Qynl7l3XfhnI$e8kmUN_J|?mcK9fHT#ue54Ot zlOZRNOe=FpwI&MJlNe*38%5JMHx&GMw_%GR*P+_Iq1Q3uR9JOat0mi4VK&`o&j`mh zAQ$UdIK?{-#GHk>)4xyb@?JA$e13Z7$pY;TRJsFwqMeColN>af^4-@UIjk$Bp1iZ^ z%eQEzQE&T2g29-qrC+OtGmt2jI2bbOLE;q|W13tEgG?PBy+_xS63TwU^paTxgYd+) zKS92lHxjznLmMYAI`gm!k>X7SLq^{pjX|-Z9Mh=#{ZJ<`h9RT>LYfzPV-L(zyod2} ztb3O~+6PgGR|DI3pINiSRHqxi4Qgwo?5oyR&Y;h^ITPSLe%&5F~0H}yg~ z*(~vAX3&UEus?|^KLn^uWyv>EpUxEf$ut`sJ2m2GS@(~DHGX>|%<~+5#5>*v^@&R@&hQ~t8yOdyg654)S(8QDfedIe5rjCq&rfz! 
z9&DF*f+9g}8f32L?76b@-ljEu5)#N65EYOI58RWA$rFu-6=Gy#wI7sgr0%4Zj?RU- zRH>R+BqmLHUhRhd^}KVB!UYmjM=b2QIBQMpL?X}LIHVU>qTuOpnCDQ;+z!B&V>1!SFX~zz7Ta7(~-=YRrfW?M1e3)Y7+JU~UnlC-~kMkpXp$ z8?Gds)XCT{iOTb2+ABrUKzNVeYy=s06*o%#APCP&x$hp=HK0!(-!*oGccb8pAO{Hi zA*~$)Dd$WNC(LUfU5zaX<_3NQp1i4)gFc3dzKM_a85n(TH(RsBzXMaxm znvei1l8o0@m;$-6fe4jI6a=7yVysEW*KM_kL-S{G|H;g?z{4nlp+8lNExZw0W7l9kwA_uIWKSXwO!1$ITj#U>R5; zqGffrZv%Nbn4mY!vk}4`%v`i)$Ce7?78X7%`P?5GTsO9&ge4nBHs}WpKN@%vnQ?%r z4{roGolE5rvlaxc9=cOPWyb^7Tmp9(&Tv`5u=gEw3`V9(h-SQgI*`qemPQsYC`AemV{x?nyNLobR1mch^3HI>+3h%j7sgWCpA&h%JSRW z>pcNL0c(6Nh9(R`H+E|F~B7PPkTuAYY4!D?^r+lb%j|bxw*t~kJzzjb!@K!tplRHk00&kB3YmjKay*O(D3}V)ie$!#v#gVhoFMq-ZTJlG&Z8(O;i&%x$FcV zfIZR&=Hhbki`}1k)sTqW?Uo|B$To;L{)-SMW=%j216fF+Xd1DO&lW9#U>nI4O>%@& zKs-B7_+mHkGgSFhm1e}yWth8t(h@N_Py0dH|ab##&KdEMuqyEWdJ^ zdsRq0A!<4sO*&0a+WR3n_NL1%#S}k2C<0$=sA^>E2=-WhQ?AvsR#O>OL1;}iHVS&O z5@ZfI9X<+D7z?5r!4L_k%r@5iy|Or6>-g}LhYQJJfnR%Xxy`NUx*yS*FpO4cFQ<1P zaBZsQaLU6PmT5F1;v#LTEorE^;BA24fek~d*efh;BlM+!n8hl88~g*wdn?ODIKjAK zLbo~EJVYR)2-K@J-Hvp2I*zfOl;FMLc{ff$9{fxJ9wI)z9S534l^d0NXkD4CGyRFz zmY>gDzeumnr0~4Jf8@3vt`=UAGBwvhsHHFtKH7cnj64%}{5DL4){|iEeATa(GnC0^ zby0YJduCPRLTJjxx{gwh??9;J^b9Rf1$HauOcISYnAcr&KRwjwXrC9e`nWe z{6vUsm2&cttFDiB*Hl#1WxdHnVLr#6VZ8ujB8qQ&s z;AgOlX*MB_92^Ac45+YFRlkVHz1_7>L4Rn2o*v3#VGT}R`3+~43}Frb;RLxE?{gLu zOv;cS24Wq(?njcO2^Au{=t$7?zsMC<5Ur&Z!@jm?fnf(YAazL z+^|AJfF;GBiQQqxmoJH@63T%&j3VZ^HPH9#X__$LQ z(Pe9}_N+DOCwV1}88c*?36F9yE+|_WTAz7KHc1%tA04*U_A`V&e6---{IRqRoljr- z1)JTO&8WhS@3(DY+w4bB=I<6e)vY<`QwFhTZ#~Tj1sHi}t0UU>(>)}Ms(gU8+VXm6 z_6oo48IEVSY?x3WXPzCvARJa^LFvaeDC^k2ojD$Gy^+GsFZ~1K0yP2IqXkU3De+; zn|r@fRn%X}jSdz*nIu$Pt{Fb>4v#cJqpmhO=e;uwFiHkeRBL8A>Kgy_)=-8p4B8bM z{Bbgvl88?IY1POd>&j#6XGe4+h|=4TG<&~6Axub#6r=N<9#GDz8t>rhj2!jViuIt` zkGW56?{_4gHqe=}`DG#By49O_wdrP}&G-YciXge;diBhndUV%yDV}QZKtLek|E?}2{hPY<|D$UAk1kLD`#kkO9W^&} zHgz-f_`A~7=iO^#*zx50jn?7B*I+bJ{35flWvR-sVl=W~FkV+mKK=X+lo%xuDFF=B zKYnTR-+Y}nj^7xQ^sQz!2sbqfm6gdJOcQamI@2@!GYS)woz^fTX`gB(b=C3YwlM@i; zwlNSnrVTC~)e&?u<3r_v2$d=1la6syGMN(|tg?3%#pyL0B>N~{x=zYd$4p&J>Edcx z7y>mBe6P$k)is8TXXZ-|O-rAR>7_8VCkj>4Y~cM1J>_E?{U286?46`gQ*Tyk8VUJx zm)(`>LyoDPyuRG?=+4BS2l}3~+r5#*IAv!k4}jFn9)AwC+YS;x9n1#4rE^AKX~Q!O ze{kj8o|N8_>6Dcrrna$+PTTXDural5v{@Y>fne|v7?_Wqst9(pPhcPew98%)UfPt` z&X1XVS1;$dnedt=S`}RdeZ401O5u3UkFCJcKipY99^XSP1?|;m#6EoV>mB93vtbeQ=tSG(LiJS*K=pwC$kU-V5Vox7D>2-rFfyvtTHo;Xm1-IVJ@gG6T zW3esFLYc#7K+YR4y_r-U)dqj=s*Pkvb@up=k4m7)?QN7D6HZ-bZiilkCo}-o7wikn zjOS|4HqlgxirQpouNsuH#MT34F8mt1&34B}H|Ofc``OZpLCkbR#Qyh>Ky3rw*JHm; zFOIjjo27_}i<`BE`?(=--YdS(55FRX9X|!Hx66r!dl~n~SAmbm8b5YKzs`@N3pOtg z-=}MVb@FZ|!I)@qM#hGT{e-zS+R~f78;ggjwFvV9CmRXrH1%i%gU*+;o0E|`xTY978xa58wYI`w;efC+OJO&0AY_U~+zK%} z|6~T$0{$+5aB8qCS=zcA5Kl1ZgO=pQ_m@G^N1O5ui&K+({}~plh-CZSD|ET*A-@S- z*F9Jd)sUjMvgDc6)fF_H(T#VjXxV7vKZedhpaX>9$1lf5xJHI*SfHc{wEO{)u4WHS z5s7Z7x1Bg@i#|F!U;_0#UrEV18mfmUAi&>cQU_uM>Xm^y2P})EM6AY_-*X7}vge!z zD)WuWMV?|i!$i&tp;>9*IjetLoodoxYEMYptJKzBt2QR8Ss2aCK7|xXFOF!q@P0Tm3YLv2@1VLXitk^(Z`Pec zHi!Dr3R)ZZ8sDpj4s9clT!y3@v{2DL&!}UvTt~`T&q5RtHyaWu`!uk7dkAUG@><>Yq4$e>rNT}`*$-?lrLfwS?+(z&b{JOAwE97(*kGR={|6r*xe7aGE;7AGn z<>3O(X3jYsZ*?&l+|HNy+Bh6?&Lu18V39<{EQ|EluXp~FF0(;GQ-)qMh-e#MHV;b` z$9+iFNCgpGG{@b#^paIb!N6SO`?WQb+ zqB#cHU+eFPC9i0ysbbsW!z{mIgApw_#SMRfs+6|n7-#0& z2Q>$8O*a@gRi&g;<)|=Ho|aCHt9UoDDUbOXnhOC8`~uyt;Lr-+OBI}dS%b8Xy7c86 zD3SRVg5*ybf(pv6PPcNjd0{?}xSjM~tQFd;caC}L!c!PfeBPW{!YFt`09dJtdpSwx zS{kSMbOxlmS@=U0u9@hQ$T;M_XPI5m($K?zP>Iuop+&b|$Lwly^ZelE=JiR=BFK)6 z@8M)@*hftiYlzS=@GylVNrRZ6UcCGqsfYCA{S^J9t*kfNy8|Hsq}OzJcatdvjYsB2 zmeMgG6$h>a6Vv^HuX#*-{-ZX#1mu);Fb~opHB-G*qR-Ek3xTU|9Xr!+O~E6p>h5WH 
zPlw?gr7bxeX7sIJ3EW#-??yotN=R&T{XYDuNoKx|_YSn?57W3XE^BO9ED1m_7lxYm zye!I^TC;T~0#7q*vgXO%n$L=JDxfTEO?ByYiBvO?gAOc=?n~aUe7xw3ncjL+_lQ@G z(Z4Ni!Ne2{I~~8*GO;-4)^Aa2@Novo&H23?l&zdz4=P>NXKfK&k9rv$X&K0^R^+M@ zfkfpg&a9wq>^!g?DFdoRY+-+_y&5=xaAl}-G^4+&t*By(@1s0I>lJ}#c8f!|ApJRZkYsOd_*%sZNHE#ckLhpHw}hY%7bD4z&&|W!`dQO zNL+}p*_{v6+Ttqb!De?zys5hzIJ$@)AXocTzoTU&+9$y@;nvB*Iphs~yaT5w!lw12 zs)@Nt`|jOOtX!75vKw}2oM)uPq1AQhsY7drKADz%FnQX)q-6FFZd zvzN)VFjwad?5s&G8jodjvW+0Os32ySes@1`x4Z48l$%!6at@1! zzP4ChYv;d}RRQRoJ1*^^Yh`~_N|{#wCa0de!NKwp0t;rs2+?FkPf`>iZqEvWIJ@l& z61Vz*M9@a&6%VKu3Nxq7!^z6&tO7t*25gQd*%uV)=`CC+(Zi9dM+C4Csk)YYS_1V&QMyveFg<9{11KirYY%~4;XPy0hw>p{xY9Hv`Px8DNz237 zk#gRHrynXj>aZv@e)X9t$lZl5);f;`So6-927D4ADj^SfAtn9N491G^hcT5M2P+B{ zhW*K*1pngphrrh_)yK(|Jf0P ziT<|mer1$qFK91A6xU!jr-j|TY`Cd@N>bFHQA?`iJiBX+kRf`ym52)A2B*;D$TB<) zi;g(#rl!(dvAE(?-~!woNX2>7v@R|KLi^?PT^^2BMfpFC1hqvymiL zm7ydaHiu9XQ9?ZeGD1P;#b}gfoZM=oX~^NCXHn5l9>!w~^90$7KsT;2kupAA1?6~d zoR@G`+poE6{JEbBt-r&;tHglVAjkA&9%1GMUCu%bWpG?LL9vYW=$Q)2U$ObbjX~7|bWE?Ip8Kwn4kwtwzZ_rYJB1Ju5@6x|9;koG3#X9PhR9 zd_S1%hwhNht0AlvT9kQC zPm>cg@nMT*u@j)eZ5>a<%8_8m`+MB4`)9Rru(SiZ%Y=Ap^@LIh8T2wFoShI1J#Kr`NXMJ$2J*E z(<`<^UEHa}v^3##V%oG6c!O6=n%CmPjUyl7%`!;%(I_5e&>h960vd(yfL#aoTr!*6 z7kq?~9+KBL_lG^LkHBC9KEkYag*dpn5(ray0%zQpGUdSZ@SqxH(#fn6WiV8FJ z33Fo_1x`yi5tlYc)h?N=6WvjCjUTYM7+-v%Bd;RblZBgDWM2-B7lZ-Yig!ynJT&>fHxV5#b;^%LBh19O~mN4T8QnO718^to1hvlD684jm7K(I@Lh zin;5ezTEq498HEBkWq*Hfx!Z$D-iGOAx`HBe3;V zXWkVlR^TN7sV38ol4;U#vVSrJPn2c<;wARuCdn#(Edy*g&{8UaFKGju4fXVXm0OX3 z*&KQrgfN{8sd-pB6U-5;m^LuUE{ZXC$@%J@I`!`4`!J%;)e~1AL3e=YC9fo;Ld+6% z6*iwo{6#uy&(;q^sHPPYam0zeJ{6_6VEypuK>33|YpU<>vTV#Z$>dSIlF^qNkoads z#)>YK>2l!Li?)I=J=xc}YeJ$d#CSS;Zb{lp)d5;2f67b)?1S}+8^qCGmyD1M#7BdS zz3dEIpiWnIst~QX`4E*V#2S23txY zi=w!F5-K36vNp<8g{I8;ChcQ84v8Em;EW3$WB7!V*BvHgZ&R{93x|E8c{y`9xp<))BtUm2P|XaLz+FSWICD z!>p!yJyUE-jV48>7Se4F)GHx=xIVd%LcWT2TR$FFCcCaVEAVf&GUHJ<=TKQwn@vLi zhX8{)HyzU=q`f@9KB{3!`8deno(ga3`8=XH!@ch*qZQ#%9ePq2(sMDHcOdb8G?7#$ z**tXEM9G}V?mX;S1w8P$`^@JC8P=`f11FC6kw-DRza4q?=i88h&QUSR?RCqNZ^aA3 zs+o88@$q-Z#gq4nj8d)dP2vpcvTY=drW}(Tkxf3Z#%`LKos+T&FWpRXE|bAB`%iQpk*%Q;DhsgZBt&tvRBaTg<&ZaX8pnE>SIpSG)TYPFe}d zOki3_P?}SZnq~&0Ukodp{56tKVuUeQP&9*3{Uuh-?r8RURGcDttY&mV=V33>s(2+jm9T6g0*?Doj#TcHQxThWOxiC)JtSBfygFfh1gsF>gNY{^9E)RudI|GM3{F9?CNznt>!^;E!a}`rkiQwE-Uw~?t>Wh3-ezcZCACqqPhr;mah`Og; zan{xCnUU>B1;c|`M=Lzvgu&{QaLY}?CuhJPA9?_3?b~&?Dmuu7!nHb~F|yuIX5nvc z4rsrB9`oC+PdP<#{a8N|6*nzduE=OU^33mnjNkTqtk=lO=gnAx325WF zMWL}q_ZC33e%vu5WOY5pioDL8WwhW0ehYL$c zUYM3>a+EZPibXN*leHW|yE8A(HQdc=o4SZS)^N;LFgTD!K9IkA&g!7|lNBB!;^fB5 z&)5DFX_1j6O*)1mQkVh6W0otU1wc5CrOw+e0CHCOy&(6VY8pHwFFgh2l9TNr5SpEq z)QEOco8GH#}aqNz|rsdXXqaf*V+)|Yn7kRv&F1@0+Otrd#t723? 
zlo7$Xt6e0{rwg{8ms~f^hy@xQl`|)RkVhXnQeyTC;>(L9%r0TkuFFTTO`F&C;>9V( zuGZUL4t1<}9-udkwFSNg10N`)^I`>yQ`e%Jl1>X|moahokWy&bGYBoMTA$naut722 z^>@M;$9JMI8AY8D8*2JYqcwgU_;CytP?B1pezyp0V^I&8*k&Z?dA@9QMex5 zYq9^VQ{GLQ>|wgMEDn#<`!cyQGszlr;Yym7Ce!89=h@@87m7x#^=VYpvIODgNI=R* z!E_82i`=GGH4TmZ>}T2R_ovw!Nl=m9ecpGFdg;L=d9*I3j;w1zA#0v zZ#_KQQ&lm#XyqpVv@`yucUc2Eeo(~NDWomhc^OaqQB?7;F58~$S-P*0t{eE!x7;DD zDxBcr_dZyV)8e!DNt(5yf24-u#vP>=joC2^cJ0x`8lyR;UH_#%M<^BVLIB3CgLvx_ zglqnofnmmOz2NTY<#v_cZHv!+!#24!!~_9P7=G|*3> zZN8Q|K9S{7bJxsTfnuCYm?DMAdc#L=iNIpT!QB|Ey z62n@fdI9ZNONs1EBWZeq$pailrqI$`)j|#Ydfv1`8nv^^%gSc^#Jg5^y?1-*{33!# za}TO>@Uj0dY+WLkhvK}O!*E<=olF7-wlI%ADNaUh${hzE%C1g~x3&MJLG*6PAT#%Ka!SW5ct91rdo28VjnQTY-51XSKOs^4+dna2y~ z%ytu+7%ws~5beKI{>=g0LGb8sd}b>3qb8d9T>-?MXo;{iPO*(6puu!P@@cEX)9lp@ zn$13XZwQPr5LD-Tg&WdQ-kRAYrGp^ws?5R`IKcL!THXbjaOOk zv2azY-0=bdaHjr{L|y}2RK|McR7A^0CH9S5Q>JGxX%i@pk+#DMIoYG}N*VDjD!f)S zV06B)IjW-QHegyqF``v<7A;26_~O_+ToFu)5K94^h1|9=cs0#JsTE~V+^PURqLB0z zT=ijURdLAS03r3QKvN?DhKC`PKseJvL~~z{dd^#DLbvXFQl=^0y?za&8E)4Rx=%cJ z@wW5b(3nS|Ag@*MNeO9(s8OrwLIfmLMq_YL*+mnT9Q7E8&BMf>2=*DiWU(cfMmr{f zt5v<*Cmsxm?qNC2YTLX%>&N#GyF~%S?4%vXEYNW2wbF0TJ~H0+1iFMXU@?9z}aE_ zpR6p*Ad@DfG5oqyIH+S7FTw2@V%gS>u7QTM)#(GFS@&SW*(*70G42FVzqw3EWO0(< z9%4pRqsl4WuO*b3ZqGsH;!P!=F^M|W7eZ+cm<4*>zi zkwO86Kfry@S`2iaOoLiqJw@R%>Q@IE(ed8WykHfDO$hVfwjbvwR;U2dl)LJ-q8H1X zKiWymGGCKsbF+y~oyn{n+{oS5gZ|LoS~CzLkc*%qaMtYYNkL)4j;s`Wpm&07Tp?}- z8@V+YQC^Y@Ht*gI)o|_TxS1_aSe_1b#Dnyxp^R|#P zsFG=3t-zUag{L8{|Rz>s-dlSI(&;+dTpODtF~$| zAnYn)Rs!7LHRtcKfoGb$#-s|z#lW;LTs0_4ja|24G6gpAhs9daiBmia(I!D(B%H~M z^OzqQyLn@JBzsM5>`q|AH;OdQ!K=nb>TWk)GFwld%l8X2xV3#%qe7awj8dU;X7#_D zNHgN;_EPJ|BxPxv)16~Zusf#~RqX>hTaLjB-eq2zHL8jS-3u5zf|%4#4(g@?a9QY5 zzFAk1v*-xD>t4c{F?Tr6BsIf#1g2rA|3)1NRriv{Q*T-k`+j6cH!Tt_9V1xk1$k?s zQd19?C~C57oKGH7-HBk{^!0hMz`-h`-)6!-%G;y>OBSCWd2`$3dPHoAX z4|}Ws-H*K4z+Bm0Ox#9=rcPjq-lkQ**++z?`^@GHP7R#BSDVND!P=m(>J0^#VNKQC ztzkfPiF>B^UK>FTBVCg5b}V)u=iD51lZSXY2G7x}hQPDr_rqChIbI-x=Uh}QyN^3$ zBDkgT+~vJlcV6d+-9E&-qe>5>UDD@bDI7I3c`LRmTOosj#l!=jaK=YBHQCOK!rEcT z=D8_vvEf(Fr?;vG&!m@B_y>=j&Lk|U+WUr-3H=%ia|p&1-mo83x4HBp9|H{ZgGU+6 z^bKD{Ia=b#FptpQ%R0vXN+x7Z`D1st_moVABqq-yYr6MXsqVG=~w2H0sg=PnK0^l;;QR@l-#_}OZHYw5-C49zd}O& zs2vW-3{6LIreY)t(rJBb-rz>&Lp)5~f~j&

    =vSC_}uBpBv(c6s8XJM{5qj)EZIJ z<5(LT&_HGtF|y4S1P8@0INWRy4mE32q`boZwRXkbg%{0Ihp!+d8JB^A+myAuhA*b{ z>yUEUYl(LMx;k@if>vE{u}Zs(p#;}3&oAZ5UhJQq$GJx7zK~d9ym_+K_$(C<3#%@5 zXbo-MArGc5Ph&i?`G%C}Ki$=CJFHfwhsC;oIAs1Q&& zLjwYISPVeczOuC-0^RJ?TI|>K{MtZqOkrrzqB~b>agIgiY|AB$q^mNhrb!4JT~HD! z^T?-c1;Mc@Mj9i#Ys=MiYu(Kw@xg6P*5_{EMA4u2FFW^~87LqYB;NFAOL=~9syCdD zk}Gk)^8@v?@Zj1kZd!_&y2#rzwO7307Kim%8(BAIW?Cr1<|{A+<74c_9IM}-exYO` z1v~@z%NtjJ8x9F#E%8BDPo@sA^vh19o1DExAPBazT8CMwXBG0@9nhCpQUO$9t(nNR>1jzY&my0WJGT(x2)6RV|s^M^`+tr6?HW5Gx`Kw1c0HRV3n^FPoyH*2zdkoTab9 zW3qH_-^tysm;lb}(N|L|-EhH-9)?FNZb+u?(I9wT44>}(Am|;}n9jw&WHFU-MjG29 zK5SC8uj}3lNM|i6+d-{iO|c%wgULgZu!1N2L?X8@rIh>%Qd^Iq>fo6&7aKfxO{-l>Fnq2V)Pl=N(nakEri z=LNt~0z&(Ia&#l&rTe#ez5Q*0KYL)O$8Tr1!wQL_@8{Z6V#6)JW2ewDH#eGS_&PH; zD;=e7F1$$@Oa~v90-2TH;#%&yc{X$9W<)*SUI2VM1zrH1guW1(fc;;@Yy`FWxpcLq z9*$}VK4#;fR;*i4yL&sC?2N}JjR8r;H1kQ|G=;>2r!jPtI#_|0jEQk3HKQ8_uF+ja zxZCdzHJN+${nu``?SH9~{o9_riECES-!?&-(4M@WRO++>!nC``NF`a1hDcBOBrBmD z$dwF!8ALPk^ailu6ZOmw^^miQeCg$WcK%E)=|ON-+OG-^JM)L?0Vv}EWkQs2$Nm%e z!>{#(QZNi~vB!LW3qnbCr*N}`PTa}v!`#)JItxdkc*n@YVx!PS@)h@9)r*PuG^D!E zzdscm1y+kTgfA-73yQLb;gw-1_Cdh{Dm4+NLyx&=$2<|+>lnsmz_=l|QEy)qzejQ7 zc9h<%n0s04UlhJIvGs74MUQWU>KwbdZ~>rA$dVssrY-{mv#8bn*ES;%JxA~Uda?$I zLqo}=)Ue!dS65+6(qOCgM)Jk|jRjt|cH6V~KT2~&MOqAYOwSd_S$4cv;Vym*A=na9 zVkD?e_h-LSE|&grlEj?~E-$6&=s6e^IaJqtf(bPsDG-s?)S2;3LEqf(m%G|`Slb)4 zowyxEn>XSqB{+9(=S4>lk@}Lx(ZMen#mY(bWhKyZleCYR%V6+LUd^x8%4Are(Ji;e znMc`PK0YTlyFZdW%N-?d`ufI+!boO%*b#;XP4_iHIgoFekmkYM(H23hNK$x^(@?C4 zQC9N=-8+7tk21*u$l2`94)Y^W5)epdJZ~I&>}=zuj~5v16|G#rtY5K@{bx}%K4t)K7qGK;f! zfa6L&{*WBJ3#X~UD$0^=>#f`QNZ5}vNf@5nm|IoQIK%6=Gjr$9ukmyp zO)PA|EUDuh88&BBm}b3Fk;;2pSnUtK!zT6<{@kexSpsw6=;SU2Ebhhv0!9tJCUPcn z&QoLLl{di%R`#lL5VA=8l*Q#}121DZS*`-0Wz}USKKd*>Txs70h+q9a1p#+12^jaW zk?Cg|5Tyg{!gVt7+Yr7N1*L=qSg*=WsEtQ=WoY5|1-E5F#iW*uK5ZfxtBsaQ;(66Cw1Dmfs z`qd#o?%kHnnQ=}&DwgJ>w!4!$HB)cPqup{m%Sv1mg~Y1aEdM0MisQ*x`B`b3$L4a` zR(=;DIFA U%{|V;-kAC|~ddl;nn@!THBb6^w>=N!StKMRJq%rNcwl+8U(3b%>Mjnp_N4h|xbQg{ zCt%Hp=SnaWw})a_CpeKY4xvV;WopA3%P&yC+|gS+*SKp-ZNKV|d_K7crI;7&pbXEB ze3Ms(p+W_eHA>(Y=v}4W);zFF(C3P=V5_3SXEQqurGK{+e=P8%1o|(ZpybSN3BI)_ zp&g`|o_TZsgs%nX`uG9SdW<*2fPk%tKVOU*BK>G!Y+V-<7tzBsXoxl_?loGE?FeA5 zpnf%m|2YpAYf+~y$G2xMiE%sX3WrYMa?p`MzLi-}fXZBA4bbIJyOCQXMttrs zWPFVfVGMI=7)&3stXv*Kr$Nj-mMGM*856sJTXH*LfFkNv#bub&y7@uM>@s~9DmK+d zh(5ifh~>VaQiKFvs$)l%1AT)$oLEdo6dP?=N94q(8qZ3w$Xb@kq)L!hyURibDq{)~ z*ll)V1ff*HXYSm#PJFJFRK4^J!Pxc9%Y|PSB=_0kF7C4Vp7S@<(@#g7j8dbeEF!X| z*ZiBe*R+cDCkzx^6J#lr*QBf861NOTeGyMEhMFx1_CPhQ4h zRyzUV|Ca(2?;>*Vw@@GeIJWfnpmYZcWHQimweTL+yJ*=c^+#of+ zq7ZBnihL`u4-;YX#4d(5FH=n{q$el>6P=yA@u!iYgK z5j*m7iR8X==J2$`!QSHp8Nw1H@=3rYg=io~HPqJ^eYG6{7E)OZYAhZFxA?bS`;IKG zErNISq+fK)SHDY(4Z1fx^_uxNZxq%;XXAR);X_Pe#(Vl=GLNs7CZQXpP|SO&Y9}ej zlLKkz_R{cX9z=wz?a*Bf(z*bTz-_o3zj)M26=nbBW{&v&8qFrJY{r~#&zxXpB6-o2 z-K?wD$hMk&8sK9-#p|}xCmT)4P;T{rJtQGTzp?!&a-VhU>(x1JpRjrR`r~8KeaVqx zD`BQ&%KhOD*W}ULo0PFph>VH$#k-k>XUo>za(mw=9sAs%ysB;^MsE5X>uNBmPSkVr zU>^t;R%e+yDEQw<_m$Sg+u}D8vrH@XXK?51u4h+pchbdg952r5Y!t(DGa4ks)2ql| z){sbO*1vkqf&y_^s)UmxW0wQ!wScO3kbr;yLwO3~<(Iq5QM?v6~nGEwM+x z^3<$3*_`_$fP+<*I$J2ukyDZw)Utt9fR_XoQ`*NxiE5WtUduK|)}XR7sWZINQO{i} z0h~rAwlFM4kSCUC2ZzrdhdHcZq=lu8iFGlpY*(nk zZS@JHW{N9AF#WZgtdVXBa#y^0`p%k-iAH4+ip*TVLU#yr*9}igBJ53!Qy!G6-)!Lc zQbbb(+=BDfNu4zg<0xa{DDtudl#NyDFraDw;v(?*`oFp0PEfw+kO;i*=i85=tC9V4 zr=(N257Me2Tdvc6A0WeZFSr`n;5eCH>D)#zqO2ia4KH z81J3Zqt@p9KJGVwd191XI!x6vTU;$UhO0ikJ3?~_s$g_T0%6oFnWCvl2((!2)eX@1J&Yp85|8vVoAPc4Vj z@IG8*fXs;t3Eug2*yxospFJv4PX`MuIw-5Npz7C6d@E+nQ0wf-pHtVJ@7MK=U+s$* 
zUbgR#X=Ml-uZz`wVsC9Gl9^u(qeVCj6yP_0)naap_lpfXW*m{-7L9(E*IH@@Cf-Mut6Y?G8t#uzhYLVr(?7X@l>cja8H@?$D-B4=@xzT&Z zEoqn*PTNq|<%%>aF0E!Yne($4Q6xNcYDqIO{L}{4!Pej2%hQDT)Sc8@(`}2Y&VCOl z;f*E3l#dy(Q{<6^s$ojfzu^~dJT&vdGkKd*eJYs0T6K8O22t$wKv>5vZnyPUFcI!{ zH%EDLD1OHKa~LR&mPYmq4(vo;YaCuO92cr6n7Fw_()?rGGDxj#H^$` zU-$DDwPp1*?Zd7$LT6Lt6c1p;NCP^Rkhvav)-X^vis@Y1sYHm9Lm{PG&Py(km((|s zjagPi~Z{oERm=sCJoJO4@1fvBV> z8C|rl;$yH?mr6y_ou~o9=@$zTNgM6_S+Lw!w+PEc`a_+@CZf}57Y#91uKFr_$=*>! zm3*ohCM)bL!qGM8PSoUqG?mR-bhf;D$x&~D3M6tDfak10_T8V!M^^`lS4*|(8cIZ> zSBgjCGup6f2d*`D5T+ZOp~xOKRz%>+ba-BTA8^}aVP2Qljr9pU%1fYtzT=%CdS_nU z%IStjzech6*Szofl-cTfbSL~7GYqDRPg(C@y#g$}HZ0(}#p?QdWr^+m`wqc?(Je@P z|291xeaY3-dWnz90NozfW>=sgKUCZOSHz6sNov_enFNp~Wl4NQ%OUa!XRE(TxQ-E6 zV~UlUzmhNrrg2PYFDY5iUIi)5sO?|qtCdKbc;w+U*!fFIHF9ee-BI!9`PuSZWmz-Y zUn{j={5G++t2QYL!LS2oC&I%>`%9rjoZ@V)P>2i-0hPJoVzgW>5f_o!yYA$jK8C~+ zuaOy)9FIvF=w=o?J_qVmMWp~zW&lAnlpP*5Mrfgw=3ceth9Y=SCg82vU3%L%t=^2{|ae4kriTd#as8+k=`g5f930>{K_W*1S6Jeuo!3AS>~9c+6&x z-OJ7Q@j^iSMtROZjtY!=l#EV-@_W~|!i)^dxthSB-txS6RZ#vnD>kJi&9v8OCZ9Iv zxfI}<0Sb9cQq{F{|5t&WZ32O|kuLa(FHBk4JbAS8I)`DOkZjQ*amAPy*P>XJ-W~6V zDoKL(`9wsp51N!rELDWKzH(P(HRvg94EN3CsOA)S6ZI2&VqPaOQ1j z+&2>TjQYod_fwL+>mgAlm2>Z7nIWj8m*S#@iJ40~wI$4cCbT+dU>LE{mQs9Cd7BTU zju#oSS^9nU`!XYmRjk?Bf()>Z5MHR{wIq}i`%eLrl+J9VtD-s=N(<$W|8PC4Aa_1i z#hriXbn(?n^uH_7LUMUCuYo{Lyzs#TrFFe~ab*+0A+2>J*0*a>2@6iFR|f26)J)^j zXQ#^}E(EB;VT4SNCl<6`z@o|@W_F9DkJcNdbXlsnLdjJwxgw0}gbJ%%mwD%;JE9G% zr{0^ZOBh6Z8P|(+TP&e_FXG)jf!SMb&C)u#o`rYy3$Zu_R`LCXMAyGk;MP0O5Wvkj zw=5tmolWmr{2S0~?ARXfYd~Ns{2a7BF@US|I)1E^Z8B$9MI#w8Hk#@O7x=Dv$O@mM zrshxNyHC>a=dS^Kazvd(pZ5E0Z*^}?8@_VqmTO7>B`8%fEZ!b&v5CXiM@FuXi_L`v z6W7mbCy7VXg(&3pKn8}1I4e4Qr0WyYENav<>&D8$Vtcfr2yFwi3uKw-K(sNY5Jh+O zNkqiDiF+}k;QqOQyk)p=VyDh?n)`3_3V~4aDUQd3ctyH-Fc=MhwnN|_6!6V4zTnVJ zv>M^+h8sbFPK|I`HWnWVNALH0fQhTK>-*Q)=c1&2<6mJ11CCf5E$_QeeK$N*m(7}j zI@+c=3hGSlM^nm)267PEvt}AxZYCeba;f-44^a({4ieIgJehzq=)!IJj9GxDL@QJv64ukAS#y)X~&Wf=52m!j_&HT*rK^CDV~?(l{3EVeDq<5g0$a{Zl}nyERvVFhL6^{8{1ctZg@cT4jXhQPgs`hch?0^~QJ(V=q1HEhc&>&x%9L zoePfzjrbRe^U8+fFL#I$FQfIK& zID(?6EL$*j>Fy~t1P^A*px+gdQyLx|7c>7Ao zlSwkORF=wlT-CUFYW&0kEwJxJfTduk$!gUeyhQ@i%B3tKVGXpeZn#}4;~FB3%c_n) zH_zAX2vCzXf;vM0oubfhZNTasoyIH8V<*g^b7Z?QF`qF$vsH&``3Kq(pW$h-Imqg)rzO8Lzz&4RNP+_+YrkR7+jv7Ez1j&)F*!(jgb)m*I?mC|A zuU@oSq;M-~6Qs`GRKrqhW|S;1q|aHkiqH_|oDXe1q(G-Ej~#mi&Qc6I|1rA|CY~~I zb92r&D<3!COC!uETVe8{&G#Oj?YxM0pRz=!r2O`<(}*9R#7X4ms}0F2QFi z_>s=_G#4TwS6`XrLL_yz#VB=Z^%ICCb2dsti)&61?sA0J-!~;v9r$qpBafs`kzU*| zoW;vC-bfB3Rar0e?qY|GVvB}~JK?HWsknb{2mA;A?F!QJ)m|S#bmdQs zeS=hGOqa$!9D|7hsY&3pr|`E=y*=in`Ok=h_93mwt+(GW3seIi^5TX*F3#jGXxiUD zJKsLp04z27x2$3JS@+MjfNM(?Uf6|zs_W$h+anxr;WO&XAVW@h49;q1lPBQn{m^Q1 zoY|xpYIL}Zmx9%*XfWVT*rYa*h70-|C#)IFr}65V#sJfNM-?mQnjCnd;FLQ_QiZxG zOBMN4Suwb(2L)0e$3LY$*Wtgdxj0#8-;rp7KYcTSAwtWOWfjiZin-wJgLb>tQ+ z4wO1qS6geYA06MfdRB1`jzT$W>Cj4AmYuNKsF42F+L;oc@vw#paBK&(!S zfW?i|@e2HC>ZAaVIITEiF|N^&jyef57J$uvE$;qqK1V+OBWd82F@Y>ixHNr zIv~O0hG>3^)i*E{9#~ULfjFkpV(X%f-b|t%s!r4dGIPU4o2@Aof>whm&LoS%)6tOS zSSqq$iw5b6&G2ySa82%h^Q!J^k#wxVF$Scat&`{t(aECPb6NNmC)Zb7iN816Q)0v3 z5f&!&Y5Mg=b=r;&4Aob-HzyzXsi*RT2!r9l)e+!(yvkt+q4)U@8-I{XTb;t)px*k( zn~J1vFc7Um4vBvV-N1HlTR$WyyKVd-$a@~tV7!#Graklr-$+JP8(W#VT;i1{U>$6n zzq$o&kq%VG^ufkP8uAGiHMqeO$E3hrnw7~>?dlgvPJW!khE-*IjcnbhWEF8~(hjxV zHYvNO2PJ)nq~^m2W?r$ZQS9LVI81CMx?-kfvL%F9!QEZRg;K;cV6Po(ZENdkdMZV) zysc~2sPSeLC;K0QZGj3aF3UXBy8QDOLIUfY(gt3)&mnHkLc%Psgug58*bhC!lg>G+{I?repC?57mSJJ5uZr6E zn#rn5i|$DJI0iFHZdlqsf}@`UHsC8#d6*n(#Q1i>5EbC@PA7m{vh4YFtnN*D+uf$a zCz+M1iz?E{7leNuw@n%);3mZnDz33$-QTafp!iRS+Yy;#ZQiwA2{aE+OF6LT0hGWnqySLrg3cNy|L$U 
z%&qRLD;>`p77H4ILVk$Vy?$E8nHanWp2i-f=};RO%wll<)F7ovEwTpo(unzq9{YK)0>u0rlZdk)2gAWP=_fs z=LdC@P5$azJc7u^=oTZC&Ng~QAAD_nydLskW1-r&qM-6jek+oL?El=%jK7s9QfB#t z;LNSX7`7{vNjov#Qc@JiSey_PS+lHZCxuWjS-C8*Ma_rrs`4 zlo7hPG~E4*;qDgkVl29T*LWOgnqbzGnyqNonu9AMD(kr6ARGj5bKK0eO-yv#ix=kY zHZNZI(x0K&usG`zZNJR~z8g;icsQ4D0(J&sqFMHW7wQOR=EES`9|Znbv5AHKC=5M3 ze4jM!Zd^P(KOgVnLS^^RXy1Hj)Q6rZD$dnRv}}lyVDgaNq`;oA3959|a?#7SvMv$c ziNrgJ8$zf-iz-cvmlZIE(B4YPxz1qBN{)_ln;Julakc6pm37Y7TTlHfpw_KN5kQ#m zuXyXD&e#1Wv(uueC2Pj;bk}B(fZ~p zmcl~;c(n4$F#VvmB7P$Hp{&^68WRPPk^l^~nSu~cC>D)(R7{v@*HSY+p@GqO`DG1m zmtwitfaDybc&Y>CK3!;F+p6b;M2}c329uTMo+Hb?>>BfwJuv79SL|T-DpjVGHY-0o zYiNPie^KBhd9gS<2aAYpb<|^Q6ZP3Ax&%HxiU2;pfDj6bpD1MinqKhGbDf}=b!OZQ zOQQ%syhA8bR^4CLxz4U=e_o%-^gj59+u11DkOCa{=Uj?O=;`&~>Yx7M;NIKW-B6A? z6BVS@c+r$E*jjRoh9rBC5)ysM_muuqJV1}zc9nV`WfJh04?bybn_Q0_%|3E@)-fxD zW7w`7KDIL|pTjO-ftayr3@MYRqG3@`viwX*X_@VvKc~&Cp~0M7)a=>wDIA+cv zaaRwGs+72smiej+=oMk{)ib49?PqL6h0+VC3##vtLx}>XdI;SipDzg&r2a3Fmv%d| z>5$v+rcAxKK!sVEO#b|YnyybI%YyFx=FD^&TVXyd2Df=cgIw4FuTquDVi7#za)u<8 z5<=g`J8;s_8XchqHuo3v)0q&pWg3S(@xEuN2 zcN6d;wSC`Qw!V;ICi9o?Qw!nazDgq|BQ)6)4=yx|^Q<4~;l5Y%_*r zjBL%8^eqZFsHgOMA%A~_l7U8NPCYTaBeApn0T6G$uaz@UIOhL?RSD|4pdp%hZoC^m7*OPr0GX#{fdO5 zk=ulPnEm$n9E60onWCX(e&WzKCykdW=oz>KC#|`+`I=3&r9CqSt`v)MQXJ(mInTo?@UJ3=<`zFeJu;y}B| zZIQC)E5a=i+V4>o$tpI$Ty2KWP@g?aOpcvnMXzdBsOV5gD>p54T)JxN=>6)C6FY7u zqm?ZZSV`+IPEQmw6If*obl51&%MD2hQ0%5AvucH6&sM4j-9U1a)L2z@WYKd^MFkhu z(Kjx}HCDx?++E*A!wbQxk6foOFfVCGnShT}wTsD8%7`K!mXGa^I}I%6)tQuz)M>1^ zEp^-xXqQ+xRb?PC*Lo}0WDK7Q%-fa_O1AK$@ad0pfd`|-oCTr)BCu?Q#1TKl0q}WBt$6Shjlg+a^*KI2YwG7Z zGQf!7yt^;#ETQL+TOzk`gff&NiYUSD$#D!eati6d7`^eKBVkE~fT)Uf%@q{ffixJ| z(M4knA9aF%^_LoV`0nS8uh6q~yh|BAR;#XGVbBJwy39a7-JY|y`RDMKd zST&P^p5%WVTS$9_5iIOU%oIGjA?BL-uQQ@MFo=RmMHdXpDLFf;5Bw%_P+lg_{hM>Y zJ;i;=y%R>K8=er3_ecV-df*QUjfn@g2RCM$N()c3h??e1^DaCF&41Q4zho!@v!>P7 zH1@L`HI4G|@&@YZ>;4Vce?W>p0Xqrb@UT`8j!4Z$f9NfpsPh=Mb0sh>JGv>gZ@@-Q$8aOItr%l%QzHAinI@?A1wJHlbJ7JFmc`zZ>9|^rhexxS?c~B_Zbjds%yB zQFr2w!q8!=%W8(R2yc(fpMKqBe`P;SJgEl&Y#L(i3g0jk6g@mNJ>--rIa4hPE7Met z>-68)t~}{6{y8QgfbMZ-GqGUsB#hQh*xjk8hMqhb2a2l}W|Pu>>_CHEkTPiVU~j!R zX2Xeg82h_a4bgjx{0g!QrYnZ9$o`524V-W1;{TF4Wp!LnKyeuO6_#*7xz;2ujvTHS zBBsxSpx2D_r_52=_C6E#7irvHXkan~e_(-QLWOE7(L&tykF5I#^3_Rtc`*HD&gw9vx#yPd@qWhx}Qn#N~_@)I{_4G#r#n_~Z$Dhd~we3djZIu)*-v0E|8# z?-c8vt%9J+ecXTFheIK%enz2j%;s>jhQ$oWTx}aYOI70mgb@TOWHN_XLusV@wm@zc zd@@AAYh2)c1w+I&h3dxH_dlWFWCyw$>lc?TgpOh%fy0!KbK~SRY_7ore)ri*Pvr;6 zC;E;4$y0xxf*yr4Duhm=vcEm>oqW3+t%jHxNy*itq%c}qQMWcNSEBMK>G1cm-95?- z{$2o`dsH7H3d1~aF!2%801MJ^7Tv6_i>E{$ySL!5fFp^~H!5W2HyIr=j@E8|=jn&w z1O=Ok^C01DxSNm838(2D7No#Jiomq{>p=YecdQ&_H}Pa?ZlhyDIC<3TKlzvsca3f< z4H6&2R<2li0->WZVC{k8aT2ReFDZNWk>;pChRFp;a#DKzwnM|SgE!}kmBke2N4eN1 zm_+JOQ6S|C6#gMg#YCidc-QJcIdvT-W0Rm(d|5G;&hQm{GtnN`b1&u82ITs#S-8`p zy0{-alo#;8zH0ukS@oK^T>=t0Asqx-Dk5@bs)`AKRe|ul1&oWoekwn}nKig=q^Bh{ zi)@0Gx=6I3GN}x!by95texPY#JsJ+D z^}W5`H4sWf5I8%3TT$J8X?DB_C!!D|-|itne^8)1_@OtWD;mm1KJ6@i1_n=onDF+w6Sxtbm3m3;YaD9XRXx}Ho5AQ z4@WM|CphWJT(DZj{PGq5G+%{3K!##S(*~Me3+{dW1E2pRG$iV`{uLz$1QenQ{7<|O z`s=?hhW~GSi~b*;zyGgghKT>`+P8PKbGG~E+W#*i!xn6-e~$h8E?+RYkCB~r!A0;c znB7wU!}6N0*uPCVyz?~1nx!@^o|v8_Pv~DaY(k2tloLMtv=o`)Oxakou&1Q!tu{0- z(YT8wa-Sq53XPP4qO8_9!+(f5!oU>bAzp6958?C-Gvp8ix`{zgMQhXkp&>p_>XGYv?#vA(oQ1^~elC|l!Xu8t2ZJU+0ZQEIC z+qP|IrES}`ZTr^O`|Nv9pW9#W^S5J+H^%xCE1nf=Js5M&H>STj{%PDyqB)}@f73aB zF`uyby#CbGY1pw-?PdIrbOhe8dcA%Hc&^YOOtWMMf=c`;bjT~J!gA$ zqPRJ_nVKp&sC<-{36|;p!~_x12f_YsX0ht5t`n@%gnMr5gmel=Zp5Lm;^_&5updVK z9rPo!u|46{^~BWnM-b4G7ebOB&gxfV_Hw||Y+G;5k~3uRCm5Dk!It(^+#tDu_SaFa 
zi9P+dcLDokvw$@C@SLcAIrO_>zFlZ`vEJO03U0c=}lb;PO%w}Xe*DL&kWb@A_=p1Gys?K&}>X2F--u{8^-~MCrOq&+~q_tq%GWeNLV1h(E58P9)sz? z9JD^)ZPh-4T7E^R08Q()CLDfLv1RykZ#}#facQ|1U=2)KTe4?K>FG0Pxz@ir9!;g# zN@H&~@ew~i?o>6~9oKBMLa^#LHLCf42<9IC#OmY^^m)_DPN}h}!g^lavFUi&v2Er0 z)WfPs=&g!4hp#b1IPyHcbJa_8`gA^}?KoV=2t)rnmso8_q(b?Y`GuCu2vG3oh;MTv$5^L{lN&-vS4stTU7( zVqVwo*7$g49<`PGX_(cD=P7rB1Hl zcI+dLTZNbe7eQRvEchOQ;CR&mxmZ^=RmWx3R-gP$P)i6sLlmUjQ{# zYeKp^~{t5lhjI>-}Cd_ z#~+6oZP}+OcBmehujX(e7^B-@EmR_);At*bZ+kIfH#-zYnW8ZXLX8yDkCe#+!7OMO zry5hyI8*QGi^_xiHC{B{=$_f5)?jE9e?t=yzu;)%CThF!YkernIqJvwq29}0p||e` z6e=$C-beiS#zq}o84&*|Zb>GZO&31~Q~iS*E`p3Zs_aEG)2Z;~FZf@mS9-KE%ih|@ zxgeE(G6@nzrGllo%M0IPb-)YJa%mZ(Rg{9TjB##Jh02=jM1;GeAPPrCwzd^>klW;Q zq3{KH)RkpN%c&p1^9;d{NTFQbBN_NC7k;xz?Rpg`&Bg$zg921CN_!*rmx)+J5@|m) z?5tQE- z9|q1u6lKx2_6bnYq|&%lgx6S8H9_Eftt6eeS1mUQKH(jacFtI>gP=6E=20rI&j^3- zm!TGt>MQ3oS#JeHRQ;L{Uu?n=p2+Yx8oU7HY)w^e?^cr&TBdov;_3+ zJ=8<=z=&XT22E=hM0-PlkV>2O_Xy?ylW;8dUcNH zD=Q=s_tqhf_DS4y;qNab?Qp{X#wUS*8hS|aa-GQkj%}!zbCmtt-bs%P<;SI-&r&}v zjQ4p(nVPizt*wkqEOl=!z_7Dwd6T&+g+_#pWS;b5GV+5{S%#^�nWEj(}ahv_hqz z%Bk#``WR_doFA(ru{gM?yk%3#Goai+9{j_txxJpb>%eg6QL6$eUMg}ey0Y~P{os$N z2)|HsW(wDCOdVxX2I}COIWXPYv?T#9yjg*IW2v*ay3c5PB^{1tzZ0Q3;L9l^Bpc(LDC^`3q~Q8@PG z`IkGB+JGpE6IZ(`MUihFa!jdHB^ZLMrk>dX*$Zi6S7J-Fcb5gc7u7}?wZTG=t=xI& zByNNNci%h+BLgb6K=}grjEQ*#$r`h`mn9k_zJX=gTF01wC^LW| zZ5pF&EXXFQEPbS0LC&R;3K5l&YG)lm*K919FoyEQGmmH(-U)J33XIiatt#Xu%fklX zZTp+hl{i2SmouOS^iFZHY@S!Pe@s=)+GYrjMH6m&=|UnKWuv3J!;k#L;o`<;2+dO7 z=L<>`2 zjpUL$J2MChe*|^#Y>bUn*8hR_bjCGF=FEzyrL9Fl5O>D!->T22ZL8L@s)9ibjmB4` zJXB__(CB4(HWOJ3_UR%U&mhuZYI0Itz0~YF()P)WWjOLaZ`Bj8O~^vI_)ro#btlZH z+_oDN_SPVTVhbiSW&z#gqoHEu-3CHcogm6V!OtT4ZT`f~<28u8hW<4%ae2vI4k8EF zVlAWGNyTnSI^+y?q?!ATma+tuzh#(pRMZ{EZrPxG+0m6E$=Zjf zvtih`CZOnh`d~zQg5@$0y9@0<+ik5^LEe~%#ZqQjV=~1&gi8v6z}u-ef2uKg5`z5- z4@|%~e*&)2gdis`;cS-4lY8sTYzp^q@=DN!80;HUQyqsT%fa(C_GI)F7KSfqwBAAR z(MdLBv!E{CAtfnNZ86Sz=|;7hOd-Z>BLalTtlU~_c&a)AK1HJ>w?^?pLZWZ{aNL}z zJq&3X5sgyCG5)dsTvJ*b3b81BApOPLuVjq84;3gk--hy(lFpr$mD31QkPb5dunwA+ zD7G<{8JaFop~;v+%TC2x1OH)0v7rxumn8HguOgyCi(f1}+gD=2p0oL>|L4s!r)(?^ zrDz}pSp0R3!S-N{Yq!c>?`l206z8Tg+EgDDB_?MZ_DBnbKRLFK7Rb~k7by8T3_`S8 zI%!!4`=-nBbz__DxTPqBzrbx@B!b#mk2Kni3)(ddes>H&l`O&bZuN8gN&E2U^W(UM z^D%xve#~VmhKiJ%Yh^BAL%SM4zGmx(6%288AYj8iIBD258iltX&>O7!D*^PY=hxx= zV9In%H#@tfSrLH<3C^MX3_n*BIPE6xZ2J1$aAc}<+Be&sz%u|r+qcWd>u5?&Tqkm8 zd?&>-oAH;on%A5q?%^q?RW!=VpiK4l^PIK?Mb?gTr=+v;lDKx0|UGeW8b`r_ZU#J>aH>;SfeKaB82(Zt$ERhpb($DB2ClbGO{E>hvNiR9}lo zHb6x|cR2P<8n-}_bz>Gp7ZOEYjPTMFE+XtGG*!fk(a`&cCFIJm^H{ajalDCFu`f-b zgPPV~B$_h3jQXW@BYQk>Rc@sQ0!&qSh*U0Yy=`!1TcRA~peUZEM+e_If(rG+SBOeSB)#2|GovO_e$#ZjM$m`WUJEg$TZCUgNriS?aKWqtF-#ZZDF}xUEKC)G;eQK}Tz!cmHJzaX`sm4*tig zu!Zs8*+O9b3tNbP12FtYBPekHji5M~oBnZc{cn>(8RJaVzt$rlGC-B8bj&&{eCN3; z{HU-7`xdS{IrPg6p>!=@I)Jo2BoG~qJZn8-S-cYK*Rz*k9I`RjC~6gLviQ_;23!BE z!_(doXoVBeqJIC+4xh(8G!2Su9*?IIn;{u`mxE{+5udoI-;qpI?||bfxX9Pk=LcJM z4jO2K@B1B`cE^G8QW{UfsYno`(O5^!4-nw$WQp|6?zTa_VdMZTzqJ}fo~0#})6v#iFPm7E$W z&~aBuA72@HL|PF168ZHy*VShm+qD}F?h!xr$|xDb%<{;$I=t@RudYbeN#|`F5Ld|0 zqBY02+-4*?za#Ae%yVzBn(JUg>rM-&g(p$0u)zfaQZ&Q>%+xXX8&USl^`ja9a(}uz z?M28{d-nz~oZm_j7?RWllHfEPCyE|~*k*sK=pz!7|?pyyd2 z{!SMk!^SGG1|i&q-0z{Ht% z7Y9-Df~un7F|%{N)9l3@R14rZ>^1+C;x2tz^u~Gci6~c|oJNOB9m+?^^D*%USQ;(M zlLtV-IyGRA6`g?fW5@ZDWb;vg=F{ywwq#Y3KOo*%rYx~UsYoi#t$mQuNVn)-*>{dG zwAvS>dAxaYnFs!!cxK`y=pG-Cfa1fOLTrp4K4B)k5K`*dDPYwv=K9adP7sb< zQBag|U(cBY069Qxb@~iHtSz$HC{YgnGimOz>WCj5J`7BzoW&ion8hx4&$r~~`)+DF zc{mOEu}c@#(rqVcsYo8dHHzCpq*)8J)N$KvfPt8g+3yLZY^}@%;CJo9i5%sMF@T;> 
zDP1rm9$fJ2_S~g=`yO1*I9hzCFB)q&fd$1ps*-n%s4^^M!^gah!IOPNoIozqvs_K7&DXs2l>t7l^oB-Ft+9UMx_TfK4>zm8$=2pZqW zo?=U8vevmb@kNI|g%7q%1IX^9NrOgpO214xXM9F0Cz;{+YB4%L>Z)_g7Ky?sH&y|4 zPmF8iBqXm{j3x^j2vx6T0#<%^E^U4f9g%0e1A`_A$`-c;*3}Hy@p>g?$sNW!Dm#fw zlD!c)uw{LoG5|N07n8iMMysYI&c}h^@t+MnM`mOD}x(LB%3P@4E9tru+%p5E?!8 zK`cmtkx09db_0glN)RL}OjMPK~5%K+(~9eOjj z^oLrdr~lgQEDtPF8!*#}CgFQ@k5XzPOdmj}_gUbDe8!JR0CbfZk=e+kgv&73wVnMN zjiI{(sc4x_Qt#9(ww`(pPTKF%`=moyIZ|L{=Zv3^L(?7WbuXae9qLwvyvysGgOAx8 zEfY5EZpqrKS{n~3&x1$0`r#)>=gtt?ZRKN7L?DKoo*c7A`Lj0RkLBU;F0oW!fPZy6 z!B#x$Mo<6%aD;!??J)mExBL5M@E_9bAOZe*@yVcy|Ks3)y+QqVvvV|daCD$kR)Yr6 z>S$AK{-T`lnO(Z`|z!QAFynVp${YJx{<@Q>wE( zq`ue8KsZH8ys{3`_yHwF2>h)3)zit_i;pvVOwYD92wE9B-s!2X9|`lTtBbs5kz0N; zBVh_BLId#FX%6n4OH^?$2HvuQn>-he@Z1R6x%hez^ZYwhV`w}NVy+_kI4Ci3ds^T< zuP)Y@M<T1^FmZTU5`?8^kk!m%LDE%Je?pf z@eY|S)Edc-01HzlAeWMGl3}q{NW$x_+Y1sEg4BtE3j0$gf=3Y+3lI_+s_cYGm83Nt z>_+!jfjWY7$l`Yq7qWKMNYxZmv7cp3{0=tgjVgE4FW6$1EE7T_RL|Es>fEtq)0h6? z4YRpf%($jkzqL$u0vPoot+{_z-$eS{>?Q(|dz#I*8k z!}oT+RSy=a#Viu-R4oA2uJ!3-574^qW$OfYs$PQQf5?p9s&sU zqpVtpuQB>grhO1Elz7o>ED@BTVu=7htPnu1+8-B##o)I08RkQ`wTVVvy@}3aVK>28 zHP*%pq?w}=t+MRWo_*~Ziyj`Tqq!zKJ9FsnP&>?Y{1v(eSXEK&<<$3-D0UXSePy3` z8E}fFMSrA1^q8!U8p>Kx0jyELs{39UL+`9Q(o&iSmwN%uwZL04VqTueDt3SeuN5nY zc8v#FJTHOt2;^Pr}!=uG@~92Y!ziVnmW0qamRCW|Z$+-D+f*-)viTaPf3?ta;G2J<-*fxvyj0O4i$hQ%J~GG2(=k5aS7_x=k`9 zNWkYZB-+8J_7a14^c&>j=sPBmyk@9xgKP*Q)BUPZ3nGg=8zc(+At_EMlM~y@yMRU* zW1(E&|F*q%{J9&#Yq$WX$jrvVyJJ-i#vx&|h_^Qj50}R;_F!wB-HkJ6YOuw9Q}+2& zMscdob2w;E2gB}Xd5Hoa-|enRhsrMa?C}T8{Tr9j*WH_|Gph=tV-{wk3wWSgU;uX? ze^N1m%(8QnQ%KY747WG|ErkN>i*ku`XAjlaR0fh8@M?d>pD2vo7 zjYntDBX8cLG>?LnLs-mEmy4}7NK-k@s#p|f$QyImK?n^MKan8)EZd&NJ7)nAm>b4N zeR|{={y7}OL9IiJ&!S%TOy#=LrQ7k8gx_2YL~s|386m)@zBq(9lEqX`fb7K!KJ_S) zB6MPQqNuJv6YXn1~*&T$*v53j%9T5h;PuOu_s8RtrjJ_){as@arM;X)I-T?P-#vNDs~)um3(9< z!35j51&Dw7i`yXXI|L`Q@?w?=1g%AE_}M9oROyOtsaj^F)Xuqt_|DL0aGvPh5he9W z9@QeQka0%>u(M-6$}PTGWtIC^&WY2Q*8#py?4W>U^+Q} z>%fi!zpc&bWNPQ_^#>O3BT=ANFKKpaPG?QmCQiuCw zznqS+_{!F3dC#e#qsRTdi>-<6ln-~#W~QuejQZ~*J}&Y^VCtYuGQ+0?x(BB|Mcr}7 z1$5AFYIRAs*dmFITa52LV8ssB;xvZeAq+%Awk$VnoSd8*HzWZR04PdT1SO12x*-Lo zLd6YCaH6K)zHHnlA+UR{RoOpeBMmC7eC7iSxic z7lwA&QY0I}%1ktbv`oHBxxeRgFXXK4BRXVZ&OL;fMV@1b(f|W^pquZ`$VMmX*sMmJ zp61|C=tQq?&C|RE2QBtfy*^&zI$KPxYf3H-A461A6j?h>e7fD=lcjZa8GXYd#@98S zALF}Z=xj@7-tLmuw!iFt{c?Nt2M;5OD6*LXz26r;jOrh)7l;qcIy~VHbf&7F=G?Rt z>U1O5i0b%pxg`@9Q|Jm;uI+3VN`l*nn{ErD{dCRwdv%y%Ssk+;MCydsvKLOed_EO& z$VFYbq%3!UhsNbuuc{v%!l{jBu^ok5%eUd>Sz%zOG$|$QSsYSHs{fnfW<9U6E3Vdd z_x;-L!~$#vQjQCqn)!LgrgdA&#ctj>e>igQUh9md9zoMBd)cNdsOxSe8<}EEoyZ)+ z;DM)tl(qmz=Qhb`wYzR!o^G1cXiWm`bouesQnMWe`TdKC?(eehXrs%r9@ZJ-& z-5}ij>ks8)G$9q~fGgrCloFOhF`DeqzY2k3Wjp4tWkn_p49uc1c(DMzSHoseu*}r- zzeel#m{L5h=d}-7Ph8BuTKMX*qcH=oKIjD*n>t0!?i32c5!=t%H(7h~Y<@zO>GR~+N1URz|H25rED?5uCJ>WZ%( zZicwCwlcf@Kw@~&q+~NxqG|y|RvY`Wme)IsXxS57Kr@xDg=s=er-6rp?gPPwq+P#% zmIBG?zueV{MygyU)JA8^zVDe%Omo5Rd}t#y0N93Hpg>{sj*tq2JbT&_Ev>Zt z9Nv*?m{n0$Zcb>*-#kWp`PiR2z*-^CZL~jC=_;VWMx$g15))v_yvtP&;L>+I9&Qn! z&49m;`w8{qIT8;iw1KsowB{PHZTJGnbk}tb5DdJpS+mIP08~A_4{Kn~C0Ak=tXHBW z=_3p=$x$utGbLeLM3gIy!%NRZLa5_|=e~c}XhHy2^%cV@%&tu^c&b*5ai4P)l_vGz zdEmWE{1Z;Ke?HrS-Q@Vh+ezqwiH4vK!g14j9#E6ri_YWp)?7M2mRrypf&mes(52YwMnNN%!pVh-5aEpihzp;HG;9oGcjxOzF8;7^n;INP%$da1`(ytpRYsBFm*Sdi<#IwH8l_kq5LUKn?zJ5qyjB z1GW~`CMxxafmi3&?KwsfD|3zKB__ML76;&OH~K4fhtGjE2Eech!WX~so+(Z zp_1nyJ*^8u1*WZhg{mub4^NUv2r*zS^N$#7f&odqWKfv#Gc#K)w}ZJ-tfg+Cxd?g8 z)GhMzG0=fY4G_!sA=UyCIBIBvh%zj7h_D ztZ$Fl-5fQrpHd9nYqrN#5ja*;B7Ny7IXAP<0E+ggOaB`d#?k5hIf#y>i>Bv^O zz31w=EuuJM$IkVuq^O+}F9R%qwsq7FD?c

    1}HqaHG?PTz`-~mv9-H{Swi$_4d(M zZ#cT59q{zhuuU1RF_=^()xT;Mi=Mz5W}-FTtU+a1az)lFzM%$?l6IrdH7JG4FpsVf z#LaYWI>>kzB=gR_8pc6O2+e3K^kr+H^ZmwbVX;_MR2)}PE5WYRm}Px~-X2FMg#S91 zux{gATEBoMf;Hn@k!6HI3s6B!*v53qLF~=I+g>br)nQ-t*?X7iI)|2bh&uCS32|=% zUHBTZU|*%HYFmUi%>PEBv<$|6VcMKRq%gRwYVn?CF$>-6= z=+%d}K^Bi=pJ0D7Nsj#n=$(>=OcPih0rDS~Vt@3$Y7JFrA#%H~D_{UbjLsx^GPE+I zb>J6V7n%z$6B_k*-P6daz>+>w{5c#aA?yv_^8jgv^|&G=U6K6ZQBu5EPl{d1F5E`#Ykx9Vi+WSLt-s;km!6!=x zqezsp92GZ=z@4*WfdkL}oQ;ga)t7A-J50ODI)*FvVR-fCWL z|8g*aT1H4}u#~B8Gb`FZ{bFvZ`C9S$J^y|nEz&8C-T%CE$_?6|PE)^vP3vg3WOrKe zizr*zpcWsJ+-bNj*nx&n6=s=L8aMi8+<{+o0~3C^sWUU9g`7W za5jXv=g%*zw`l2kZ|d2l!(w&|(CY9F?Rid-QJttcjmo7!2-m&|YM+T$i2(-k{`H9S zBJ0md33t2)o6M@=vu9_Rv3SENuiqFp!HgB zn3?QQ7ekz(h6MLguCk#vlXvmX&jE8?CYnK(aYRN6A6kLD=0u5uUj(|=;|N1m-C;q0 zMD6aLjjn@@tcoW4Jj;sjV}rG8=_TtvbDP~went-me$>U=3wdmnwaarZbM@DSQ_XU2 z9(rx$koQ45X>LF}pgZ#mpSKL{eiBMRrL$KvR~wqW=!mMGa0;JGA{BHqr z8hmkjk8iD=GWvSSKvCU^EGTs%e)XdL@pfWLHo^=6<-~@V{eV*GwNjR-6XYndTCUyZ zRQbb;egbD>>F{BwAh{E&;}5tlMHWuNYJb8kSEf1zd@`(Q> zigI6uNV@^zBs;2kc|cnDd3Hd^;~;|9?)Y42=n<&0wmk>J)iG%rKc(O;8az87HiopetnaJ zdHthoX9jRRvC%Xtw*;?{m)LJM>hcCBoPm_Ji8n8geQilMNl{f>BG4gueqO^ z&Z4D2h%Ky4z4?2)e@g7*r-i@ZM&ZIM8zsoa>X%8f$<*FjAg*N4s?>nMw22Z<sLjDsartZQmM$q&}#{t$#Yut2<306p1zM`{0a9V#q#-mpCK+qwV!KvEvuw0 z2whR-8Q#wQvS|?ka&@WJZIRv`j3p*JN$cARfTlj~Z${DM3MT~03g(noH)u{5yEeGeyfo0#02PAxaZ!gMcrYOgn`J0YmmDIy(HgDMTEM{y)xrPT!$HP|FQU{|*zQ`dduw{|Y4fk4D9?{}~lCG5@2mGI#%bL~K!Y+j5;A-uqdX zU;`}!6e*)G7hY|2DTJSGrL9Ch&?-l>q%KL~k5qN}8u>f%yivTJ_Kz4-+@W9jyC4sEiL}3>g>|oY@zBj2bKlvo3ws>yh!Jvki zbUCA}5M<&pCh@+7h*~8`4fBjf|1ZqRihFMiTM4z*dnZfz**?b&WS zFFHpE*RaH7ri6^2ngv1&LG%|LU0$STQflmH_vOf9pQlX*9e_w1s^$BhuJ^Zk^}_Jd z_Gwb_4Shg-3(g+s^^KId;8<1smn~ME!HiC(np9oJ^;O@wOSes=o*APg{sP4ZY$JRE zX?zdF66ZVAJ)#Ft#JR``F2uos%8;QWVsq@wQH4*X{`e=(+9YaCnTEXR?HKXcGI1R+ zKcH$qYPZOOFA+95I(N6HhE4!}k$0BKtUrTyv)R>nW$wNsaH9#gkcC%oXnUsZ_g#Xl z4Dp~|sndH5GjAhYDRs$m_?%PJ*hU`vtt9mwn{H6{^ai~_R?nKHcrd%a5PCf@dgLdl zobD{Iom;1eSS8u!^LGt4x%Qa6_-N?a{M49011~c28zQHQxgk1jHQ>39A&p@_N79}3IK`VahWZ1levtp8oixv3`U z@JGzqb+4+o0hOfhb~eiNlelS)Qkq(J4|@&@j4*}KI$jT8B>c;Z4}cgDK0Zl8nrkuA zm>acy`=_6BuUQnMhH`9JJS%pRlVlnnhvz1r5veA=EOdBjPILP)JqmMW9!qV zauangwl$TQMZim|zOBvR0f;fYnn)8z# z)@Q}e!jz9|SF8FRQB}(i&-WVsnJ;{p=Ylp)fR*M@(Y9?jo-VJe^JktPPr^5`N5E}~Rv?4I5X#vv zpiJ2$qP4!km6Sa!3yGI@(wKF@v`C_p;{td#NxoYm@j`RW4ORoj>>E};iY|vD#sKGa zn!>u~=3Xy5D{WlH2QPVay4={hyM`k&I`h^doX&kSaHlT#MmfvTT~pFj>t$-dR_$6} z^O&z*l=ZmnrJ#v-C(=PdinQQK_slh*k@?%<K<{V zZ8b4VYRFld*cq3!w&bFgg57!cmYtjB@M7uX!p27LUC=h#XR?u>E}bCx+JI!*)wWO> zy7i&<>eO19>D&|~M9b;g%N9G|y$n{9)c$N~v~l@$@aQ7Xt_4B)$*EnAYVEt}n@pwg z;3E=tp=?T#<^%VBqxMtxjGd>-yJhoX`rFoH^7^ltjoPZZ=lxG9?L+x@GaJd@meT*B zy8K5MQk;J-r2qKe1busZeYd|OFr$0OByWUm*?BW$5Cm6*g6t-w@FV`sRY~T@& zW059>h+|k_%{bb2nV{l>pkW)@whZpgKCU&5(!1?p8tnExx@A1LwfbpzfrI>%dCKw) zu$C=v;J#01Cjc6zkIN9lDcK4?+=q%8v?HEwx^^c(@_ykfK#IyDhGhR@&Zw@2jN)I5 zX`X<*nq^Fb$w_T|zkY|Py#q54HmAM2RO(Z_He9~I&sluB8$`uIob$^)vPcBvE0{Tj z9DiE4x}FCUiGJt_F>?31F?ij@;7vMvq?0G0S5G?}`(^}Y4X@Q!4)@%kOr}PDiY>w zY^Ks?hGk<>#Q0JAKt12eQQ#e=rUvD=A()J<%!EciV7Gc!sp26s-;mbw2L@9;P;l2CDqGZ?JJk zH%TJS4$WlAJM?o}Uon49;YH3z?DQ^=O4i@o!MX2r=$u~x>ys$<8eO||JdAU!a|>eU zJXgi3{xx60wx(RoOVk{5%2oJIXpg~9vZ2U`B!Q}4!B1@t%5U{w{}6n(a5bV}0RaGD zg8jQ(Bmdi6`=1}c|LAo4KO_VHxLQMfD=PziL(9L*xJwP|Khq21SGBI+$g^L1Ii|Qx zHPfkiaa|?x)L<={!ukd=F07CMs#Uxm|6J4BsZaMHl(=u9&c%v!4Tm&o^2p%c>ZU0g zRP3d5JIgLe$$%)ezi~?I1UqazZC^wnsJes}$*ckq4{Ff?EzAiaURg~byT5r$ zJY+q75rZkNI3lL@QU39ngygqWT~M-o!5j+uA=aIP8~^sf3K7(W_A#5GOu+()i%tbI zfwXu$QPlFb_9Hjy5c_0^9kw>&{uN4B?-j|*EozMeMoG}lJ=j%;0$baOCk$;Az{loW 
zW^HZ+DLXhwBE~LTV2)w5Adrn@o4u8Q!ZMj8^iMk;np1;mqF-ml%*51^TwvgU9JRSE zd+|}nAH3V0zqTWNaNCdfU(abJZsJ=;DJ8v}v3jR{V_PCRQ2fg9@eytn#`0w<7T18I z=C5Ibr-SIh^NB%^e06`J|DX?Ho;~f5D@H*$htQr9byN0S>+^L+YwUhBcv0-t3>Qym zyeni=)*bqVEe8iRatVbcIQ)>MUn*({S4n0U>tOcA2Jl#yh(E%3^x4U`YOW(m(jzq5K32`XgzXk#^ zTwB4ZRK>ESw|*>?8h5jm@yH$%#l$?;C% zD^u3gAvSxe@Z+Ln)QFQOIs61sm|8gZOBfK1Pb(fd3X_MpNDN?K4;nK{j>{mH;WDZp zi>bU?_*QLBtM78n_uiSy~_ zSCxL)?nM2N9NSd19b$gEyhj4V?GlAhnrc3+bz{pSp2o&9;y$8`UkI>#1O7Zr`B)E&lSvSv{l;^B^gv%7qpQDRjpc7V6M~! zB|i34Wnn?oig7gsPCx)}hr4>KckePy*X%Lwq-JM&@CikP8^%tpmY4SEw2vZK2y(j6 z1Z}0bSjrH=MvfK5I@s^Vb@9r42$B^vaztqEG-PptU#NRRmY1?9mEmRJ%ixkTe~qxN z)Qv5+QkhPK;kz@{uK&8TR+iOD+Yw4^?YlRpRsPi-tz&#_^ji&7hfz^cW@zNpWi}Ts zC)y+lx{#_vYlHVfOHE&;yf9QjgChRUpq7za&#bF_+pzF~r6ul0Np5jAeZ{H7?ndgU z8J+4OH<`i>2x*0E!p@_m18HkyuXCL~ zN5SN{dbCNG|N4z=xyRC0r`alz_Y3mp*7yE%FWx1tE89S<3qZkr&t9<*UeExT}Dz*jDvQ#@h$c?3BWZ*7!#(h#k@|1ZZlKNaQg%T9G63 zpsH5B{54`W+Gt9Vj4Ap|nPr7MuUr)LYIF?vsDqZGQwvKpoo#hTEB}+;&JWjqsZL;K zge+_cRrVwkqL^QH4w#iiwFAqwP0P?pVe`67*zEnZkYWeuCx_9Dr;e;Kzm#-FDo^s) zW@0`gtON8#lB&c=EDOSWh12Si=7XcGSVrG06b_HB#7eUa20wp@nH@9ze*n$;MXi01g*O<r4W?i&H z8L@~vKS~&fFBhOpRFPMGT+mXi2nzT*eRKT0P<#+<*G#CE=G?GUnS9@ni@Bl3Z=Ulc!%7C z##b<#QI(Jw*eN+()_OqG%eDgx*^1BU7kFy809KCc+|-gV0&MHQP@b8aBuAw!0p5T@ z90-#i^DxR@%cWmr&Ci`lB9THJ4DrbS8E))X!zE=VM#m%fWJzi`%A)xG~+5;XF1ceDmVnkyC?-;GLJbR9!q$kOoliA)KDSNbZM_-?t$Uc5&?rv9u z*;eKKQ{-|(hwZs0t^K^nxQ+iXO6;wCN^QRhHN&8Wc>R-7m2!Md6(<2lda|=E;hW1D zX%!1I;B&Nc&u)53&rDqyW@ zwZ+(iOxr|Ru#d<(?UAMdrdnCco3ER6)3X1E zyLW63wd>M$W81cE+qRvo*tTukwr$(CZLK(2$zD(Qmv?vV=k8tge(L@Q*HtwK=9uF+ z&kv3A(#o^=&+SKvbOEo_F;Apma~oFsvL4);5xvonmPbPks)ihzfwHR%L6fp?yIn=K0G)0|AX^Jv^m!O>- z)g;+Wd7erB`xH@mRBNGrb34ivSc}sy#x7{3SH|BW#fF>=h2MBf!6dExktbDM8TFO- zbCB%ReKQd_;VmG?e?+Qd1acb-jUMl(0$RLv`RsR;7GdJpd8b(M;3vXB4lylj9y9Dg z@u}m2tZJ<<-C&=$=4^i~rhK{E&GWNow;KLdv3l>Tr05yxt1%m-|GCOg-;zgqsLaJ?xQ;t6+5tbodTd9jci$+y9?DzW*8Vyn8&aWL30HBHCpW5Mn|6K6@ zt3mu9-4XwbJO4i>dH?al#uop6e^x_V{+bQJcdo8}P`a>IZ3mzSCjvZDpeM(PDrYGS zig1b~X?TCQ;^w;Xr?;0xI@>s)Fr=3SJ{)t-+X)}raezA5VYgBZU^H9}79FaERn`M6 zT@f^pp+n$U&2g2gUycaEv1r3u{x;RkSk_(;#d^FIJRGQ)-|LVA`l$@$fw}&)TlFhY z9Tr&B0~g*OYXDWA^Gfi0J@?C#5(FI>6I4lZj)aY7kB$c+C{A+805&sl8PWOB0YRtp z2wJSTyRe@rVm5yHGG25IjbIwq>)ECqBau2z=4;8=jS5wfWh50FjGZ_LUogen@&Q6O z10yUEuBP#k+g_ZE&*M6YoX_JjY47IC7+?k!RmrCW)w_11m77z@8#d4(QIKu1 zIX)B=0`RPS;GS>|EO3RlJkBqTvAH(4_{yB<8nZfOBu%~Zi%W~Qt`*`{<;bpi&(71x zQ|&x1vuZPI2-PLBPotpB)|Ml$UH2WXEaWIW<9K3b3yvZnCu}_wzxqe)%E^w5t2sBO z9SAW-A|%wfFk91>wh<-rD`e|Vjly+g3uir*WzpiD@O{GAGZE9%8T2LZ`7I-mklES! 
z=u7W=x?hhgoCKb*T+vN+g(2#t#(o62Q%P)l+jJjbvg(JqnZGn2V}?;HT$UPX*?>VY z$vS!|qSw7%hx@9I6_g`x!X#by4Kp0#HRdow!F!S!Y8a=r*zyQefVkTD&ZnmJ)D+LF z33Dn^O(cX15~~3x*kl2p)L@PjlPOt(wrnp|m-hb3?*#KDdIeDcR3Itn!yQisY#&=L zVH;nCY@W()o^owx)!6T?t}h9;#XJ20d570J{}FS)6O)K}44ATJw5hpUU0Li!!qu#fqUzZouvEk&~bUtaw#e%^xX&S(6t{| z>+J`xc53cbnZ?GT-VoEysn%!Xcu#UmS_cVmnDQJ_SnFtmE8(x~B({T3Zxw7`dwlgd zP793*YGTbE395*o1u?d%)_kk3c!+P{|?u;RpjdR%4Dkc;b~4^BaEN$yT&FPw9+6L*t<;5G}FD0VRT z{SdnWM!u1c{S~Iv6kmY4{r+Qiu#Tmnz5aJj$Bgt($&&1!X9xd~JN`%K$$zPw{?|Nd zX6S6<_U}h^H@sF(TVjcO&(vh|s-Q9y!NDY3iaD)n2_+hu+DarF52+_c0mt!yus|$$ zi5XR&f}h+kak_4E0OnaA8pc~DA5y^m*c=xHByymgcG%Zi?srX%V^Vl)p&Zh2tDcq08FEoPJ-NNDjz z3Y}4+`5rlFo834#xcJ)78-FML@q?MApM`tTb1}(iHcf^(S}5>IFEjt$Mlhbo)un;1OIXoLt)^G^+RtaZhzZ=-ov!nuh+eNW1*by$^Ov1|zGzbHB~wS$^PmBl}|T zDM9(!*WnJL9AAlPns4}k-Tk7{bH13O=EHUrFrGXb7(Ut6@XmJVccQ6JIC`h!27cJ6(ay1&|X)Q1}^pM%Xmx;XWh$JX1CtVk)MG z00^Ilk8|7O>%qvu%h`ocj}XR~$3dVLlmNnHFGkE42fm&m2qaPL4&F;o^1U_;Nh&Kr zY{>tZO`)U->xcrLPs+6?hpyd+8*B)WJs2lZ_4u4V#xD4#$Orf=AUpk;Ltx)`pZs8c zJpJB(#-888H=prVBEA6A>_O*)=|s8?fg05`#m}gZ!{N-X9hxoxO8sOl`BJdCZ_uJ`Lq^ zC-`lZ@gf39@i<}i-Sr?PG;YFj=~2;*+3AkcZ(x?7Vrw2}?V)Nr9DY47{rEQ>T_Qg9 z%t=)s)rJsQCNH(U-O)jXcu~AFrzBtHfSp_uyH_smVAl4RGoW+|alVKVz;#%&^MNwz zS-IHL4WK!t1JwOehcNR>zl2%>vu?Qz7B&hao%$%`m$;%d z`i+q&U{_DqMn8Ms*;u#>>P!v_AA7lzIsLmkd>kHh4&(&f51z2tB*GBNVvDxma2ksR zXifS4bM37|t8v2S#|@pblP2b@ARnE<1{oly)t(%m-P^4uEw)qD@SMhiaV`Y>ms6)X zT+Z{p$gdazzG>r|s{Z@F<07ZwYaxPF0e|j3-$&I0A7{@$#ejP#s%CtC3<0(1Mltfw zjx>C6wH)b1ifEtdu#GiS3+RNklF4^TDv9m%LWUvL>WHngvmx(&4&vFc@8g12enT4^ z2EGW;Mw)Wg#wl+FzH|%)!&Dh0FsGW=Jzh9(I*y<|4yoeTl@r3MBag%Z!Z>uUOsYC% z`FWhy+0&-bWREZ}gb_whHq2+l;4APEhY3ez!H3*W{(U_jDS}D~O?*0Y)xSkSH1k-c z1`&`@oB8T==A4KppvDjSSM1?>h!?1&AXidGX;Z#yBIi_RU#|dv03+21k2ks< zZtOt^VGavZ&hh)#3hllR!Q2(hbh6Ag;8x0D;bg)}Y$z)9jr0kE<{e=h? 
zT8O1FE+zb}TS`%(7c55Os=2qjy{LbFUl9H+Kz}V(5d~neZbDJQc{r$P$O+oVX|F(w zG@X{b|BbPkF1Qv)3VzI961hvPHaUhL&|3|Ifo@n;(Rg2sxT(36uKRH$t$$)9M&RsT zeJ`{+J1NPmKGKe_$UJR1?WoC>@=HQvRt6k~w(>-zLRW-uUt`{BISia8`B9Y1_ny*$XUTnlm%crkeuzJC z9+*#04+|Aor89!Mw;E{i!ns$W3JZG?kLyecfV*={b=jxQ9199WWlX%0cOk>f`<4@7 zwnS8%C3!Lw+EO~=l!{QBV=zQzo(BH!&odOxif`Uf8Iy96Kd ztua8Fe>jk@oTopn-Kt(|f$=nV!iwZ1IP-fvZD~AR( zPyid)68?-|APJ{F{#O+!86C%nHWDC{i5)-F(U-WbeC0|}>-!^|?%_w0YGr9e(g)W} z($s@si{#qmfqWH-Kag)GeUW{iKhcklVO`w2?1r(~cnU7rRP?o(ML4^%@U|upfTuu} zn!fK`7O0{Y^izQetH|KA)TSwTOIj$#=E}OIKW+R#G|t621qQFSmenDsoHZ7K*yv*< z>iZcj(mav|4g&+MJ%e!EoCZUoh^6&0skG!I;&8@N{3u!4Xet70H(kg9k+k_m)9S_u z>J!>mkq5*i2IYGJOoHWsG!SUUV7&16P?I^>m?RLx+Fr%?6XKw5&T@1RC!At+eFMtZ zpOZ4X80Q~oBAcHxkDz$*-~Dk6hjk2J)SR&>Du}pU;qb?Jgz*qusBL32y}*vHw5~3x zoMXKc4D8qjye4vlUnbN%PUhaPnBuCvyepMS6U<{V?#5Kq{yb)EINaQ{Lp5PYikL{( z#6#LJETRt%`E_knUWV1_UhWVO{gShb9vJuMkTnwNpldj5)Q5|}RM+6qow@-ahJxe# zq_*W`W*=`7pjDJl$Y4T4-k7^yg+48+A}C{o5l9VR^y0-K>n7A(i{}*RJ)NpnmggGT z^%i7oURkpsB{`pw!&kmSJ)43~8?UcOm#UJ!&*K`(JnmEVa7&Rr_FBQKyH%+$or&$q zPG|>sDKaIDHH2$u8RxJC@50T>KFCUOGbXIoq`V$fo9#1C>J^BMG@x+3XjS3Te;8>T z5z74RaYIiyzn8h<3JNahCyWE3Ig%~WB(P)avC!xj)QRE`y(bP8M}R(7+1` zn>aGHnals68qq#ycK!gad=#))!+X;l=tZbPH+hQ7@9L#U?*Ma&5IBVR^sSW|=+jKF zefqe}8c{gsMy~fzRS-g&C!;79lHj7EE)F0p=wBYKOjNfUmR)9miO|!qmj@+tb27s&QnZ=A zR$!}%B2~I1*Giws{ASyVuu>td4M&WPxH`P2juRDAO!%0%$3OJQMukOhZyrn-^f`&; zq%%SmEN27QLOE+o3-ruUZzG1>o!BraDgsmKIP%Iar=pq}M+dmH1FUb2nj8+{>bmSL zRf~dh&hPqA7uDOQ64Y9#*!ErQAbR2nMZk zoL1#6U<4Rh5*!Z=bZRJ;No@F^Cbui|hqj2;R09D8?n{MZ8VK|{^O{>2-0>P2w~<~G ztK)zNTx0F*qQm@iVtDsr`7&Z@+f@2E=G&15=1wibO1c=BtrpN*KI@X%8&&SX$rP9v zMN_V`0v0DJuMO57x}bUI5+J?FSTQ2#FTYv4IiD8grLc~3SfV#(0zqH|yw@43fSIX1 zwyt)YwiwbA%UVVn*~M1tb-h)lWQ=jNtmVS_2UKWIZH7b}6*u8D)UnG;5ss*ymse6> z{WO09U%Fc4FgH_i{PMn%`(~v4E(l3(!3v{=Y1}y8-9&LJf{wRdEIpezojL1YFun~4ZWNv;UwnXx#F9~^2Eq8?|ZyTswihB{n=_KOA)A)ACxC3@@ed#UX zcH{#l^08+|fci_+^2|ZHtwr;7`^s#6CA7&1I(?@E-~2-@c$OhU$V z6OH81I;xlfF7CcU*}!%usvV$Ut5y3-mNQi!^ydA}*_}R`G5ypdJR0#k+tgns4K=J# z7qML<*>_8FOZrI4GXdwG~eWed^v^C8;d!C)o*1soKn_gVl z8m0K#>O`2!EUjQ!E3Q*}VZ54bl%*rp21?Dcs0*K(lw_~=^|vc{0B`US4Y6eNm$@kCWce0K|t)CDiy%q!io7B_Eq} zbh&B)0u}evv<*R!Wq~ZJ>2U)eGrH`yZ$UU$JmboD_Ps4`murqIV^v%}bpjRu;E5QXUfTwhzkH)B%9uKY5 z{50b1z*ReJwhmDx!bz}E$hj~^C;(Mbs2-^nu2y3~uLlPLdJT6&Q)`kxOG zEk0u=ojWcO6B`^ay#>O;-vE3r{hqiQ$~1)Ym)orv@$?R-2HI5l5R2|S8P-bNF-h|S z%naj&nO3ta9PS!$RBL0fszJ%MlmZ;_j4K&Ixc=Zi@vHJeCp?AbYbu$@A@9a4 zZ+xm5=SBmNbOXI1omC}LBX`fLcNjFcL(GEm*K_sM z^^MKjil0!NP1nN4)1tvG3&4qU-2?<=8mIHTJnf9{)#Xdp6;Uf}%MEEu9_#9^b+zo5 zMS0JP4mReNf>&3A>*$>dpW2B_g`O2(nkCzBcVR=v3QOi7_R9v+UCYSC+)KqeiYx2z zTTZxR&7IN%81cO z#^ET#K5v+o47Tmz;R0R#1v*aFW0=^(oveg`8!AU#SxW;?P~zWu4H7;zz>AZc3cMV? 
zq-_l&5m&4iv-1m{%eDUJ#~A>Md6DUvFzYL#$vObqZUG#BjU`%&^#OToZlRZ&C?6iWUTKC?$`N0GiDS#@kJSneEjM%U zrI{d5g!X8)GXUiAhhvx&t+*RgI%codrq~=}<{e@>JHz1FqKTrce)i;bIg0>;8D5`8 zemZYshpyrc60e3=GgazwWyeVy#s=O*L6TmPIiH%&qG7oRsBbC)by}-VwBf8M`gb`4 zdq$KwHcxko{o%|=HyRh#&CJ1#%xnqlz}6??DH)d5ymQOmmx~ovY4E=5R^-dhiSnX) z4C2jHcWvFF(H-Eqf6V8Zpnx0fdU)WE_B+Z?)TOgEXN(?07q)B{c@fBN;<1+|=NL?~ z6#a1NOC%fQChcTMryI2`o{e(JEaj%Fex$L6I$v(0&=&RQQHN)bN*)0fhAVZx3>N0` zRGM9IwD}PQ3W=d6OqH(i*snyju7oxS6i;+JAWVGaKzhN z=-n>LA;V!b-p75`SZ+Mnkj8cMYzJ|-UpsOJ0_wcycXM1)5olsC7u%=%gWEcsdV|r% zEWNn*^K!b3556P-6ljd<`9EVGru zk=a=RNVCY$+hoD-^H@zypjj6i3vTzK*UNXljr=*h3Ng_~5JAfoB^L@*0($POJclT& zoGL+!c|gSVx+^)=1qKhRkvq}Gxl`r5D@i$+XxjKukLQsp(~y)5eRUkc{j5X50YMYJ zUI=^8&UVYvzs2%obrfmd>+w8#!Y+F3?Hd#qwSKU$*nc{}s>Wyv{Z_qBay{lWw<+H0#VQ3DxF3e=W6hWL!@YQ{0{MvQKCDm#nSsQLZgY#(lph0L z#<+wna=c&$>L(Q?auAC$_$$Y}g4$}lW`yG_XKkYSmPW8AIcgPElzO*EXawwWP)}0U zVy?1eg=$~KB>$2)L=&s#=;U{Be6)Gqq3QUB!FL!?Lr7|`6gMa z9XUbCW!}5(}0{|>V-mKk3aB2 z&rwf88e4y2Jn~AIN*>MwV9!4)a8eAsECDBOH4!kXVcTtzyFi#jf#CVIDgF7v?UVQI zLddZuag9>7WlFUHZ$<9M^@*ku#`w3bz^s}3bE=TxORb2W6fW}dLY~$*DHQ6qMr8I^ zJ;?Z}55R?08afrmD$icE$1eSCul#@t-!2Uw9akud`MrZCW|wG&ELBn_NQDi3&M?02 zL;c7?*NtkWWpV*mU~MzNejQ?k$`&?;it{U~#TVr+l(G62`oN=M>Kld57z&euka{fbp1eF)*ZI>P|2subVr+ZoSz)^Yp$GtctLZvy zl%WK6uGZDxvcN4B?pR%N^N_oU4=2V)RR<$?kn)qj-h6VFUddN+XeN0(YZ8`5 z6%h`9p1#-W$yfkQg~I&#Ma2xFd!Ayk0u(_3M7jX-&C$vc`Lf2MLYW}MQ!5gx&c2o| z9a&4INIR(bzNKsN1ZqzQzHA*;0Te2?k&`UfHL{G>VOthOhYD>%riQZy>TvR28i&ka z=(yU2&`I2Z{Hr&_kN<2BbDKcBZi28hcHVc$?qstiGK*Nkp-iN9O}8t4CKuDu5^Kks zGz!H`0jSLbd{J5kyeY2&KZv7;n*gbc*r3LkS8kP~mPDeynkZxhzU<4lriZ@hKYSr5 z*DU{?#4Ac2^Te;9YQjXRY_yOVDzY6vi9y(G1o~iS!dt?Tm02X37 zNwt-n>PSha(dCZf*V_tBJ0`7G(8X`nC#KT|6gkCIHNQ z;?C^xXfi3C&(`I!z$5Rz%aC}az121}EwOWb46oVL_AVT-zIStEHO-CsAe<@ODI5_myjDVgw@yF^DZr5l@vmQZ!`e$Fx5NT|nTz;VY1XKr|BR_A&*luRb; z4`2k8Ov(GaTl&rEfUVjTqj@U!EV$)78Z7MBl60h>&6q+r*=UFn$qd&3krg~Zvm*}} zizv$8*73+Gg=@<)s;o9_YvHlU(bbL=g}8wN|2wXZ06s4;50mUT1a%};eRgD4JXf%A z1%96Fc1UcZ#SxG*a~zvYcXK~mri+q}oj~5mkj`5WT+~e$QcI>AGRBr?Fe?FP%=s%*Zg#L1>i(R@fl6Cqt$NOMd>=d8Ai+qb zC2*yTwaOhXBz_`;`IK7*HX^+C5o`G;TX#aEG%R| z7JUN>2!@R{F|XuX4tGtxIPcOZPSi+09{Wt)G5Ao-MH9L3Zb96xq0=`KdUxBN28BUa zDi5zSzRJwC96*;+`XObWLu69*T9q2d^!ACWlxq{pmxv>T%(g^1COHtKkPbyMv;9jI zDFXUBoFGsF1>Qx&+#4WreJ`98lvtJoayl%YL)~PJ%9YYi+nHDN!z3$>(54~j?-v*= zhz&d8bPhX$r-`Y$#*nDw7OYJ^MYMn=W>91H%Yi~!>L{hNx0pjz?e5kv!^E$9_9Hj)rR;Z#pxVUSap7I&v2z9HKMh|jyIXRH60S8r648ZAL7IxFH}SHC@%>P;9?u)g3h zT8^WDKBjzn+Ry>>#h8n-R|)|gOg4e?D3_O2V<|~b)i+k4p)R{jvIXh@!m@~>ZjGX3 zZH*=zqXA40H*bqSDB9@{Jk)amA~<@Jz!?-ctqfe~SeeW(un17m8!j^H2ZXmq_F+xV zMaFi5(qjuUCU{oTf>qx1Ui8YH$?s zf!8P|w2F9k8<2)ny2IX^0UvNXN$)Ow$q`y9`5cR((GXrdX43J|*!LA)VMRC7wh#F- z@NdExHDCxDw`O1fS+)j2%M%LuPn1)805dcg9{@nC{Gaw3sQ!7c;lFw7{~x{E@L!Pe z|JVUDH?g+=cSOWW>VWMQ15D_h2NY2cg{VQkXaEqXBSt7%k9F5I9Ru4DMGCE8Vp6-j zzND)Xw`?m;>*$K?qI~3+jFcXRD!^IPq-rP!xT3iUj!KDys+bOxI9N#z&}>a&3h^$5 zSkVGe`sIgh3P}Ct)JQ_4@MA51ac1mK)uDlpDgkvvBMMq{G zg^Tqg0}wykAZ=X6&C6@%3k(}zb9|p&sGxSbVD9m;@8W5L!zdZAWa(l#Bq77593irr zbhVlM-)%0d1+7QTn7ke|708Mw>iLg9c;tm3F&EIM%K(;Ym|TTtd}j_*ndJFAUk8rql;y+C~p5e&~^mkzX=)qIf51? 
z&AGQNU~S$HB4z1c&Wu(`S6sct{UNBqMJ~z87W~Nl6PY%^xtX+rL-!?p=zHe<^!)zl z{fl=D?z7ChXKZaP+`WY^Q7?{cM~+W-e_=@dk31I2y!ppZl^aCf0qnmpKDJl&z{kIG zq_!~s6h{A^i2iSn)&IH}vbJz?{&z7nqor%N#fIYZT4%6=-r)kG8LK^=9`;OprG=tf z@9G5!6VOoitD~`8NyX^rGa=bn8Kt7#f^1E~AmJ77@%~;fvhajM?hY1o+8{t8aCZ;~ zRv%7b|FiGOv+pRo1|lclN@fGfX#Ls~4xEN^BcDO%l%@Y&ySu8J@4nMbbOj&< zJuUqZ)*dVaNRHCbSnd$Yy@UIQpdURMi7Q^zqe1|ruX@#rqOuZg8}BeL+QCF<{ZJN5 zIDCoan0JQo=-1pm(x7!p6KIFDJ0qCHXOEs8ptLh^bX78=o7`jYGkGH|l+G-OZi=v5 z5uX-PBoPW}_~9{_)E%asG2Ri4aGj&rZu~JC#phr}M@n@rMb7|cGHnzcv%ycXE!ab2 zb@G$Pg7m^NA6%QhTP>MErRCkN&v2O%^^W{-Gb1Zq91YSe^z9NoOBml;>ER^6QkzZ- zn6B#LU^(oFb`pgmC-lmW))J<&@sgK+4T3Oc`B>Q{^0qF@Bn535cP>^$wkX_&{Q3Dl zp>SFAkQ&RA;_rCehX|D4W2E-45b(VxJOMf-NxeP@U(~-?@|cC5AH4CWHv8AwurdGW z%@(eEiBj%Lep8i=;Z2Ya>fK?C&qg+%Kxe@t`Ad7B-969Ja zYnjGij94s{bSa?sL_X-HUhTmTe~zVMe5|9LAYC{l5NNjlo%Pv^Ne4{430+;@YLy^) zpc5@`V<4#yegc!!l2+~k?RPYRhg;z)*cbHw-M0bDG5zy1R}M%)F}W4A?4Xdyy^I45 zhqb@&ey`bd#k}!~_k;2$;!J;mb=n~p(5Q++_zRx611O;c!!~)__yvyLy=zTOXRm68 zjH{f!mRi6n7vz;3iX{3m3rAk)CBHbKRe&`YKTGinsxo~JHN|9!*p~joPwz!_4EnRP zx$er<3nXqC0S^~-h?q58+=qL{%-r}_d1faW`(e+NP*OHSja8{K<=IxsZ+Rg&-aM=# zxn+D@8~#qc%Z?Z8%CK}bvofS|oxq}T5UQ+xnKNti{UQ}t0SSGeJ-|;KaVPiTn!$D# zL+q-_@!G2XUMxAWCgA4Ank=>^IV6P(6_nehia?1Rs_>Xx{)d_;%U0zDZfLEyicvU- zONSSyw`KZ01B$fsRJhVfuaFoJ7haBJ@(VP}+ML4tLM{L|R&yp5@ww)@- z0&`dEFePj4Hy=qZ{Vn?0QcnhE|C{Ne(w&zu`_xmpTy)=nd| zr&!JnlXEd*qj0f#&_?&QnyK?G^|0k#QlZ5)PVz`s|F%}BxorAm>A~1%1|@0jrmCh&6Yy~&LX!!j&x!42wZNjXC^6g?z<>cg zw5FgHY(h>dsoIJ}Cj~nE2r^3dB+!^)eKJ>o*$A}Fv^sA($j9gbwXI*fuVk4W-<6Bf z_hISPM`(o~KXzyo9gvp1yp_K)IWRMHVXJ;P9LWpOR3KjU-1G9*3uA`fb2chay; zA)mRq`!ojJy7R)Qdg=2+$8v7}WE~rsZvzRjbdTJnSLC$n#@jiF-kY$SSUcn;Zv}df z;+ZT2g^AoFyeqaf3i1{|Un93vmK+732Qxp9qNOqNE`ure_6lJipW--}akAi@V-8bI zY8jYc5Ink~7;t`7Kl&3&mupPZr@MJrZ*9HV=%M_TD}&AbT!yx$kDa?pw&c0iidHt} z2{y)#mA^+dw+VBbXN1oe5^909>qbSwv!eXss94b>`^sx3549Vb5|!!hL9SKEAagI) zZ9iV(I%pqML;{lo)a;ibzUvJ1tcWA{L3sGw@*9CWQ~rT)W9}>W;Z0YyKdNI zCHB*}&ORUh`j9#uam3QGKmcTO3*{p*OnEz}xM3{CN8E$V%`;%IE^L_II_OM~alzWZ^BHc1dy{D@J_F zA-KEQ*gc~3ME3?NueG#%aSmur-=={liZkUrbh=T1EZf>UUNbYU@&lo;EOAo`2~$$9 z^uDNsOy1EX;{sZ);K#1uM>av{@%7=JLVY>6gzMrj6V(g)gyena|~LEhub{to`DiwOJFtKa`=T+)ZR8 zyy4#FDEF;8YopCGLhFWa@t3GRw@Dp>2WRsFm^}OY0aJ02K&P`(rrsRLc~`!4m(&MA zkV3g<0Xm`Ad(X`5uNm+9T<^z6h$JT%+1aq1Y}ii>$pH@rcI&y*liyeyf+wt9$^Nwk zqM{juTT(p^uOKF{%LzaxNu?EL4mBr2G?iXvqL-#K^{OwdG@06Fo8va-j$jkqS#L=i z`ZKYaLIpN&4OxYn48p&W^!13crxCtG-r;aUCTTzeMfB0nt2@k}7l|xKvg|lf)s3;?mY-wfOay_a?)#F^^lk z%8|e2=6zO2QG44U?$>#Sx^>z{nCnfZ7>W%O_h(9mo36U<39 z@L#$aaRGrBb=vn|y-w1@%~JtxPZhanoC5^evgyMo=Y!3s}1wG-R!c zrIA`SJsS;*u{eKbj8==g?5b9}W2%l@pUdyxW4)tqb7GQ+>L!Np;}t>^W+vN8H{U68 zaV2^^N%(Fj@hscXb;-*!f--|u3N+kXc8gHj$O zcMN>7m^?)rL&5ib&_{9;FrS90?9RqSJZc8ep-XU+`ppe(Y<%{!0L1wgN}PM3F-y+1 z929?#f{?o{KK7WM#rWGP;LCaJ!R>%+BGT>vRR8%6jxQ&WS|nDIFnDYF@BvC$6Z1*_ zUxj}OOP<2!Z}2`t|6c^}KOE@)b;0Xw?ey>N@Fi;6|GLBbuGJANhmEI~k_vVKfwIuJ zG_eY#aw4li6DO^|}DWzI)Bfk|Q` zv$k!i#+er*GPgxbaV})Z3~CXg?Yv4dB{%1{VWoF-L)hc2x1}~TX*Hol+d_ zM1kv42%y`M-mD?-CeDkM+qy$2{AZP}XhOe^{^Dc<-^|&O?Gpoe$}!(nvqPp1)mO^j z5d~zgpNvaRPHr#X+COlZ%M>R>TrC-?JSlZ^bK3l=meKC&V)#fj?@IRvn)ir-?BKO~ zxQ{nb-l!#}1}H0=J?i4=vYMu3e;Tr8kbZ8-qQ%WyD$#WUq2L-w=OY`k=u}84PkcuG zrp(m_Kv^XLN!4khn$%1)H1z5ns@-bELoTD2j?s*FVLWj97m-n;C8(!Bptrc1V9(b6 zJB?=|Qm!80q$bP3?_6yx*v(-5K!IaEMdarGLMbYh+6j5ei2A49RxSPaFS+@MUo|*9 zMzs`X&T0-cDh1=lK_wOA7LToCrlFiJ0gF+q$+bpEehj8)o@j=IpSscMn5tv%xlsrs~RrElTHdD(O!PZC709`?gb?Exb_QZt&Qb z&-p{mvQnq16H?HqHxTdoZkdop^@D}C*j{N*QQ@l`$+2P&`;O3T-lGSTNk;X#6O%J9l5%3b#)Y(f9#n&0f)lWkFm83l&v2|&)AtgoxVSBMrPi_ z(rSN`8ko?xxa=B;q-Q|sb&{}7nOA9CmcCdIXS~X8a(c~um8O2@y$eT&LV?rh^crI< 
z%6Z91$*i`pX?iaIm4GK+WNk%j+iq1Ye$i>eBa@Aj<1%iy8lWC*oHke*0Q@WNU=T|m zQ5J^^kgbX~jui_N&qe2J?^4jf&7m8#SHP1FCtO?e@-!YM>v>&dNc0MeCFX~qo%vWcG{+$qewbg(+_e+Hbz=huv14xRlQAcR!;S_&+aC*9H)p3fG$5P%z% z_zl5fT@xxs@OM%IM-Mnjy&x3X$lg9azgm$DWyVwsgo=gU2V{&r;~jb`|HMhlw+eO#ig4h zQHBBSkl}+vThe2&GFC4E%aPNKA{Ee9Ss9*A7Lr z7`V(Qto@+Lx?eRj>e#@0jTjUE(Za0h>Rqk{2j_9XP*SJQxX9;R^cR_!7L@6Kip)l{v#P~&^ zEY=dkqw^c&!Mj58|MX)0D-ooW{ntE-;U6cq|4pOz5BcwZbgus|Mr<}FHg=94`qn0f z|1Tk^|5L{DM#si!vlHn%SHFNzf139%n{2T(ZG3vFv{*_{hB)^ZMzO!Vjw2~I z`TlH1`YzgP@(an_+{#a)KUJ-2<(4mwBue+`m|s5rCJxumF`O?}E*IyXn|p0IZ@?z* z;BMNbHl8`_Gg#EkG}TO&VFU6`&RWwKt$jKJd2cj1u)BP*qZ>J;bYO+Yhx{IAl);2s zY1+HNZK=1xme$FL(c6rcNN^M`@QRklgBHJ8;q73XXcdN8#vJ;YX0503+D=%;mQgV` ze6#L@d<=gtZuN_p_e1)2$=^xS8Ka3t4E5W$WLjgBb4Xrm=S|2ri7dXD5$w zrMQ~|Bqs9`MY>OM;(yd>ogF=0zrLU7zaNBuhV~De-c1?Y*#?A-c+I zT36`s=gw+>MZvN4rkYoFF8n&Lz-ksbYZfFY90&FS@v6SMn(7_45yhg-Oac;mW+7gy-yM>z?@_v-QeNkKS1PvMI0 z;t>~D^kmH62WcuQ;afy$X^?)XbX-PERyws&oQ~fUVF}G^YQWkhO9vemc4Bzw3&jNRDJ`l$dU^ujd9Uc|Wrn1D03Co84mX9r|o*`qhP2BBgbKHF2iEyDN=6 zeaW!ILg89RC=Zg$*|5R~@oTWj6$C*1eEQY?BcmED?l|n&kRc2}IoUkoamIDqnwc(3 zT2YAfT&ntV8+2%!)7o-(d-ixqM11OGR1L*Om@r{w3(#r%g^_UekheLX#iGLo1MoT~ zCUhRr=P`oYmIU;x3Oh#yar-AuI^v!J>Um%V);(uozoWar3Tsm#hQJ%C5<9i(DMt&s zz>q^Gc5GUE3icimKMIW=-a@Y<1-Co;CK1d-gPhns{c=eM0Aic%5ef~~y4rk{tDx6H zbqG#M{e&a6`R@L0AFY@HK;M`PwkWh!UKBZ|&ag3f6GMnx#Vyo~PZMK%cRusCyfL(I z5pC4V4;;;4u9m=RX8^(Mx~@M!TZash_)E`KxTA2c4uMEa!= z_-x2gAjGLGgQJW?667wM&)O+oR!KaiHe!5knbuz2(>e0Jl5VdiUm*uuGG6V`cccZhxb7|Y zYQmsKn#1l=g5eM{1YTJ>%EU!#*O-@l~m`JN^O+-$LSZ&N>eJ!E( zk}CKZUD*_%sLETl)JPF`r8YF~6gm|qG9=ki;bD>HrX%3XXV5gEPh>JZYem&1gl-?QUJs{_`!U!7bs=0Mz{jf0mUzE!j4 zMBiCSe|v{-Q+Giln=^S+t260^Lo`(_$}MQgrkplPv!%?7-hr~VF0VIgT6<`Q(|T1E zO*I{N@cEoN{!FPpoSYmTNS7%CFi15j2x?jt(tHrO7%G`PLXp`PG3Rlzp*fLtrVCaQ zMOD_XT&VTe)TzLQtKYe^!aAJcnxH zj_%<&vJBST<$zNprEQ^3zs<0PWUHzabA@&nELQZ$>M~~m9U=-Nn#5TX?G1BLeOH?a zJZcsF>c_S^*7>Zx+It}Kek1xp2c%vz>rmkl6RO8af+BNoNZ$La3=i;YJTt%+=6%gF zZ)B1VpLv6yl5 zt02OOFNES26or@@hvUFF8m8eYYcwU2HxQnhq^B1>0MCrH^DKkC}Sm7pBqVZ=F zX0Q+Q&Av%Bcb8Uiqs>H*7hP-j8ZYjZ9CuN!v;2E)yyUwtN`FGCLoYpn4db5evRV7dY_o2*pVK8eY{s%8P4I` z(O!$T^rSj3>PZnHB`@08m?UYuZx@4j>(l%{)m?R1l-<)_q)|dxx~02IKnW?4PU-HJ z?v)a05EP`llu$|<2>}5?Qc7tF=|=b#-}vpS@WS_>%eB|VA8^k-=b1Az&zW=Pv`ldL zU^Nid$vGNS0f)%6{OnMhmacr!-a^hO5q-g+R>w6H(je}lbS(xM6Vs&`@d|Z~L3d|P zKx*`o37ib0gRzX>BH1KB1h3y8;>>UT(-Tn=Jle;5>K+_vVQR>Z(Z(uok|{*WWV@(7&ZSNr^DTu}cs!UC=u1zNjNN zZrSZ?(Oo>ANG?#7lgx6`j{YqR&3#FBH4nag9w{YX<^7X>{q|RpTehlr1AB5iK}?}s zHQgFF5zt)=xTxlYgE%RR=*oHV1dNvq1i9VJL|5Yr^jrMa?TPP>U8NlAw;NC*K9pLN zJpqOu$OM!vaQTc^>j(fd4hoV<$8@n*cX$F^@~+b&)P~(>hV@smxAGCpL6P>IPR`-j zBKbZ%f7Qn%lMcU>z|mG-=^<5tWZy)n0q{WbZ%Ftj@Dn3}5)H`930PztO0J z0dt*S@7?CU1GaB~xcpIweyr@AIi0`wV&mIl9Td8j!}La-iD19wuT3ItQ%q^Mzm3c9 zA`)jGMrobgYF?cc51!PCU8lRR;aXKkVMaw%vKV4L+F8k6Ih3@x_1LCjaChKo5$0OK z#Qp>ZKKR1;7t;QD&ORYx=w(_Knp^s#9~KwwXx=W|#BW$InU;UPAJuyu?(qq`MJD`H zY|^%z1t;|N6y>4ZCd<22ad&z{REBipQoc?uA4tQ{hz6d#h;N-*8$|I<1Zb%9$n?mw z^8)d;SNc_gx)3dIrmMm}Kz+}}I!XGhVt3`WL3HsVg9VpcZo8-qi8sTBU4)?}SIn>m zwZeMKPQu+I8|8EzRivU6Uwk3~1P<37ig;ly_)=tN8@4bX8K&d!uQxv2Qjg^)>L1hR zxQfnYpBpSHL*%VJC0`_b)O2rJVUWxsTY)2t=;I#l>GD^5Nq1-ic+r(Vhk!V zsm2$j-LcQOSME>A(^Wqig-s*-hVp8s`g{n6^t_p5ZZaOWV!You7lBYFwGt;ILLkz(Ku_oQfqPNnKUu`1n(GQH(bc z5#>w-qtm67n#xr~yRr{GI zO1-K`a`C2qhO!>x@-%geq# z(ujslYDJ#pz2H*$V%qoluEl<+D{a8{z9;D-BcSHIq#@?rgKkfc$K_mwO9>JdkEFG(=omNqHeSYi-I1f-&QZ@B zN9ZCg(kHrh0@>FMsc^9k6(X$<5=q4xe=J_Nk7k*ggBPzIQO=vaXLq;$Z*%^mCoctb zkE+vlRl5Inw>Eawx3;q~{>=sRvGOv~Z-p>wSBvl*?y~uFtDpq4w3nc{K48A&n>=PUYlbUp^jhG>NLU|-oDI#Nu zMf^OPLmzhkiqmzRDayT|>RTZ=MkRKNOO8zgH{ 
z6iO{&HZ1jol%c`rv~TX)3>IK7eQngkou5)usV0!W56cEVv5UM%ceCH$PgiDw->@4v z-;p&-G=(S7%|MPlKatAigrhq++hf~T!fkUf;VOuWdoq|?uf8?!&iN&C-Ud;eOJxd& zn9Unq8_mM)MOG;}a_rYdS$g3WU~@_{0w`fD{dUKZ_9pgMpY)g~e4N}L=etjS1e!9# z*)#?4ZpmD*xt?7lWj=G$@kJlrA8oLd7cf}%-&m}Se6`J6)@x(i*#0yg>-~AYIoHJa zjp$2(@Qu8V7Ha5z=`jbZ6k>D+X#RFH6T&x^va4-~3G?nr~%pTnLIuuD%>g$wWg@E0K1tPl}nR$YVNe6gn1rvW(0}N`{b# zJymv}??tc^jJgyt!fq#W<=Yyz!eDr9i1@$sL+R~ZxL$B$FC}ACJU1hncE@NX>OeP? z5jXCe5{}KcLQGms$|(+0{5TB6Qt9Md*FHHG74io4S{4vFZL;m(&wZ6?UwhrGT%vs4 zv{$L}z{v5(RYa7iCDDrj03ZTvPG_4E%jHbzcfwsg8v$=C*b3EJ6S&`F`w z;w>vi%^^eiO+F0A#k|j=v{_kCFbNj2fN8u-cV3c+N5?F#GTbkZK}e5LH!(3aC9niK zeyPi1y6sRtPa-GaE1y(CqBFhB7ZJDiE^BP#z|?Q&k#iT5+CuM}bXtq{BtNG$#EeP* z&D`ENlTm{eTy!y*^_60nR)3u2lD;&nI9BEmBs|@OkL{vdFx^)^x%T=;k!-W%CSiYi zIwMF9_f3xwVb{n$K(f${lR0%op;zWn9X3~O4y@v0bWQ3LgoH$eo8OohyR1U$DhS@8 z+}c=;=_HXtn30SQc-8hS|4zWCYS%Gbe1kUKbM~J7X_7^zQJsGgc7N6?@c`j(S7kGkywD~ffXTA=` zhE5I-%$<$TvELSD8Ot%&(KZ+ZPzNL^QeF>4JKc-tNZcJ>Xs3%fmW>yu zM^F_Hbc+KSkv)D$x{N!*6!TamH7ztcSYIv5%_=Po)_ovc16@Qvc|fB~ajtkUuVA%= zh5jhR>&83F)RzkU>RDec3&_%NiZ_Z+!9tTh22 z-iDQG_G|kH&`}9ZtgptOrzuZvjNA3eW*y0Pv@dUbX%u#S+i1Sc4QtH;-RbVdqDS7S*`Q%mLth)_WXMW3&-J3W99GH`^s&do zM)`afC9*9igRhTgM0SBA41AAsR#UZ^?W@cuE_&9P4TB=1S2PIUNm$yy5SqfZK2daM zb31GV_lFu!xYy>S84@vW;&5Ds-w{78yxP&QyOm}Bt-W&TJ2EBmE%AM>Cbk{DGElY@}ixt?-b2jR-v6v)b@7?P3aBM_mB-MDBqmCOe)~1i!o_7NRY&;OP*7% zXR8CXIv7q0bYRPUGR)5cayv9?2^O@uM6rDr9i+>6mPseeL88!$sbDK*6k`$Wx*gBn zT0|!AyWH(`wXjYV#6d6{|A^G=ieqFmP z;-82*P#1XuQQi(?Jz=_%DFh=cUo5EP zNEjnosPu<#C2;O=4CHLg##<1eX2bR^Rkf{nzIMLvb}*l*wMNasl*%IBCB{;9f7}k} ziSS6L_mRlt`)YD5|&FJ}WPMh|2%;LMNQK(?xAq z=Y>iNvwOAW%eiPO^0~m-o?jp85&G4t7n6f}Ls(jv5O@31*h-_Bl82;3YcF%aVZh{-6 z6XBm1a?d_Ud=c03N4rvd=p^=mv6acGCq2Cs*4SD_{Te{4=8f8^Bi)7p0&3$%&fj@(Nd@@;p`*UD17W^nK&D$%*uS$_HB?twdV7xt8sQ7Ux)u zy(uxQ=@b;B2(O*zKG!B2qfRhnCc>t{-b!!MPrl!HG+~=ghFQN(bP_?>h*cwb zl~E$3DK?c{GHMm6WVH}C%1)-Xc;z6GzW!SZLul?;jeDGx(_0 zOT^P%a=zklR5`IVQb&y=XvCRCSDbvW$Voaef=&<#k7#Juok&CyyAso_LKR`>VLSFB zH<4rPC*(SN{!eqeh1#85>W&+Jm9aL_zHO$%Z^fvO6LYB$V!jo8*-Bewi0g70l3io- zJ=*Jt5Jw*#1csWsVr^N_T1O2kQ?Ay&Z>eKPjA}x7kv&z`8%~UU!0v$k$X8}$0Ce}> zeN5F`9WNDkH)K_e)!El_5^@%^r=Lm4_<9MNEhjAxFTu3k;T<6@w5HHHx42j_MAG!L0C#85 zk|#B1S~xP=N0Y{b%wF6+G`O?!MA$AX&riPUd%ZyXv}m1q1IJ$3x;Oyci>L%(!vuN39RVdE0$)Ec=14zifL>4r#T& za2t2&tG;ReynM79$E%ZowPd{{dWJp=MeQ)IYN@vU$RJ_uumZ;ZEB0;K;0CLioMY76 z^o+H;WW^I$R7q`qG&kIYQ{0@_x?jDIS4vx*bi?OiLt8;_&hxnSxweh$-60xDY`K1% z^LR1r#A+oM|6U?7C#IJno^SSZk_DV9cCTWRjfq?E1|msWi)Cho@u8%Pn1RUl+sj(6 zOq4+>BBECOJNAR68Af+FL*Dcd4D}U83-x4`9Sp%WdbR|9l3}B6o3AO(=rtL^WN3mr zLZY~~q*{`X9-?$VOhFQv_8A%iaI}-b_Pw26WtXRTj4))kjbhl@zPj z__{Dgap*x=Pxc+1R}xso@(w=Bq5VMKFHEx~gkSS0aDY1cbFnm;jK^tnG6@rBH?8Rgh>b1ot! zN$oqQXpy-_^7Khwur4Ma%bPj}rM)84_CHqw#)3MbfgZ)WhmfM?ajI-qmZsJ7LSX^ z!MFLzwa%^jX9ZR__S)g>Epvc81}oCUH9ItO?Txs>hCq`4*;X9D+@G)v(CTgS+(fC~NsQ0hmT(;F(10B<>dOeEa>v(dY z`+4T%c`%pVu~A@;#38|S1zprI2X3A0?h_e11MI$oppB#`WTL2PBC;o0W`4JC*|{HE z`Yhq_JU9LF-G3>+lIdf%v-#ckX&wfynQI?tXt$z1KhWgh&CZX!^Rf38%8t}_uWO-{ z?x&g{be31i-?~2Fyspusc|ZVbaH|7I7O5uS*Yxl<7jC2Wwq9BEmnv%^AR#vC_p<8~ zL?_za=(eaB_->yJlG}SE=gnEuR~%~6wAY*SpQnWM^ho>gT-PLac*((;oNdykc?kdg zj${lecsq=YAB~Hl;Oj`bX=^6oeb6)S%^g8qjy}$s-95pwej(dlS7YrR?SWhEn%VId z({`Q$gmTF)B~5vs?Xl*LhH2hEbUjQpzrjTBo~i&NNO}!vD3$uX5waw=2~(@5Zv=VL zm7rA9$KeW6xKZ!?Y+XUkxnzFTHl>?koVaDY%s94vi?Yp~9)u|z%NcS7(KqzpRg9_w z5rn8F4keSLF`IUa)-?fSLf*KMZwdAE;;mFaat*(k9ogV$tuA_&AeXdGq<)=kI^5*A zFjz{uit~Q3_rto4d_P}vDg&8ZW4na6?jarSuJTu|#vt2IN9a_sRE&&M7AsEzZ_1{n z#;{1F-#}=AmrDp*jn~js_!74FytCcuWgx9q9iJ>J#_l_5-RsIg!}oRp)53>)OVf`! 
zzi!>GMSoiE?y)ED)G3Ob;)N2&<~65iO+C`;l1*V#MAFW6)1ELxg>aV~h;27#zBo|s zZDbJ76VwbBS&KRJRPT-w2a#WK>QG;^{9G;hlIn8Ss~WED2^~L;+L&utO5T-q*hy+5 zG`^s_f<^KH9|e=(QM36c-u1%mli94+_Oep~Ph3JyBEpt+Pm@rWFosVY;FEX4sOKf2 zr1Lv z(_N$K``p~oE1eSO#FfTP=iqz1?j#?)-dFllrgz}fM&6yF1foU0(9^%s+sRn?UTH)s zSOmwLaQoyj())_(w^PRS3rd+HKx&t-1;+QiCH>#&mZfO(Mm&v!UqrS;yfW)$b@iHy z1@?wSy{s2zAYKCs2{TExbIs~)WFO2=sU>n0onKbZ0fkeS~4Af5fis!K+l{`F1j=_$< ze1lU7vyDDF2{EtKBynuc&D7a>?_ngf6^CuaMm#*71j=W7p-F&;fF3G5pXnF_peuq2 zhenyK&}Sb{y5ok&*6ap>&5+ts(d1wS1%uoR8~yfJEDFa|%_q}0r%qoXjmXFUtJSCe=96_NVlUKM>O~6_)eK_84B@)? zyQV=cWMWq0I6sCGCa1>J z8yNwUXe^#*aU?etxQ>~Q%_+%9`5h{EW(Tc zbXXS-&`Q*JXW@Esh&+r}<7aaBR={1{cJ$7ju(j@FTJTk838Xik;ba`7e(g{nCp*Xq z5_Itu7$b2z`tcs>jmcS84z8-Wo_nt2?A5&gm>7SQ4-k^YkpeGl{`*pV(Wt#M{z+o; zKhj`eu3*E0H%Ea#KWhjN0DFtSK0+(Pr@#D23slsReyqxzP78VI(|-p7kib8u7DuTE zJ1hhKLtF3{1M=TN9RK{guCBR_xudS`4+H!Q>2!VPFQmVSNdGiLks*-23wMK&{vG&J zGTmSzr{D7+(4ZjT!9|UKA^ueg8s~pX`9667hye?o0)Gi1a56!{zu++Io~z&gKnFVr zBI@7)05pQ1M~VsTr+;uV&^NTSvvqL%?MVF(>1=v@NI$E|tb0LF_Z^}P3LpUBq6Po} zA#`2=QGqkbAM^~Z%#Cdv9~e7;C-@~Ol0u+mf)a>-rFaS8^}}#iMDY5m0hkozBVGYL zCi#b^t+k#0WyTO6ir_f;ZwOulQt2SW|09@M9|iz$9RhL%)OO=qmHJn97e!qydFrkLekHgf4glampss)hY5peo*JZ}GR#wJ_j^LQ_;1Z9S0s=7; z1mgOY=Ow{!G+4of$N>NWsxUuUq!TwJ-M`yzUV)g>rGyPaoGf2DXqP|+$<>oTz0LArjL5HVGS{mt&5 zBIeJY9@6U&KmcWe0+WAbcOK^NgU?k6%oSn#l)uUS^-g+T2*UQ=gU11TYOwPH;6Q?a z@}WV_b=1EPM3A2FyD%{IXGQ~CpP1nuQSoeXW zgSn0A|JrK&oH+bG9MVDHW`cV2&Ln*~5o7l&6l%Z|=(REcKn5XV1=N`jMg6Zx23|lp zjggln&j5j(37V}ri~Oa9JQLXgV1Xl(8dwF$aIymWQVT`@-$-&<>NHT)WgE|;eklQr zRxcY4!L*yO!HuLK1gwC>o1kc$+kmab$r_xToR$&YPMyU?yciV(W+o`U^(@ktfUo2) z!zKi$a|8$g05Jso3Mirtit>+GsPEtac{TLKOq>=1HWLKjaTe>#Nw|Jz)u$4iQ@vvS z*}{*(5`LtAfBCwA`hxh7M^>RzD`ada(!V{Mda-B;omx?mpT+x z&I%}u_P>?*>wwV3v|xci&jgLoT}q4p5(IgUgb_l}3TTA>95sG>VB})z@IXk(1Vu7l zOr1*#gxtT+1R-z*^o;2owSK=*|6+RaLWs!(t+HH9uggk=+-nY<+3&Ibsu<)R{EKM; zo!JwypR2{i;2}45LMLmO9KTZjrF?cV9}AtV8StFT`w}uBHv~YgcCLW5d7;SvT=~D) za6@{IOwc@dksNxh{hyTQ;^>g&OK1Zb6M-iD^Dg|04K%cYEP~HGoWb~F@TY|o$n`O7 zs9E$`{Ac)o>y3Yn@9zpLkmWb%b-PxHzefPF&UUc}1v!+NAV$ewdHpx$&$>dA%LL&{ eL16w;vSJ@$f#yuDzFf7e&r7MV+=%bWhUq_X^r zH#f`P&42VV$1|9dGMLYINZV=CIs65z8arM2I~)(ioa!ao@ltTF#5lo^S4-PblK zy-)^8!w`v0o|QoxZ99dFDMc75zp?k3fo2|d1Z5ZRo8&x)3LBju_h)PcAkPz;!weMI z{)}eE=T9UMfkS~x(jv?~6%`-Wl>*`w0Zcx<@i_P*VX4?uGh!yOxFp444%{297-k?m z73R}T0KhRE(Gd<_T1K7%!Q$a?xK0?PJ!+t3Os~N@G}+=JO!)L>@SVh~N)E#5XjaVA zRRDFHHyv^zAeZ2SfNR`pgV!L@{#q3~20qP;IFg(Lug1yL$V^9<3?>ECLu##>*(a?(5%JZX?j+I~RO3kc(>$>z<^^d3qMZcn#>Yyk>3K3x@ z41k6#Wfu>{KQP9Ch!`waWhjP4OnHeDZz?j5zfmZ*-&VpVG=IBJk}bN_0U->h(1&=Y zp~`^s;&XijeW46KegJUco0?Fqo%t~6*a47k(LUcZ?bpCjm|U>zb{nnFC_C=*zfTp~ z{PT<6x^kEuR+R|~$-&ES&H*Z5d-0<0iUrH`F1V~G9oquImm??3f(8A$X4u&kMt+q_ zM3ZcJW_!RqWM%)WpjL8jM{N$}ufkEtuO{oyk}uXA%2x4V&5?AxV8#2I!e8miKmH@m z?=@RFwUHG-LGB1rUTKaoJ^I&ya=SIlLYGdM^Y4Ah{#Dxc8KFh3uJhJMq&_YnLD@VdMa{Ec{;W{F%n!HV&j7gCSgph=&y;u>)}hhC|R z7tx;^DWaz6-v*>G65LUz(D0J_hWhps8Wk@_6) z9jV8Ps*QfoVZ)cr>*8}*4hem=l%?0QDO~1rkQ+kLzWVKlhV1*h0 z-SccbgI_x%J&EHeXW-iq0Xa{m%l3p)( zg(;{&Kw2i{#Ud&wy`v|b8$9xax-=MFPvZNXggoBAovLZJkK-%pv>f=jQgf3?&T7=+S9>@qnQ zRmjEhp3~G66(z%1Q*ip6#&qJ5z$ur6fMC1>*D0cEnph38HZ}}sekyF0RZ0eSp+3lr zG=`V;jB&z;OBL;T8F0{4cO@D?D$c@8Q8$N9gkA(YLMb-g6MbZo4^$dsg#gubMz7Sj z?9mGRIwrZpb>e@WkswsdjMJbsttq8VbF0)T7+oz&Y-4JmJ2oyP0{Bod)5?A!W zN6+KN@>p+TiPB1m6j&4lm=f0{@_bgsMv3m_$n*1lpBP~P0J*y|_V%v*tE=qHRu$5I z>Arrx=zhM=-`|K|TCIHCS$@2*=K4BvwY`Cvn5E{6v$gayrR+{+Ly}nR-QlJw_H0Is z3k50*n`oXXqn;O{WWQ`h)jR8nKVGl>X^RD8(E)yX{X-MckeW(0{ceMy2Dm!;QV-8K zRcu>*utsrGu)q+hp>c{@(enlj0>;Rp?gdq5a4B?UQ-mn=jOS6}Y(p51I4NZ1ax{bP zOU#)HX$G6S=<{Zvx(eKG_bsMq^YktLX99?OzMG)vatO}` 
z=LmeO)`zKY3PMpisuG*WtjWn)^FkJRWF+>akWin*E`W=!ZEus_05F=5NBubT*qXjp z!i3)XwB5RTy@5E?(hQ_3OV`)K*oe0XV=Av|hn;VFLx}sndTN?YW<)!NqgNlefECuK z&e)1IyPH9zZXZr!HFV@&JSs@sV2ngllwiY!Mi)JJcwYT(_@`htK0DaRm>^P-UIe5# zDwN7f=lYn7N8DxK5h}jo8!)7j zqA?>t6;gG}u^s5J(K_Az8DZc3pPwO)K>_hFA`hmhwY)0i!&|OTd6yj7DSA=l#Ho1} zOJOvsywZXd@ViS1YO3eM48DK32G8KPwYHJreW*YyT1BPF6veW09BO0;p0&(5Ln(1KwEj|rg_3tHSjY=5Pw#6?6U0-UxuaSKE+$Dzny%=wb1>#>hla@`@$lPdb z;l6+++M)?hkm_5S7igBrBVjS$>|T^^A)2KJ+yBIRh9^-)pqjN|B4I#c#gx(V(iN23GQ&d{yID$&I*SXv+uZHy%@ zZwj#hjuzu!opRqD2?frtnQuc)D2JL|n+(wobld!^V@h11N4tzz2qNuO7#ImZx^M;r zJE;$IC8f=}SJVCUhN)m5?5U6P#05(jy+jORX*PZ)rap;)NaIx;DqS48a`5;su(33qP z(C-Ndau(t96nS4zW16r5$8(~+B^KwUGgM1`HL6E^(ikk)X%C9B4b1c@)2)Pv%j~3{ zQxe=XMuJwFhdTVXpd&-&Vf2uvrf}4bb0aJJRgEO?je(Zzs$rGiq>g^9>TZ|*%2coV zMS@6NLHXCMMtvSgM~)l%;?ov13z1@oJXE(bk3d~ zjVD5q%4SS4-yY#!$P1)m#lizf!L2AwMEsg)t+?rIZmof9}g+jCpRqSTQI27ks>hF21Y$50fb&D0p*5q$>)lMrlVy5nXva^{R= zA2R7>%cNS;#AtA=y^D+YD^SCH_>c_2n<*0Th9{j33SHzkda@@GaAS|iD`0O0BV-%3 z>nu|T+EgH;nn8cv9I4NAwUNBZOIl95l^5!PAB!u$+yt$?nj<_ZY#-cQt0}hwCcU!i za&YB9GUxJ>_^oUJu@Tu(e4AZ#b^qR)!2C1%?)u2R$TQUfw?LH|XB4|{OoH}{!wGDC z%j|FuNwFD`;uf;(gdlxz=2>8{#e>Dw)PmZD$L&KSxDoLaE` z)rzSEffj;y3Y&y5y_yg5D;(J<=h>S^&fvDdj>M_EiT!cCg%Ely+^VdJ$|eTvbX#Bh zx;On6DE%SC>4C=*pv7v_z}1JUvz+W+286fuVR6$%pbiG%zx%^#FG#(xVYbF#FbDSF znxIQhZXLx_?_F5VQBb3x22$8+!8I>Q-?Z!N64B=i34EpYAT+>lzbK%d14{6~Up#N~ zMkOKbCfzjlO1uPaXyJf;75(vezt#SnzwCXS8a5n&g8&pUxzB&j-3|V-vt?KWXRdZz z_vz|j!wZOg*n(v}i#|}QJ70??w-7$96SeZ4{~N{dPUZ4yvq9XrfjW)dK<*b-plz_kN75`RWl39lW6qTS~CUa7W0 z55eDBgJ3Ryb@m80E0mb^sgH`-TasMw);Itf*FN4$-a>Dv`+er}dAfx>9dl?$skzHL z!G#Sl%1C7?CMVaPV9um{_!!2xrbp<1%_~W|dHC#8t#d5PW*s}lZwA3;sC%@ve{Wa2 zq<-IF3NwpE$<5&N@>L@=NaLo3q8tMnleoKHk+#Z?hxBF;Jc^_ zaZ*~q!NP?%`jWFK;%mDap(_iQlH^*JtUKXvxL#TE+=FBD`AP{ZTujVq(YA9mKTN`P zs?U`$dFRV_#BPn3tBhJrA*tETuhg&hjw=rj&85vfHxwf$t~2A&5umj^;7W_v{0c>m zm{OCOWDa9E_CecHo1TuuP)3q;r!(6W=Dk`f+~|8VcU-UH!_iTA&pNU(Z8}~}EPKS< z*Q?be=XWaR!i!Hv)Fvpb=yZA44k_6WFV8d1`0VT#s7)KK_r9gFb534`VG)|cj3c6WB*=cB)OJLr66UAbhYgf zLEEs>`vc~@*MrNn>n~uX7j((JMFvc?Wv#NoPHQRl!W2U zFE(O3P8S_B9ca#wI?F2K* zbe^lV6!uWP4AS6ago29*mlp>%+~JX{i73lIEN!+?*|*X;P7a|(QD@ThVM}I606O^X1xFLIGx>3 zrz2D8F*14rJF+mqvU)tHA{uY0?%GYizW}7WV3Tje^B29#m6R7op{Kohny#Lc^3W5MAWr=z<0Lq`!puV%Z)ZYxW;Noa ze68c`cnd(E>q|G2bNL$Q?V8Enc}UV$j4h zqq3p4NxTto^`}@!o|U7gqZHgmkeRamwHRdW(;JgngzRA0b+r80oH2I;t!4KjI8*~T zF|u)g9+s<#r;+k;cfW%r!x^=;r30p8QKVYwyebFbeR?%Fe#ag@-j~P6s;>%??Pq5C z#)LQb5YbDGw`<~1csP5<379~S`H(|&b{}#)gm)DR-o{2VRq2uvO;b6O!M(?(7zeAE z8VD(cwRiu7A9w<*|82aNhRqZj?Kk@2zQ50fisE{E5s|<;Uw?|WFNJ_unB(dbhRAn! 
z;0kwvSjO!eH<)TqWx(U4fvU!rRqv!Pg>fq6 zR5fSr;FLq1wlLJ_Ir4`@w%j&$Y3oXp4-t;Ze02B?GXPpvTODb8#>RtWzWBVqyQ{rB zQ=q?dZ-Eua*h`u3uMKQ=NXIk&MA7H^@ zthG>VS0%h}YqI@}-BC?)Jha(&cBjkMGaZE6ie16Y+M~=sK*)$Rpi*mxHG)7axd9aw zHk)W5<5@$xxs2*b3mwD!WBiWt_$0CpkHypeA zyhjDze{7R$bL?jgLbIr2tJ2B0A1wEyuy1kILaq8%Y3qP!q%P-iDKV=>UfG*N&o(W# z#eXSPp`UyL#k_qf zW^`@;QI0sP(HgU%kC*&XjOCezb8jowfgs)6rl3~Lz*%>*_<^4>SXd(u-{Sr0?P>S* z4tD;Y<5bz*ya6ppbcc-q^YL5Sil4c&LNgHgr4b=Rx|&9xypEpE5b zm?CtdzM4yj*m_w3G-ZzQX&ku+>7t%0_x9Giqo^6aG=m+`LI*@67QA z`~Ro$?muHazCnP1jG+HhW&S@J&)(L-@PG7}Zq@17MFymvQ!24-=>}@#3*r!JFAbu- zfDnN{jslfqk`&eM%G}o6n_3WQih2@yD3W8+5ubxAy$P(p#V_nW?Jgt zNOLg3!6Tux^o$q9)s0gY?a-;4a@qH7V8OcpsYNjqIwxovTqHyysq8_dw|1SaJ}dBj zBZS&QQ=lhD^ofsU+w6v`nG*ZTF9Ek#c=$eWD9>3CS{4MOmm{@w{sF*>pC_pO{se`_ zCPy$gevK{ev^bHhTp!m<4JDS-vS&}`5EM`P9YZmp(WPZZR3!Yu6~c@~=WCD9!kD@p zHn&oVBJOZ?r#A<2^Qm0S2-k!nu8CB=mtf3IKH@g@KsO_Fk=KfJF9DMi+>7(#Ka!;+ z!T2OF^Yk%>C1+ercu;AI-%93tLM&(_tcB}?+hr~*{Y67DC&K|39J=MqJ!2+Ux2WtF z9;3H&o*8TOujT=mzk7c*-dSJTl73N_v>j{*|bt~-C5@f!)lqjYzLnz0w zD(k{aCKkmj3U^7$Ju7#k6U<%qbzb7lWz4f@3;Y)fN%)i#N}0as154zWN`g-V2K1FH z?Ie`P0k}6T{SuR}sDF16o38X`v`aGb%`&yaW37Wnh$Blv%^S;9y+?IJn7VU;{G zP3Os@N6SCz+WoCX6o1_>5(g_-%loSgw1RuvzFjdtim7BUZ{E~i^&A}QbBA3O$GUIm z9*{#x;bbh23 zES1x}kki`J=PQtlb>Wlv12xB?5R&Im=Jp_>N^cBaeN+f@)IN@vWQ zZ<504AzqZS{OCRzDIgaxXdtCAN%SLCy-g>TXJv{eqlBi<2!n)eb3eVwAsyE+w=QP( z7-*_iG;oHVUBe+e94S??TowSR=UlU#aNd=uZU|9urUbO_p`}AN0<|znxnMjRBzVYE zhk|q}d5^Nwt2iRlZ%%&ZE()>!ZTx2KXC;?RR5<6p{Z3VFJ2og$%dM|_qy_$<92~tn z4_K$zi|uGupdlx=&mjV-+IRrRHNiv(lcQduggt14WMDX?Dp5&e$IY;5t3Gcs5USE} zWRyn@7i?D=LW=m*P^BSXD;-jk_{}$S%cD6{ta`>|7#*sZ7->&q zDKi36Fw=QKTy2R!qPa+GGDRk5WP85t(fsZ^oEz59T(Ggxv(Ed-d+X0X*>gmxB7$Vo z9`JN@I|7gPyW=AjITdS_t1L=j<`seD6jFY5o3Dmc%au9Jlg}B;fKhndGHT;oH!KZ!92oZ*{B?XYe@a^xnuJ!> z;k6>Pixz&*@~VB*S^2TREj&pLN|2eUytb${LFk*D-m-EdLwSV83#|qz*?-WLkHbsr zfJ*jeowK{$Bb}sUrzN2co8cuL`;N&&Z{Fn`f&VXFp9fC*i0$)TX8HoIodaXJlH~p| z`^%#|Phq~bmUlZzGvgT+C0~cKbw#7L|nursAQUdyP zK+G;$Z0lLQ9HuONI;c+3s3oFSJK$LU|2qBuh+MbzX_VjEaky{ti&!pd|U z^_Wzz{R9^zp)$<~Pd@1ELZFM_=jlP)%CdR$ zdc~4QGen70Y5GtCoWut(0v%C^n^9zd)ce8C*At$PS1O1X>3H^^0xV(dST9pMZ^W%# z(nu4w8i>OD7ccjK)AchS>#q{>$d8EJkOdXF(*^xN)23B(EXsMQ#Xk~@n4wB_{Cg{V z|Jr4)JIuL6ya>WC((cs!?7zctKPfR^0{2KIl02vYcHN?A5ibCg*&~PKFfj~+$$6+C zH7dUf{(hVa9-Y#;WmM`((@JH`vL*CaHk}gT56APV^IJX%#6xXp%t%Ah^AWs=9j-F5 z>~<=0N$P|}CVj8xF8*j^du}RgLg*WN@V|sE(BJ0TFBVxn*S*eprWC)#H)*xtFi!7p zYb$lj1!NNadC0XhjHfQ~wJw^rrxkr1TfB+KdGDB(!Cd)c8&aZusgfNTp$m~#LPfOd zsu*9ukR_ZuwEp=lxT~hVrvc`qz27y3q>YQg6*GUPhqH)MD=CU#bQdnaPG4aj$F`a@1p%(V%)Z>=P<^2*q6SbRc)!ZPEDC!F4>FdQltnIC z@@D5>GXg#7`x?sMoOEL61MMj=7t2V_q-XRI8kC}n5Iu}1yZ8}j3?a@}If;k3w(eA9 z=mwOLba2I(aTMM0A(lzH5dkHO6DUMK7^PX~WVllNgb(O;76ByLt!_uGv50Vczbg3- zT<6m%k+o+^;qI|q$$3=UkA)fN9<)J?7?2naxDux>JmZH^WjGoIQT9g;D*o!YNJqdJ z?ir0lCyuq7CR`XkFDuR`tnd=m@Q9XCL!%UwI*OEI?%X-O9jbo~=tpekuIV+fLb*j9 zQ`k|4V>b3!rp{y;k_6CH2r5zfL0Y`KF{e50ztKZcV!ZY43pXcE^Lqovo*U4!y?=gU zmu~Sn#Y^>0d7`IlBB$X$fh}5N1suZ~GL3AC{Uk4?ThAo}ziLk~efvi-@YoCcVRE1u z(;SO)$?ld7O>7%TyT%nsune&c7AlD)yy|X$Ju-$CA9&!h_>vtU6b{gUnj3Uj1ZU#< z)71w|H2ZMl(|x$$gsW0UVDfCLo-*NPbcVo)Lvkg8v~ozvZ{*?N@=&@5w2a<1Ei5}l z59b6bb!+y6byiMlQ_k}`hiWda@cU^ZQVQ1Vr8rtzNlBCEwa(5e_UzA-HOQH&E~=?z zj%a%lZ0^+rmcd7Xgk3wa`;t`_mC`;0i`X1A#XhohR(R8PjKjjgJ>^)y*|ybA58+`! 
z^uz*RM!WqHhDN{rH*~K^%>6EqS|5S$h6(L-S|w()rV*i#6vpj>o2Xljh)w)Mp8TuM z*cvK7`4EW1*S8+TfDm+91?RU|IOk2x$$U4B)TrCdl)UtQpN!tZ+0PIym%*wZPpUV~ zLi8%=S{Pvx8E;GOv$|UkxbK_e^Aj@{Y^||}uHlK!jvUrzhxos#b;l2RxG zU)(}5Q0g9aC8v2=z5Zifna*m65d?|c*$l$Pzg4%=!K|(*tlygoa7yj$4B$$(RI~u< z)r#qhU4HAu|AITMfbAKAzf)(AIvR6y1t0v%Hj(z%9=sXk5N}b1wp9ASk5@W|9)_w6 zm3`BSi0)@h?>Z+zZaQwPg8#ZD?_I}R4S@8mgEYh0V`nq+4&p+Y_e+C@bFK!)qs4L)#T;VuK1^H^O$=VXOa z`AxQfK34HAhP7m_bIL^<*NzKYTV$@ha3P+toYn-lXivT+(9IN(Q0XDeh|cPdcSqCJ2wv9=?`H{Lyr4XFS&}HW+Tyh*I>hpgN~O&8S}nnGcs)) zU3e$SenoagtkZ3q3qw$yQ*?$hIY_UaN=j+rH+mRja1CBDfJ-6GV2~rPNjBh zHRUV9)Q2Jd!xV($lzNeilCQ5@R0e)5L2~=l6;cTv$;ru3^u7hLG96bdEbtz-j zvXPVCq&)%$9?ev>JO*8N^JkEktgW}T>wA*G-Cj9Sn~+WRXK!%uzgTTH5%5F-RZ+*q z24EW#8#`Sr3YM`+1x0c&b0Zcl0M7V7aw@U$Sh>l!g3%B!8P~Qq{A>t%0mWa-Pbxgl zRdYBFp|+?iHBP`qcd~5fMLCS4VHEM2lh91FOz|NGQ{CVRPFGd14yp(nQ2etd-?=;J z0Um`7wQcBd6ZY4Vf{pa9_-1PM(<}jgJb=&vje03bpqej`S+e&=tHuYnby^T_?2^Lq zpp>Kk>VCec*4h&yAA&WFe(Krmcki;brVT#NbZT`z$gby9T6v8L+2#IJ-=BXu^so;K z=4{6zpW~aM4`?UI4kuhIW)0nLRHZK7I_iuuE(9`5o2;Lq2SO;GfU5*1BFyH_@_lN; zrA1kNe35Eva>)@z2fWRk!qDc1xeY5XtLHlQwG279;(YJea$)p?cd zf%jhIj|L48^(U#Z*>!>I<%faqKY{M!&b0^EOTi>sxY@A=A!prwMh$y{>F*D z?8bE+jHU^_8-w6nS=|oBA6u2&12Y0{Ks6)?NpqdIPvx*RY9;1q9B9n)adQ8K;Oe2g zVy$4Qtj@hJebNrgH!i4Jm1qU4oTlXLOv-mFycN!A;eMC?JdIkD|6EMOCd{nbF=R?7 zZfe(SS~qS=PgYuQXG&k3DnuIoS3BNL72eLz*w2v%!&V*(Ig__Rue0UXd)LpS=eIu_ zRb9>RXl(5g%I5Wr`c369EQw1a{7WzhwCyZ=L^GuP*up1(nOKb`fL-p_uT{B@7p5hh?NtgK!@!Xqg=|I!i8s7$?IS*xsl0$#Gaw`p31q$<)&l zGjf_{Hx`S$sJ*92<4|Ivaht=Mma3#cTuPk-n;kcnj4~p9)cNv!M-wmQTc+COSpfgv zP4no|LGc=WyTxX_3aV8SGql;xUcFoickT+I);~-|tt0=UcORFZkE{7pX56-GA3vcn z)%uKyP0nVOo*ey~i;>a6&H>cbCx$MczBL@wlZIdKU#B|d;*au?j+65TdQpBQ57yKh zR9_ujsSf;QcF49?5t3kn#$)*%rAk06IAg3|?AyVbEsz(+JlOLIUMtV?E%-Gt#czwB zR!ra~W%dP9GBY}Sv7xEVT>Z^k#q_}2$C5;sGvbi$i-+IN=1)i*;Ih>k2Y_DNi!*o| zXme)-r)R}Ev6JO$^ypS4s>MH{SIIh_oS=e+QAltamy79Wnv(8alL4u+v4I)yiq#R# z0@d03c7d~fJyL0gkCr06*II7XO0pLK`;S%2GbNWDJ*?&8B%9Km;^x`?=``)&Oe^+1 z;VG;&eA%@N>bVMU9t{1;)7ePMK2TfpgwvDanSTsw164Q)wknp=e~~B7h`UqLz7rb@ zO<#!JWlLR%m$<4s8=$By`P@`rTg%Vm{k9D;RKV!Z6y(iJxg^B&5l)IMViPZ#H?bI@ z2ekb+8mBNaC!Hyw${k|{VH44vZ(0^EZ~VFBlW%lZBZ#QR13$Qq;C3M%udtW;0G`Vc z*I43Rdsv*)vU2(96e1f%qQj1vNM<{xE(SGZ^q>E9pXIvl==pvq``2N?ZolY*4aZTUE{MvR3;>RYU_a8#TrP0BkESL5<`lC`_Hy%i;wo(WYIMLJbcT)9<=oldSvD(e1p`}BqPbRt}L&bF-SD}&O{jWg|o?7(lu#u{e@*=gS zN@Wop`SP`GeLBd)wm@doCKyXioe(hn>2ar#A3qo#;w7;@`XX2!A3gc_tN8dUfbZL~ zwl%FbU;SC{fhwSz>SXxZ_kpLE4#RVOuk5+(QIk!(cEvTpRMNAv>H@uw&M-OJM(;pl zCQC6c0cmvPM*LF#I(!dEXgpYijdXPt2DcnvcJmGSgwC`%2l! 
zGC}s&^_P!y^Ly2vadwbH%zT@xDkbOF+-56G>#@kpN6p1cf-oB7Z@`0>;bX886UzN0 z5yi&er7afGdTAcqT^TR2tNiay(Zc*PYGtYg=>vPY^mQ-`kI3yGmMeaCDBCBpKei~sXG8h#UiMU-VUZdv1B;H`x~XdOgWbsK&q^N& zF4zT(PJC0xgK00Hz1pZVk4$)Te{tfoF`oq@=u&nEc?UeTrV@Z>Gx@PE=l~p4Kw9SOVVKjE0Ky~JI-?h*-cjKJZ>L8 zf9h60pPtQg@pH^ENBwz%wD>(f773pLguZTfAIGL{cVF-C^TXggH~gPFaf_vQz2?53 zFIP`rD0;rP;k&;-uh090bpF0xi8(pBKR<|ek%W|UgM)>QlUEOK%k$^fWp@vIcW0}m z^tpsN#@tF$+MGNd?+=T?gS%#!;-~*DLN4b|AbOkozxCTUMDG&hnueFs>l5JX5#$1p z7OLq#bbcPbSYKMH)PmW+KMT&|5`TDq8Vmd{T<86{L)Pu0aL5TGt@?c^tkssCpsVcl z&&(J^Mn-mlcRLyH;B!mG9D#Od(vmOZga$w$lUpd1W@C1DcIgn!=JqFTVaKj8=0#<7_&*&NY4~Z> z^83iS5^sL=O*x;(RSBXl1Rt<%b44(R9d>WzSZpxf%D98(LeYO` zda1Ei#%;2OfS|p)x>Z1aa!ONBel&~`59qn+s$ZJE+#ulCLo^Hv7Z-{))`-TGVvzTt zl|(KLm|?FZCx?oHi&}lA8kFz`xq5<{2I?uq+oOI=P_EDu1+&2ajQ4YYK!UVgikT7`vTVl49^`)kWolJwaf}DVxkzqM3 zg$cv7jtM!MHn>eOjtiQN`2|5)*j4TS@Nw{D>gMS6wplotz_(Bnn`kaQc6i8 zG2}3s0i(Np)JHG;R6APkz|%f=X<4 zx|mOODVnip3QFx83y|;$sf07da`m-Pd{KJR@dD-PA3cMVSOh$qR7jQ^**e~xzP7G^ zHnibJexwH))xaL71Mb5b!L-1&ML-6KXi}yd(vure#Evfbu&tjNpYW`akOFf5e1Uat z0>ub%Z0w*=@iAWClGP2b>jMFsE{wx75EibUY+bp7$0g@mG=d2#;G&(oauvjuT&i-) z>jy3HiYzr#1nA$Ls?t&(PBpB=-PX zI((lm_KrN=obE0t-(!2GV)gUsGIhItSSSlair`a~%Qk-(3lIqdeU0{H%BhMNiuy?c z?6f$!In32SaL4g4js@kF(F9}qjr($i%4)hnFG9)VfratFfyBopn@jML4GgpGosnD{>Nb+9pUbvcAoB^t7FABwcSEQ&hC>aKhIh|%E1eAe zWhQYj({4&nz@gZkajIhe2sa9^{abkI{PGtYV3;}^3D#1WM9Pg+J49HzFNC0;(QFq3znCATg8MJ&S`!l!A zB?!DhB+-YAUC{91WcI1%1N~Kvz2`n^X1yRqoW&N_-6`B$ATVk3390@8$v-2{a4wa-N6OSw_xVJXCyrZ^C1N8z(-O>?fU#;wIOw4K!mLg+ zo@)+GC&$yhIvrhXAPz|i;_zM<{wErE1uu3~IA}^bR!piq=CxvmGeMZyiung?^ zo<*sIpOEzeH9IIwTs@^^p{2JZRA=K-;=8 zm@s{bLt&f>l(5`hHciQu=MKt`J=13`Xa3*I5)@q*oy(jD#b&7}rD;jdfdI1L7D4t?2}J9m9pOfH@qqWkd*KM}pJ<&7tmb@OgRO63M#-O!m1Cz+ zx(1&2Xw#H-)6_ZGqnI(6^iAa9#0@)vacT?N#edw@S(A|NI0imBIpkrZ{@XdRuX5z9 z8~mP{{V*D!oyckS#!b4~%By9nVF?xg0^1RDMa z=|-x|0Av0ajod|w;{iw!^a>f)VC&IaV-&p(RuJfCwgeK&Fu=X3R6lT)P-j7+zoUel zGxjv5n}lQZF`I&8z#}A#QZLv-WQdqlqpdMdX&gXA^+NY(QGk7?-OKs*_2={Xyt43C zFD3fc2t1p@w@QgnvnBwoM1UrBU?OF}Xna&OPZvf=)SApA<0aM z_76&l(>X2~JPFfM<)`Vs1Z?#4wW{F|X9}mbY^-$5_SllmcO1Z2qH!R=a*Zb-F>vBa zt&yc1{e2$(#f%VJv))60Wf{cxA-IQ3gi)S}wAv`2l2mX&+1JK%&L?2*^DZzBtPJa< zBT&-vW{lmiIK^4K0hrmfpCj>8=FO*E8kr1Q4Qbk0W69aIRCz)W(Pf*V$w--PiI85A3}8Ec&^b} zF1qshS-^!}V^i#iXns53E=(J+zMq<-(E^^hPz>3jyf&SwajRDsdH4nz1-ji~C~*5R zBBL{tS8!hg50YN6oNqdhyDh8g;Y>JU;oy*}3wr+vl=$~<`Uv5L^glXc>dr2i{>Gk> ziEg19o&yICTXfxyJ(d_VQjC#yjx!si&YQjfgs4|~b&?8=@akwXS?rDsfg*?z%$PLF z7tiEFk?yDy3-_&XxZ-Y{f=(#)4p1244~tuW1uwBZQ__&#}h<41(bV+@$Z-845xB6dYrE)6A@W!6vC!K}qYL|3QBEXlj8B&*- zOA3!RhSQ8cl{TEp)sx{$Cm<#=Q63WopM@skE-nX|^Dv?A@1GdO2lBI{&73_>auxvj z=pq8lzRv_Mp7cRs2!W|ZtP1y{CvZaCAFd?jQ0aJklgyarxAY-%40yZ<D8xrSqx!$D^d09x%re6VYHK4l^y28_z<`rCQZDl>TndyYBvZQ!^ z;uEP)yD{RG(MOGkg=w?KkdZxJjv_w^a<36DMAK_USDaF4i{c-mT@Y?giDLzIe`M6< zw+Yx#J&*sJ@&);$y*v*31ld^-h|G*y_oj(T;lV0U)`>$Z#fY8#)5qZ07K6y*!P zM;rHs(YfY}K(4G@7Xk}jU!52kFJVXzUtij(nyinQSW0K{Ry=?e38&Q=TGdoXEZ`K?Y1SA{R1v&~dR_j4T-f1Xv16XrDu{Ri z?>3d_wm%KS`(upeK1xu<32t!iv;|GcuNFwwN?E)&Q_h^v zV zF_KXz2ZKuI$8)Bj;sE!}*kmd8p?~&dM@g#oe<=TvxY864usFle8uhUZRsPu4(!<>z z{0^_y^ueOQ{*$py;SYRC*~xB^6LVy;Mx^Gn9uP<`cG;2|zC-$1(PVT0fO2{=67$#< z)C@_KH|!U$5F4zZ2ud+3P8d!ith4 zqNXIvLa3{6AG*;G+P-HE6G)k^cX@PSyW>8Z@0=7y?-$%8<|sYUBprA?$8(uXR)C0d z_s)axI8vyCt_#EhM#dF-koF{%G6S`&jkZDTaxJ#y2wN$ON5IFe*mpt2s!m%r+K5j^ zE!Z8%=K1eKibgi!SiIn>!Q9v}L!bY>PEqX2?Ok;nsYN5BXoz|IJgkbMlhST~L_GUi zcER1qph{>gV50-Ai-G*?uzsB{2;73^!ws+A;x2L$FKE?1jf4090bW3%zlnQ=|6(bPzya*SMt%SA!A6Fq@FiQOqLI5^Ct0*pH z=uU=>=pS%2vE}yhv|!1GV)miKRQf?P62RX}E))bzi@~FFW&!|T(*y-bq`~l|7r%y6 
zaOcmzIRCJzL$K%=^oKj1NmcxiLlhv+w#L$9IQ+}wlNU$lB({-1>nVOkK^nlHUt9vC z!+J7CAkyw)T zqUjZVJNF|R)i|pE5G$}FQde-I;JPcQsc&>Qs zLK|9DO-T+A=LXx3WZo1Af&ivhw7xlsQ_PWEv zHQXms4-S)Yc&tLRWD(7}IZpv+YM(xR#!H{K&!#YBqI|@EtAJqm-w(d$F%Een@O@UN z=TD5}@a2-?EWY6#N~RxlI(#necVX`ig4*L0)BWVC-w#ALWo?5#4y2M}>!kG!@fvLd z1IPxUKKnjLQ#Nw(dciS{$Uf>7={Q}njnlGO&hvYEtu+6fpCWByQ!nSEbH8*6g>=JR2B1ks41X3)ZH7p==GbVfF*LWho zVC~q@3;502B(>DI0RqxOAH>#AKC7xea!kkm@ai5Oj_{lF`6*7s)moxP@|vE?I-t=o zC}8kPzM9+sB5+;}cSk}4h6uE6tC-^sTya6N5E2|$6Si9YhyykF#|gX~eA4%aLJ2}+ zGzciK*%Er}DcBlx#;u|hj10*zCw^ciz)%xR0792I-4$CJ1aZ{^-FCX*c-WiAL3oK$ zXxW$|<~X6vOHztu01Hw_%XwZKYs?ryjE&h%O{qaIOd~x<*ukl5ta=qnUp70%fEH&z z15#jp2u%uoS#WLk$6p`4Jd2NM=$~8c^wZxQ9d=kF^$##e{q_gm@eTcpt*t_^!>t9C zzHHeY{2z}y8&b^U0Y(-4V-4%!%Y%!HokkTJ{jVEIvCBd1#t_$g_!<#&2O0oK*J6Kc zr-4{%Xbdb>V%-LsVc|%(01fA{ZSB(p7&AJsyPh*K^49hw&{O=~)WdYs*6Qu5`gPQ6 z+g9cC;WixZK1NPkLqp>3kcqcNCU$^qd^u3j^#>i9f*BnM`kti<^(EWD28W(4BeDya zO6x25nQ{6e!-pa6R2RVPc4M2c**%+Ota1O@m`1*ev zS%)ng^Ca*Xou2<8V6fZEKR3W^=-BW(6T!n|Et*t)efU zz~1oZTOK~ly73q^{N_7WqH4n%>{*A_q+xa7SwilpDV$YXk?@6$7z>AL$`rU|L^)%y zg<=xFPG-X;$!KVv;lG?J`o$V zQm(R$GJt8E{Rk!I0(}IZVE7w+d?JBxwr3dB1~~XAK|O#oOfA^3x~Wa&Qq!Ch=I9q- zEZ{YN>DjmK0h$t-sYu36W$3&Gfb=lK(AA92U~Gj90h|wpOm(P6sBkFUKA98y4}?6M z>Kr<~kZGiod3)_+o!aw#3c+H#8w7g>Fal(d+XV{sI|NX#7gSek5fE_Lt|XFXKf$j9 zhdgYIB&hX$-CW2dfsa9B41jDK>wzO<2#R(Qn$W@rtU@_0VZt-xLyd4CB0jycb+J-R zqxG(UCAC>?U$27wBNwa$+2V&o);5ZAM<8+2WBJlB>1ccluorkRdYe24h~W6MTX zddSj0Tjgq6JuBz1q3_u;$ELrH^U9atvg zJmVxm3}pZGs}EQ=9ules$eR@P`i0jJeG%gGiRKJ9FpruR4#VRP ziPzL~NeopsoGvtOOR0+w?h<{Bkz^(&r83FPxCqRVa~i2O+XbFn>#tC|qeIPA^=`f7 z=&**4BHtThtAGT*r39c}c|&9bR}DrV!c^O1#RAd*qdve?m00VZnkBEL{;ij|vamgP ze^a8nOsv-!+~G&8iHq``mWeO}$26Txp20-pY{a9gt1>zcty5*Q&_VB=xwWv`qM?{0k4 zz>Q&5!vnAkAIuO4#HZd&qo`yENcy;H?8voYUh5CPkn$@iq(8z! z%1Nl0{`3p#!h&M@!!M+iiNN$%0IP$puujLT0Dv>8SjqiqgvTHO>AzHl^qc-_#KVwST4=ti3tm@5@r2jU^)wsVsFZsWbr1`bVXm&#O93hCnOU8ymiX`Tg ziIr-IsscuC0=W7b)yzZ>xafK=N z2h_#G9I~Y?F^-qF!s0{rc!Z*R_O}~XJuw@;g=Q2IVpJA&cN(ImWl2AH&*Zfd(vfYG zN2_)3RBW)3qLGvdE~s$S#nY?FxkpGJGEgIe&yg_LL?;8_g)-H(`a{#DJ(i6~GxEaX zTu9IyZ3e6ST>tdbeYGW`7(bAl`#V2yM^Tx3~9niH{Z2+HK0=)i0(LV1P%=#5utG1knEe zv(5^VA+sH-wv9}*&a$n=zEra}#-f=+zwqNKsAfAIIr3htA=S{90T_uy0-v^CXzkSm z!2Bh6s}-Sy{@9_{5}AQd-GR)FH|3@TyW*2)QH`i!*Do@_TEiU&6WqK7a#JsIh~9=L z6XF5E({?2UdSHFmWgQ~7Dd~@tjeM`sq*8;C&T&raB3V6%Et8^yJX#?l(+EWd$Mvla z>|9TN<1jvH@)_G&6c&p+6MN0rlW9T8ZxovDa1=@OM2~>0Wsi(nHg~@(m~on6JkW9N zCke%u#+g(QMq)!D^PuX;>$1z{*ABAF3>Yp0TU#G>O3>{sbsGQ3 zdk%HzbKvgx;1~+G2Z zzNFT+0g*&uZeNFipwupoX}x%i;!>M`ua98~NHdt|HT11fDkAV1UWt>?am*PI>1W~+oa{(^V3YG$Q0V^y=s09K_K z!1Yi4F{ZR{LFir;;qu>UU8d1^4IB#e`YYF7}#4nL?vt;A=lIbKPo3_ocL3bsb3wq`{&$iZ_8MPq56 zh6?GZM!zW3wZ}@Oupba$JE$X&q+gUNQshC9~F45r56 zsBN$zPfhoL$3#?<; zuYD!g1XBm~k^#J8i9R%L-Bs*051MboCmK1}WX@Z&3DhQ-9zeIH zgwTm16I0BPhv|CF$)Q`6Mveh*Dtpe)qlz(4K9-Wn{|fk}bJ|%eY&gcH_!-)?kNvIQ zLLPNVHW`jgyj91Bm52|v&aDe*8>gmAwXGRQyQCo6x&=CBv=y`mTm$Tsf>~@z0sqK< zD{L#Z2Y-6k@WM8-}@Ir;4eH&}yY8 zyA|=rE|YK2q*+z5b*rcrbzlR=_8isu`RRGD*Fn^}?{?C4EWn1qV&iUmMQzNRUMbi< z$z)22n#4E6i;+OddbIELH-or=3pn(tEoxM+0@I^9Bv@LrogBhxcOjAp!KozijYtv3 zJ#7t>SY|;N99)4L9b@k3cK;O~xxZE!LG#C`R2w&WFrP=3PMjRLpp0+MQQ?+p@SXKN0*omFM=BcwBQ;{k%##o+0PDhzixO1GOf z+7ekEG0hz{*X@pq$gVAM4@Tbgws~`eDTkRDM#6z!jHFR&WKL7{v4ovmB>EJHaBO3GFW!VSWs zy~=HRq+l^=DE!}VkE<&j{ul^+C;1OLI4|K2eOyy~vV-@%ed;lCEohp(vTw^!2~ z5@LDNp{8`D3{`I=AIn@(3RixDt7Ghf zb5(+~q%)yCGl~ zbV8)A3K;XkRm< z_9){o7Kl%lv>JaO?aJ|eI2Hp_PpyACs<)&7#(ZyQdI*^#!iPbQHdEHDDDmX;(=Sdk zPOjmnu7H4dI-c8KuVznv(ic9uGv)2J|8{vk;*53Q&_WGgOUEn}@8qE%j9oHX7^b_) z+zrVjcFkw8gNgg=Fl2*Kw%w5JWD|0L{X5yOzjDK8w6tS#IrMwC*Kt)=yh`;*c3nzX 
zGb`^izEaJkJuOXoQa(bDNnKzwyT^!6JgV&jV=Fv`Ci(l`!0vh5*S}yLF6jpeFfvF; z{^B(9ZCH`O{{ zHcn8yg;`$VGT|($J}ha($TBL3x##+DPSVc))g>5n&|qg$eeN*vGNiPcHF6tpXB_*( zmK)cl#g9U@Je6M&-7@el$~`|iJUagTNW(6bEw{v6n$y|np`$X%piGO@+`BRK8;_C*`h`ux!W&p?*e*YfZ@W>< z<&h5tU4y3bE?4&WwdL3&?D`!rv_KF@ZOToniFa?|uS{z?dsrrpr~W4L7xPk5fU?-Z z!p1QR*xx<7e_p)*Zn2ucj%t(CDiqFOs)?P88X1wf*275rRLYUX<4*Vy)EoNRk@;`~ zA+Q!bGIxT!$Z& z>9}^o1!$=Rz-}QU1rRiBi-3c1k;yM?zopHv=g>v1?o=JMAv4?xt})S8=0RM=8RQ4n zxB|K19lhP|7i?<*zVG5vY^@g3+`yC?Y2RJ1+||0APhHCUgh?-2Njkr7o;BqA>J-v4 zLp(mx^Kc1vr1ij5t_5EED~>{ z0r%Q+da6UtvSm2hvC=_Dz1>@!*47U8ZT35VdxM6AthX(0!)C!}x1Sl5yWCzM?rT8i zM_0eh1iFO8o$7UF8Du}e=fx^!WW}$aG!LuC{ne0KDOy;F#*`|cp!eN^-tpMV0{|H{`ZIa9c zOaQ)?pgR&W2n$)ybH`G=qtb$Z?~40YYfsjZ_d4JNJJdv7*;ebAlU^uVZNG@MGh>n0 z4lTzn|8eX??(ic|Ci`tjjbKIW2R(^I(yk=tC?$5i8%!n$+qjQ6WHQHjanhK>Y?`Zw zl@|!l3N+Eqw%dCebkhOvUl7rLemJcDf2CP#d~B^5=L7TkdWIuUd7&p{c{CD9KQzXl z$&lm=o0|=xuv`H!4n6G>`CeDv@SBH29COws$&VXON==#lAO8 z4idb%k}a4_=hzinVFn+Vncp=hXi`Dc1=B|XZ#oS^qoHy}t zdAmn!&lrSTREs%bP&WK-cRfL_#e=(gGuf-?@av1~ijrkaP?s$ES`0{odo|}P&?yhJmt2xZI!N8&&%zt* z*V|;@zdShj!!o?cBc>`3;tPVk-ZK@HdNFgv9Acb$E{84SG!I45450k&g^X+UDA$WtsN1xjdoL4pnwo6#iLJQZK7U zWAOM*${iZil%MMU>4{F*Td~br;ZQr!^}tzBVFri*E2CEr?3iN zeR^bSEba@0%uez)BMetgog(v>L!t2&dLF8Xn@e$YywujP1)YKn^X4 z^ujeDHY?0kX)Y<198ZFpMl7TrSmOkxum(EMO7}>!362y%zCb!hzfR4WocQlGht7>$ za3%o4sp)uaN^+v|Wy}@dfGWUjl5|!VP4r=KXlwlE_qSs&ZE9|GZipdCuTyK*8O!4o zYNUF_K2NS*qts{BC!$dgQj76aHa@<*qdt z80iKcd`fv9hV5|K&^v24{BgpJS6|jE?a)jz8klQKBR+Q}W z4`)YRnM$^5=r|s7cdZ-J=*3~%|GN0N*WmX5H7Lsm;6FR8FwJ>ut{MC!Cw!9ALB9N2 zyr7gVk3i9HHbs#YuJ|%XaD31`$9>}SvbZy+I^V;0hW?9R{35B6eR`5gsNOL7rdXYz zrd3zut*uqVY*e>WRn1CkxaclDr*}S}@Gar$UQ1$;EK!?r1K}i3!MgCPq<95^&1ff=RZns)<9`(0kIO**}%BG|G}TB$C*L0 zyrmQ{^o+lo8t$`!gEYw3K-Q_u73+h~`a@Gq^SST4U6pc%#so(kBO!%}r6jUp(g`l@ zKhkv1yL8Z?UND`6;&A4FoMU z4RjLOzs`CPY5$Rx0NObGEt~*vqZf}0O82^zRPCQD)OH?0R}4sd2l{@P(k|uz*8uR@ z?_icLVa)d4wxjtF)pXkDizo-n){|#sqNGw8B2E9R?fV(x>p?t*r0{& z;%fV-?%I#wvfYJxsc_s~nEjD_#q=x5cQ#6p;q+~670l>w_P$P`b1-}qsk#kplj0} z)mZfr-LNp~yosF;aPK|hLV5{e!jBKX58e!!BTTl*s`{GlSl}!HkkN*(?8|9%03-MX za_8!xO_>ESgkCOyR!dJYR680{4TKuUE?1^J*j+$1N z(9~c!Cc4!q7Xw90I^djICQ^oOkFLQ-@uZr~43V>!Z&BiQiqQDJfNI5ePU73yx{+J$ zA8^;C&o4z~Qg*P<(bE2oMr#d&7r(sE{3|K{0ABqQ@V}q_{tqqpQ__Zz83riCNi9g5 z#=B*n>1ELJWXpYof_~6KP@uIprl>P9kIXfHu<))g9Y+@=j*YJDqTwf@Pu`8@B!El=NgkH<;GANR z=Uv%2bJ^hSN_~W;9)M^Zx+inb*ePn1NShUL0H1w<%oJUyPT2(O^1ZQ9Yvux8X-XO& zcoXH0S8x*SN8iJ#t6IMX3(x%#1FwSHf6Z(faN>;-w~X1EcQW+wrU2AH*ff{BlDs4LA9|ytlh}PJ zta}r+mVdy-ur5J7pc$(1pc`oIpzPQiTmB)vY2AUk+KVUDL!cEF?aI-V$61vv#vwH+ z;twH99SuERrrq{8`5mXxtD_Uy5!%x>^qZ6~A9{Grs{eKna+nS-PjEWkMRwe1jNeXloW~A(?*EM8g}f|jzj`C2QhS_3X~gSdJ7u+*qxJ2; zH9kCmUSp{3D{HU?q+@_>CnIBz?IOY8{{L;FX2*Di#;~B7ZY0gjg=of97up~P|XL(1u+dL8c-={DE2?v5X1PE0WRPJ&s{kQ}uaTB+-NRwbY zg$nWR%(XjvIdRGd5&{8@?cLdHc6N5ozMFl#^8I|6*HF?As@U$F*&nv?CsrW#WQsSB zi!YzdzIyLdkj{Y)JzSEgSzVNP3nukUVO;yJO@zB*o2%Q}6L^>?DN;oxkz4FcQ$wk! 
zdw8aB3J(>ApahOAe5~tqgHIF}mJqvi!9J=Uy+2ZG z=ok0SC{=a&c)kyZ54xMv=Y;I%U~?w>#pQ=nnB${c1T;J|Nj?`^d-LvIg;aK75ZBYs#RhT${GZGc0t!gd*cJe ze@BLSTwAGf*f9^7wws7HJEghgButWUM_^!RS?ZpZCx~$;s_XD-7YP;Wj_Q;vR0%%V zy17J%gpshG{f@~n$Sh#4O)(czIGpi5vK-^Up2z@?y@>C-G#}yzPY&{{E~gz~?7)8X zB+TexYFR=}4M2jd{KvJ>$T2J&Qi|v*UfVkgVwNa6+caIZDn(3lL&%?(pCm1!>)A z`d~9-T?(3tb=gwA25Jb)CkDwaFkD)oTDKriRZ)(KS}lcvze8ap?R=`BN=X#Pf~j-I zZDYJMWa_0 z6t6eUI3n>hai`f=Ted={M-dlWzO~3cFr!*msU%Ms$XS2IKq#E=12!8v*LU@L(GdW$ zJfj|~5d<{k!^lZzxTbNF6*3(ST&q-g^)i)!{yegb%5rLk)tIF){c4MB%Z-P5FNREpLf+`*PL+^&{)0{ zevx)O==@e`8xtQ{9eLr+D$ zN?4GxCH$qTwjtbv;a-}0%$%*Vhh{(}mFQ{>6tL){O`_gZAY=k_PhSkH`KYp-9E-C$ z%p%IRqjPoEkEI;R;htu-5|jiUh@*+y8O^AV26G)bUL!uPc5Qvnfe((Gy&~x7;TmqQ zxirqR-)cW$i}1FeJrn{G%9v>?pORS)HBqDhstwCN0KI!nZg|J;OZ12sumUk`sngFu zI_Y30uDr$HH(!*$RIOS%$|#|-dTNr|g6n!ZWl{r&w!8_>ubT?QZ@#CZ*HD`HyNzi`7R3OJPs;0$ z*0~c0}+7jA70PKr-Hl6=5lfV>jSY)gSSt~R^Q#%0-{ZJW$pG%MKr zsprDUQYxvaF_2IiApt40DW3rl+HC{DF5}i@oTnM;9bb|sYqES@ui%-cS9}NhB~~L4 zEd2DsN1M$EaO1&|Hhl2_FDLlR`TJhUaV;VeepNPHq9BwoJwjwIyTvW^tGp=@JE&JE zr)w%$_M3GH?djU{zT0pmGR(m8R(@WZtwFlY5kGthAM>AH7{#-s>rZSh;eY~dYn%*M zHm_RHNPI|M7cx8YLYB!O6)Y1r!lyk!qpMxzPudU@jG1q}b&vmbw+ z_~&w6%{FUb@oVF?VclnDZMOsp68l_uCE#);^lDiG?OsDrnVI>{bc?AVcLje22e@c4 zfr&<)A~JTfn`X1yrkKdjX(j>>FXq6R6f*=uca8WYp_op>i>h)Bp0wI%g!D7Wsu9cN zymwtcYL}K?Rv>S@h8~{Ib;S>o*O6kgd-B3UZ8*qVSrI@-CaV z4{fo9lfC|((hW1a0O*SkY#87-9jG(`-J?MhHrZk8BTdnsRR{suO?nD5u9~On_pT|8 z*A9Z)mM#8eSb3-|(&mNFx~iP@+-bNbTX3!ihqp6{v!9zz&5Lo+Uzc`-_a^R~Ah+yF zJ|wUfF?4QIgxRu)wlu;e_5(is0ySfN3rW=}0&b9fvtKkJDwXh$>Wga*(U=Q`@NNiF zBdY_4!fewP%j$j&$77+MPUHi*MR9-&19=&YE7vZXdP`BYN;pL;I>;NMOz>1>-{HKV z9)&P*Rj(_?_^$T#;j)`rlkhdVDu$H#sZR*N{BKQ2bSM$3D*r)_r#m2R66V2hGF~$Z zWr-&X6%nho>K@37n)Ui99K}f?oQ54}Z%XBw?snm{v{}xRJ&|SFd;Ug8_Bgw%Z*Gar zg^`F;!Rmppv9gN-Q)4Q-Ygu|h=O~zp-!Zpbqqv!Zwy-D_*Pl`$1xpc)0@Zuw)LNA& z&16O_N%>|>LInjUM2poD@HqfKgeE9NJy?z-hgtmEF1ji$e!hlx>BR$JP=0w7d z^fDvLwA%estLdQOLEP?G9H9EL;9@#NhYjB!RyMc2Vd3FP&?pH`9HEh2_^oWjDO397 zdp6Ss3J=5gyJq#$`!8Q!d<^!<=#%+;$6R)^+ja`)LT(fuIwR?BC&@doN8)i_0Et$> zwNLo6NSCv#N)d~=op$5zW*bQbCup;~xh+m&Q-ktR?3>$+I?9JG@^g`nrCEP*Guq3J zV-d@TJ50E8;1l=Q^d>W6yo2XS#CHeJ9<&rY9=~##bNg!`f8O_D9Qhucc`+jr&N#96 zD_v=5#>ajBAAlUsW|(3BJ&1^sU4H%H`xhTwyf0BI0z<|qPz7A`{UUO%wOcq3Y6(vY z&x?w@huJC^3p6h_IW^%84NP=)W6?peH)vk?2ReHse5JL^#QhjR`g_LbshAZ~! 
za(EpXJAf@W!>j5s5dkC{xh;PF!3R@_5ObvUc|e4zU&Kq(FK1C$=Fk9KSP~(U=Hw@~ zn6`AkknZngy@w$&C~5IH)A)bUjXyHs?-%eN=LYjHyN2)88TfNX+3krp6O`;#7CbGN z-J?9o%|4O8vdM_L1yhXg4%^#PF~1^%9$>3)N(M^Bc7^(kuKImQ^G@l30f&@yVHdxv zN|-fN3aHBa^9^8Z-CC5d+YK_A88DL+jtyf1LUj%(sNnc~5-WU-k9db=mK7Npiy@=h z1+=VDXh_0{JQrT|SH$f4nY$k3Hh^$|A7KnzR^^IKxC?)bD;&XDHpP*In%%mc8D$d` zAuc#BAQm1ow^ZQmqnhi;c_%v2JbjFc18D!*02&68kUJt$aFkGl#vRzk6fVsuuV-Y9 zdM4%$1xU>iok*|mqTxib88dmpzy4?ir5K8W<3G7HGcm6(zk2^O2owgEm0!5i+Rf-F zok^Q{40+sdArTiY;9{}NdxsH53Vi(~aNrg`q2AuVImur0-@NWsz$wbkyk>O{@fV7N zlkOsD&ReY!FLaM1G@jJ{(%5TL^wpnxjXqH39awDKwKq_4bS|bdY%bH-xZj&Zu{{MI95MFBC|P9clT*o0&2o$Zr^^@!cSIdm}LVHf%JV7W==$LIK>e7LHFUn189 zl%>(GK&PMGHrs9A^>G!;a$E-bkyEBnF<6H!;z9pFQV7(o?(q;(!EE>dGHdwtG#GJi z^85`8y_`(fvr;jsMSyGZ9P1Z zMAU{)dM?VXMA%dn_w?ETupSv1-W#7tJ5|kM=s*GmHvN|Jh0C`K3JFR{; zq@QU^Y2V0b;gb*8(k_cAfOvnSCJxY=$Ej>IQk?5VHdqVOpt`DEkc^Z|@lR|@s)f{B&@_3XNVtE*Rlkjll9xS*uo^(inZ zr{u9&&`tN4oakfS_CTqAuNFIO3zvq19ng;W%?Rr{YK4=N_JC+AjEq#mofSvQI{V+M zbYTW;V>T=(S}F9MF3Yrf*i>uG#R;M=vG&<1()E z&ErS|RsFm4!QY^f3r0-*3zTFvc|e(JilT8kc)fbXSDYAjoca{b*cPN9gZHf}Zlgnv z1#e!Kfw@nx)~sLThM98b0CLv zVf#8BsGc#`4f^YU;^D+skw+>LZQ|v-%{_(50WBRUD#~BbK1UZKbcADbh1J8;QAj>k zBbn^HgoL~m!iv4Us$>vt_0+;sABfW(57;mN2&uC}m-^+B{b#zOWyaCcx{cYQz(}+h z&C=1ia)eAJJUu5hpR06A==(TwhbQ4u3@1SWoYN@Q`tkH7qZQGAP`5v;{}zP9uC7y_ zgr)5-X0EZ83|S2?URCK`Qi3NnC(_^XIKj=(H_cs|Z5oQ9!fy+7a@gi$6Gvruws!z5 zt!r)~w89}H5kF4K=^n;mCMcjX3cwqV-7H;{yGly_5Ys0*RscfoLocc|E#PR^UBC3! zC^1jfDm9>w&!_J}k|h|;=a?gE@Ys@wk}SEbR=K6gKk|rw9zM*zn}3ijYWm@mZ>C?) zzJT8_oFTKJckuMJnIr3bv;JjGw=TutQQ_le09{Gwch&PvWsDQa2+;VtTYK886>1_D zH_XdYc(q;znga+Uz2&jPF^?#J)qqpsx_DX>`r&l?)pYQ5icCeCh{d8x?ue2dA?ts_ zu!syM@6D*2a-|rf7)EKx?`YyX&JT?Y@T`;QW{r!!a)Kci#Sz0H(aS$Q7*8nkBB;OT9=+AW8|3AaWk!a~amj!F!W7 z8C{f=eD!{h0Z1@ZE<#7AmNJJ=nx*tVY2)%G;L?JIPEYoV34Q6@$q2gLzZ@bXO0@t9Ti$_K%1OD#|z#d1$Cy@fjy_oNY(j zO2v&FJwWzLs-2WXk7FopQEs=mh3Sb4{EhB&Zv?cFIfMxnQfUo?BJ+&%C$t&Tgu2o+ zb!v#*6CvQ7syy)VDXWd$5D@6sO=%fS5K8vXxGom}B7||vm}tVJ3LI42FF8NnoaLaU zxYF|{17)sl&Dmy#J0$@m*#$gs`);$G{iDae>9O{^T56A5H`{m>&!Rutc6S8_q`-wM z|Dsb!r?%5XX@JViC)pgiX!AVA7d8V0%@4m)nHW42 z;QewUxg-xa5ykPsw?F>$U#~&k%vaJ^bi3mSVgB>$;Pc1M=T3=t=(Zw_xXU2XW zpiguXu=~ypkmC2^`|QQ=;;$=IPRy*_H zLWfO0n;p|vfeH0=0>cf#3mxHqc>UAa@6aWAK2mqKQL`mc-L8{0P z006i$000mG003}#G-@w!a&L5RV{dFOaCx;FYjYYm@caD=m41Pm-o$C9?a;b27+)Mx z3>0uuPm(#rAD`T}iE0%UWy&+a+-pdvU?x9-h6_Ol%8y`YD6|ILpMx z?E4i5JPijHJVU71Y<$lbQN}iL!fr(*5*~cN+b6p?6;>#+ZMi-|@uft3X)znRc1TJR9vWNCeM-GLF~;M4dkCvZqf@pIBCp?}1g)W9K{p zO#%@f*aZjoa6h#$Vm3(}GS}Vgvwb35m-*o?PBONSd@o)LH}ErbB#7dxG!FKea3zK% zDbW)@K2uaM9WVn=*6!-ZXCgud-O|`v;FuzX4(_Y9=1>7B;y|=wM-0>_&Xj=^a z1u?`AiIukN=BmQrArm}F_~B_6QSlp*y5Nk$pBf84b_cLd^n(H$vMh$c=&C=75PoDa zes?U(df9)~cdz*FIfea?&Tskv70_QLr7C5CX3ufcYobkX=CM~N!oAqz>Kn8TqLVs>So#EVk2QBNWKh>YSWsR># zMrzcurv1V1{YxWZ2hJJw=VsC#uy7VHCKq}n1TGWz%DFms$oFx}dNmmu_#K#O;mrDr ziLuiyNV)+o z;5WboxXib(SiOCg<^(QIPU7S&PFQim8j+@4LNi8C_IUos*o82g(XvBD-NEGQst>=X zau9Vtft3&y%#-7kSZgRMJ&3~)wyD$_^hcu{T}Pyc+@24e(M1ik0T8Yu2%GBxn7W4G zBf|W}_2OdkdhDtdQ^QIDi(;k*2>^m2IFbvs22T(U6Pe8FTtkGta7NP_WD5`(!!-~` z5Uj^nL+5o3_#T1Ip95#wsLT$*7sI+0)(8L_IsK~|Yz{E=K_53|mhM-ca)wgy5?0py z`n*PWYvwRNf|3?cM#F*Ah!hy4kuz>Y5e0L>92U)tJT;M-*aX7M{5UGkhMY)~xgP&M zp1f|BvCXl)jxX047~z%k5UWI~I)Dm##k~(K-9haLm3cY2 znKERj5}8{|T9ttlMtCxT=#2LWGKHBFQW}wJl(g41)$APT05rmF{RrYMP4dD|ivFP8 zl|pcSEU*cMiiKvJFvpSQT4NaJC@K#$;oQ+gC)bPVH7#k_8x~ZFe4;8^S2#V^4tTg) zbM~>z4m~!GBhitSwyM!8Xm>o=3q+NF*#Y!oNxQ?&u+U2OqNS*8y;GDRQMaX=wr$(0 zv~AnA?MmCWZQHhO+h%3w=^m%g`0wo)@emL3xc1z!=3X(s`U5UeNgKgTUM?t|WzSKH zhku3AMNrd)!tw^GkY&m|5pM~9$a3k@V5$4-Wap}zVd3p+c!tOw_XbhwLkh2v9heN&3 
z`HMGC;x1=AW`^kY@G(RHp@eXOVC?>?m|Q>0fSxAW+mubcTI|XY3~(!s+zqrA*En>> z(6G^#G@#T|6d@&SjNT|ZcHoYJDV0luWIE3WG?N?@OC*7KMc`GABT$UFpx?3}O|yus z&k`q+^qG*G;aTR`pN({Zxp$rH8DsMxV7owE@3VfyK!de0Og7uKlFeY7RO0JRefhjK z){oq>dlUhood5ymxVLKdK}}Sf8s{lq#GqCv@hFAEXd^=@yYGDtz&}FxxkFs>AmCpx z9w`<^2mX*0q_Gt%&S?k!!^i&O;}-8hV`iwSC_JL}*Zbh!s^44**AMZMKY4E=NgXr+0 z&y?Xn=te1E!t%vqv2su-OIz20e`wU%yMV#AHUrZ8UGTC-^KO(l{V&T$8Xu|CuWO} z&*^caji59?+UP^=fwJi|DDL9Anz*nGEPu4RLuFB%GFnl0z&o1+=&WJ9`3}sB|I_qA~6-(&=Aq zplpM5jKE(gk+~c$ZL^!i88M^o|IH<~o)lv)H=)(4p{W;_fAMl?_%QIJ7Xc{eT1Llc z`tclvN)NPy#_S9F%qIMY4S3Xf`ZM}(IW;REZkZ2W+5{tPS>;FS0V59&LB2c8U&V=Q zG2^rA&AwPC`YqI~B_yLnzYW^CAJSsmiUXb%2aRFu@I*~5mDJ~B?qz}^Us`*`UCHb1 zo-1Odj)@@9*DPf)ylD_kU!73owJ2W;fAwjJRBW;48tv&U6p+p|lB0j;AD65}cVOYl z2k6W)koj>oB+4lXHh9IaXI)xtj@_r=tI&6n7_8o19ikRVjvdY>UM!!&(hUX`HoDAR zuSMP1HYv=dP0G_MuvFEW`!iH?Qt#0KuF}M$M46^jGg6} z=*X!|TrLUC?damvEo5jL!j3RI^l^emg4Topk}ioSm4#=DNE_Bjpx|Z10-Go~h-I2# zOUl6rXRZFBzzq1$P5kTml^&qHyrdBCHA?cUK|mIg@qVH#96Fw#zL#E`0n$r4MIe)) zM~=OT&ZWEiADR{LUetT)McW##`gF?j_kDC{0s&T3Wdrw@Dg+j>$^^A+N3J9ZB(h5b z=UpBTHS(`Eb*{<;G$?YDULYx)bql0l)90N?2&=?`m8EfZaThvTh&q}v9|^O6uzm8q{AfE~SwEfzPyZHp)pRUvpM9%v z^6Bcy-%%$~H+Seo3I8qBHj z&n;o@Ul6P8YrZel1{ za1^6SZy5QkCmn##L{QV0Cnee)zAK?*4@yR>m!|PWUB|tf;>U|VkZO(aK{zJSHS7WZ zs}(Lo8&FcpCM0uEz*_0NcisDoz|b*-<|v_1$w4bKUcY2}k+cvK5wj5>sP2=D8U^v$ zo9_H4@4F0V_JiKrfx^|E!%_4ypQAuvc@(5+pkkDn9}mh52$>kUTra|m<>;_*9mLvK zGoULYpljp@5iq8!Lg1e#wY-CS{RFVPaeK=17SayONIUk7eXgdfKsO$4@-qb|9W?4u zt!NpPOsC(&CQ=$0plWQH=;W?(bGc(~^#HGXu7X?h>5q-}AFwrVzMb(e`plh!!p_MZP7rMUk>OsHt&tDi>m=Wm7~1bf}6!1 zRRPqUBH{d>Z2yy2USjl5T&Nx2CKD6@0OFVE_S^ge*n6;8(mPq0{Zifj3+q&^Ha_!z z@P{;QBxN9QIP%4K{4|1w@#?Y=pls<)CE>W%p}9Rg%(@3flwq`?tl>Pn*1J>et?S!d zZW`5Cgz#D~``?Y7@aHnFmn+q36_{N5FJrXWO&?@f3{lL)ANjyvT;e;JE8*QIAKIqt zpGKZs_ErK?WH6s#AJT#ahZ!2d?tX$J0U$nd*RbYWh@c2yf6S^?d5{-iV&Xo(`gWJO1hQ!98)d~MkI-V|3V#HQLR+PV2tOkkU;PRYxJ6I%*X! 
[GIT binary patch data omitted]
diff --git a/sbin/spark-config.sh b/sbin/spark-config.sh index b7284487c511d..f2d9e6b568a9b 100755 --- a/sbin/spark-config.sh +++ b/sbin/spark-config.sh @@ -28,6 +28,6 @@ export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}/conf"}" # Add the PySpark classes to the PYTHONPATH: if [ -z "${PYSPARK_PYTHONPATH_SET}" ]; then export PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}" - export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.3-src.zip:${PYTHONPATH}" + export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.4-src.zip:${PYTHONPATH}" export PYSPARK_PYTHONPATH_SET=1 fi diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index ea4e1160b7672..6e4f68c74c365 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -1179,7 +1179,7 @@ private[spark] class Client( val pyArchivesFile = new File(pyLibPath, "pyspark.zip") require(pyArchivesFile.exists(), s"$pyArchivesFile not found; cannot run pyspark application in YARN mode.") - val py4jFile = new File(pyLibPath, "py4j-0.10.3-src.zip") + val py4jFile = new File(pyLibPath, "py4j-0.10.4-src.zip") require(py4jFile.exists(), s"$py4jFile not found; cannot run pyspark application in YARN mode.") Seq(pyArchivesFile.getAbsolutePath(), py4jFile.getAbsolutePath()) diff --git 
a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index d245acf49aa91..99fb58a28934a 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -242,7 +242,7 @@ class YarnClusterSuite extends BaseYarnClusterSuite { // needed locations. val sparkHome = sys.props("spark.test.home") val pythonPath = Seq( - s"$sparkHome/python/lib/py4j-0.10.3-src.zip", + s"$sparkHome/python/lib/py4j-0.10.4-src.zip", s"$sparkHome/python") val extraEnvVars = Map( "PYSPARK_ARCHIVES_PATH" -> pythonPath.map("local:" + _).mkString(File.pathSeparator), From a8ea4da8d04c1ed621a96668118f20739145edd2 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 21 Oct 2016 09:49:37 +0100 Subject: [PATCH 076/162] [SPARK-17331][FOLLOWUP][ML][CORE] Avoid allocating 0-length arrays ## What changes were proposed in this pull request? `Array[T]()` -> `Array.empty[T]` to avoid allocating 0-length arrays. Use regex `find . -name '*.scala' | xargs -i bash -c 'egrep "Array\[[A-Za-z]+\]\(\)" -n {} && echo {}'` to find modification candidates. cc srowen ## How was this patch tested? existing tests Author: Zheng RuiFeng Closes #15564 from zhengruifeng/avoid_0_length_array. --- .../org/apache/spark/CheckpointSuite.scala | 2 +- .../spark/deploy/JsonProtocolSuite.scala | 2 +- .../spark/deploy/SparkSubmitSuite.scala | 2 +- .../history/HistoryServerArgumentsSuite.scala | 2 +- .../spark/io/ChunkedByteBufferSuite.scala | 4 ++-- .../serializer/KryoSerializerSuite.scala | 2 +- .../spark/ml/linalg/MatricesSuite.scala | 4 ++-- .../spark/ml/util/TestingUtilsSuite.scala | 24 +++++++++---------- .../classification/LogisticRegression.scala | 2 +- .../stat/test/KolmogorovSmirnovTest.scala | 3 ++- .../MultilayerPerceptronClassifierSuite.scala | 2 +- .../apache/spark/ml/python/MLSerDeSuite.scala | 2 +- .../ml/tree/impl/RandomForestSuite.scala | 4 ++-- .../api/python/PythonMLLibAPISuite.scala | 2 +- .../evaluation/RankingMetricsSuite.scala | 4 ++-- .../spark/mllib/linalg/MatricesSuite.scala | 4 ++-- .../spark/mllib/util/TestingUtilsSuite.scala | 24 +++++++++---------- .../expressions/StringExpressionsSuite.scala | 10 ++++---- .../spark/sql/DataFrameFunctionsSuite.scala | 2 +- 19 files changed, 51 insertions(+), 50 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala index 9f94e36324536..b117c7709b46f 100644 --- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala +++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala @@ -500,7 +500,7 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS } runTest("CheckpointRDD with zero partitions") { reliableCheckpoint: Boolean => - val rdd = new BlockRDD[Int](sc, Array[BlockId]()) + val rdd = new BlockRDD[Int](sc, Array.empty[BlockId]) assert(rdd.partitions.size === 0) assert(rdd.isCheckpointed === false) assert(rdd.isCheckpointedAndMaterialized === false) diff --git a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala index 2d48e75cfbd96..7093dad05c5f6 100644 --- a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala @@ -65,7 +65,7 @@ class JsonProtocolSuite extends SparkFunSuite with JsonTestUtils { 
test("writeMasterState") { val workers = Array(createWorkerInfo(), createWorkerInfo()) val activeApps = Array(createAppInfo()) - val completedApps = Array[ApplicationInfo]() + val completedApps = Array.empty[ApplicationInfo] val activeDrivers = Array(createDriverInfo()) val completedDrivers = Array(createDriverInfo()) val stateResponse = new MasterStateResponse( diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 732cbfaaeea46..7c649e305a37e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -91,7 +91,7 @@ class SparkSubmitSuite // scalastyle:off println test("prints usage on empty input") { - testPrematureExit(Array[String](), "Usage: spark-submit") + testPrematureExit(Array.empty[String], "Usage: spark-submit") } test("prints usage with only --help") { diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerArgumentsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerArgumentsSuite.scala index 34f27ecaa07a3..de321db845a66 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerArgumentsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerArgumentsSuite.scala @@ -33,7 +33,7 @@ class HistoryServerArgumentsSuite extends SparkFunSuite { .set("spark.testing", "true") test("No Arguments Parsing") { - val argStrings = Array[String]() + val argStrings = Array.empty[String] val hsa = new HistoryServerArguments(conf, argStrings) assert(conf.get("spark.history.fs.logDirectory") === logDir.getAbsolutePath) assert(conf.get("spark.history.fs.updateInterval") === "1") diff --git a/core/src/test/scala/org/apache/spark/io/ChunkedByteBufferSuite.scala b/core/src/test/scala/org/apache/spark/io/ChunkedByteBufferSuite.scala index 38b48a4c9e654..3b798e36b0499 100644 --- a/core/src/test/scala/org/apache/spark/io/ChunkedByteBufferSuite.scala +++ b/core/src/test/scala/org/apache/spark/io/ChunkedByteBufferSuite.scala @@ -57,7 +57,7 @@ class ChunkedByteBufferSuite extends SparkFunSuite { } test("toArray()") { - val empty = ByteBuffer.wrap(Array[Byte]()) + val empty = ByteBuffer.wrap(Array.empty[Byte]) val bytes = ByteBuffer.wrap(Array.tabulate(8)(_.toByte)) val chunkedByteBuffer = new ChunkedByteBuffer(Array(bytes, bytes, empty)) assert(chunkedByteBuffer.toArray === bytes.array() ++ bytes.array()) @@ -74,7 +74,7 @@ class ChunkedByteBufferSuite extends SparkFunSuite { } test("toInputStream()") { - val empty = ByteBuffer.wrap(Array[Byte]()) + val empty = ByteBuffer.wrap(Array.empty[Byte]) val bytes1 = ByteBuffer.wrap(Array.tabulate(256)(_.toByte)) val bytes2 = ByteBuffer.wrap(Array.tabulate(128)(_.toByte)) val chunkedByteBuffer = new ChunkedByteBuffer(Array(empty, bytes1, bytes2)) diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala index 57a82312008e9..bc6e98365daef 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala @@ -100,7 +100,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { check(Array("aaa", "bbb", null)) check(Array(true, false, true)) check(Array('a', 'b', 'c')) - check(Array[Int]()) + check(Array.empty[Int]) check(Array(Array("1", "2"), Array("1", "2", 
"3", "4"))) } diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index 2796fcf2cbc22..9c0aa73938478 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -287,7 +287,7 @@ class MatricesSuite extends SparkMLFunSuite { val spHorz2 = Matrices.horzcat(Array(spMat1, deMat2)) val spHorz3 = Matrices.horzcat(Array(deMat1, spMat2)) val deHorz1 = Matrices.horzcat(Array(deMat1, deMat2)) - val deHorz2 = Matrices.horzcat(Array[Matrix]()) + val deHorz2 = Matrices.horzcat(Array.empty[Matrix]) assert(deHorz1.numRows === 3) assert(spHorz2.numRows === 3) @@ -341,7 +341,7 @@ class MatricesSuite extends SparkMLFunSuite { val deVert1 = Matrices.vertcat(Array(deMat1, deMat3)) val spVert2 = Matrices.vertcat(Array(spMat1, deMat3)) val spVert3 = Matrices.vertcat(Array(deMat1, spMat3)) - val deVert2 = Matrices.vertcat(Array[Matrix]()) + val deVert2 = Matrices.vertcat(Array.empty[Matrix]) assert(deVert1.numRows === 5) assert(spVert2.numRows === 5) diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/util/TestingUtilsSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/util/TestingUtilsSuite.scala index 5cbf2f04e6269..2dc0ee32d5762 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/util/TestingUtilsSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/util/TestingUtilsSuite.scala @@ -110,9 +110,9 @@ class TestingUtilsSuite extends SparkMLFunSuite { assert(!(Vectors.dense(Array(3.1, 3.5)) !~= Vectors.dense(Array(3.130, 3.534)) relTol 0.01)) assert(!(Vectors.dense(Array(3.1, 3.5)) ~= Vectors.dense(Array(3.135, 3.534)) relTol 0.01)) assert(Vectors.dense(Array(3.1)) !~= Vectors.dense(Array(3.130, 3.534)) relTol 0.01) - assert(Vectors.dense(Array[Double]()) !~= Vectors.dense(Array(3.130, 3.534)) relTol 0.01) + assert(Vectors.dense(Array.empty[Double]) !~= Vectors.dense(Array(3.130, 3.534)) relTol 0.01) assert(Vectors.dense(Array(3.1)) !~== Vectors.dense(Array(3.130, 3.534)) relTol 0.01) - assert(Vectors.dense(Array[Double]()) !~== Vectors.dense(Array(3.130, 3.534)) relTol 0.01) + assert(Vectors.dense(Array.empty[Double]) !~== Vectors.dense(Array(3.130, 3.534)) relTol 0.01) // Should throw exception with message when test fails. intercept[TestFailedException]( @@ -125,7 +125,7 @@ class TestingUtilsSuite extends SparkMLFunSuite { Vectors.dense(Array(3.1)) ~== Vectors.dense(Array(3.535, 3.534)) relTol 0.01) intercept[TestFailedException]( - Vectors.dense(Array[Double]()) ~== Vectors.dense(Array(3.135)) relTol 0.01) + Vectors.dense(Array.empty[Double]) ~== Vectors.dense(Array(3.135)) relTol 0.01) // Comparing against zero should fail the test and throw exception with message // saying that the relative error is meaningless in this situation. 
@@ -145,7 +145,7 @@ class TestingUtilsSuite extends SparkMLFunSuite { assert(Vectors.dense(Array(3.1)) !~== Vectors.sparse(2, Array(0, 1), Array(3.130, 3.534)) relTol 0.01) - assert(Vectors.dense(Array[Double]()) !~== + assert(Vectors.dense(Array.empty[Double]) !~== Vectors.sparse(2, Array(0, 1), Array(3.130, 3.534)) relTol 0.01) } @@ -176,14 +176,14 @@ class TestingUtilsSuite extends SparkMLFunSuite { assert(!(Vectors.dense(Array(3.1)) ~= Vectors.dense(Array(3.1 + 1E-6, 3.5 + 2E-7)) absTol 1E-5)) - assert(Vectors.dense(Array[Double]()) !~= + assert(Vectors.dense(Array.empty[Double]) !~= Vectors.dense(Array(3.1 + 1E-6, 3.5 + 2E-7)) absTol 1E-5) - assert(!(Vectors.dense(Array[Double]()) ~= + assert(!(Vectors.dense(Array.empty[Double]) ~= Vectors.dense(Array(3.1 + 1E-6, 3.5 + 2E-7)) absTol 1E-5)) - assert(Vectors.dense(Array[Double]()) ~= - Vectors.dense(Array[Double]()) absTol 1E-5) + assert(Vectors.dense(Array.empty[Double]) ~= + Vectors.dense(Array.empty[Double]) absTol 1E-5) // Should throw exception with message when test fails. intercept[TestFailedException](Vectors.dense(Array(3.1, 3.5, 0.0)) !~== @@ -195,7 +195,7 @@ class TestingUtilsSuite extends SparkMLFunSuite { intercept[TestFailedException](Vectors.dense(Array(3.1)) ~== Vectors.dense(Array(3.1 + 1E-5, 3.5 + 2E-7)) absTol 1E-6) - intercept[TestFailedException](Vectors.dense(Array[Double]()) ~== + intercept[TestFailedException](Vectors.dense(Array.empty[Double]) ~== Vectors.dense(Array(3.1 + 1E-5, 3.5 + 2E-7)) absTol 1E-6) // Comparisons of two sparse vectors @@ -214,7 +214,7 @@ class TestingUtilsSuite extends SparkMLFunSuite { assert(Vectors.sparse(3, Array(0, 2), Array(3.1 + 1E-6, 2.4)) !~== Vectors.sparse(1, Array(0), Array(3.1)) absTol 1E-3) - assert(Vectors.sparse(0, Array[Int](), Array[Double]()) !~== + assert(Vectors.sparse(0, Array.empty[Int], Array.empty[Double]) !~== Vectors.sparse(1, Array(0), Array(3.1)) absTol 1E-3) // Comparisons of a dense vector and a sparse vector @@ -230,14 +230,14 @@ class TestingUtilsSuite extends SparkMLFunSuite { assert(Vectors.sparse(3, Array(0, 2), Array(3.1, 2.4)) !~== Vectors.dense(Array(3.1)) absTol 1E-6) - assert(Vectors.dense(Array[Double]()) !~== + assert(Vectors.dense(Array.empty[Double]) !~== Vectors.sparse(3, Array(0, 2), Array(0, 2.4)) absTol 1E-6) assert(Vectors.sparse(1, Array(0), Array(3.1)) !~== Vectors.dense(Array(3.1, 3.2)) absTol 1E-6) assert(Vectors.dense(Array(3.1)) !~== - Vectors.sparse(0, Array[Int](), Array[Double]()) absTol 1E-6) + Vectors.sparse(0, Array.empty[Int], Array.empty[Double]) absTol 1E-6) } test("Comparing Matrices using absolute error.") { diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 862a468745fbd..8fdaae04c42ec 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -622,7 +622,7 @@ class LogisticRegression @Since("1.2.0") ( rawCoefficients(coefIndex) } } else { - Array[Double]() + Array.empty[Double] } val interceptVector = if (interceptsArray.nonEmpty && isMultinomial) { // The intercepts are never regularized, so we always center the mean. 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala index c3de5d75f4f7d..a8b5955a7285d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala @@ -124,7 +124,8 @@ private[stat] object KolmogorovSmirnovTest extends Logging { val pResults = partDiffs.foldLeft(initAcc) { case ((pMin, pMax, pCt), (dl, dp)) => (math.min(pMin, dl), math.max(pMax, dp), pCt + 1) } - val results = if (pResults == initAcc) Array[(Double, Double, Double)]() else Array(pResults) + val results = + if (pResults == initAcc) Array.empty[(Double, Double, Double)] else Array(pResults) results.iterator } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala index c08cb695806d0..41684d92be33a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala @@ -51,7 +51,7 @@ class MultilayerPerceptronClassifierSuite test("Input Validation") { val mlpc = new MultilayerPerceptronClassifier() intercept[IllegalArgumentException] { - mlpc.setLayers(Array[Int]()) + mlpc.setLayers(Array.empty[Int]) } intercept[IllegalArgumentException] { mlpc.setLayers(Array[Int](1)) diff --git a/mllib/src/test/scala/org/apache/spark/ml/python/MLSerDeSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/python/MLSerDeSuite.scala index 5eaef9aabda50..3bb760f2ecc1d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/python/MLSerDeSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/python/MLSerDeSuite.scala @@ -54,7 +54,7 @@ class MLSerDeSuite extends SparkFunSuite { assert(matrix === nm) // Test conversion for empty matrix - val empty = Array[Double]() + val empty = Array.empty[Double] val emptyMatrix = Matrices.dense(0, 0, empty) val ne = MLSerDe.loads(MLSerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala index 499d386e66413..3bded9c01760a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala @@ -154,10 +154,10 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { val featureSamples = Array(0, 0, 0).map(_.toDouble) val featureSamplesEmpty = Array.empty[Double] val splits = RandomForest.findSplitsForContinuousFeature(featureSamples, fakeMetadata, 0) - assert(splits === Array[Double]()) + assert(splits === Array.empty[Double]) val splitsEmpty = RandomForest.findSplitsForContinuousFeature(featureSamplesEmpty, fakeMetadata, 0) - assert(splitsEmpty === Array[Double]()) + assert(splitsEmpty === Array.empty[Double]) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala index 0eb839f20c003..5f85c0d65ff2d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala @@ -72,7 +72,7 @@ class PythonMLLibAPISuite extends SparkFunSuite { assert(matrix === nm) // Test conversion for empty matrix - val empty = Array[Double]() + val empty = Array.empty[Double] val emptyMatrix = Matrices.dense(0, 0, empty) val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RankingMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RankingMetricsSuite.scala index 8e9d910e646c9..f334be2c2ba83 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RankingMetricsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RankingMetricsSuite.scala @@ -28,7 +28,7 @@ class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { Seq( (Array(1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array(1, 2, 3, 4, 5)), (Array(4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Array(1, 2, 3)), - (Array(1, 2, 3, 4, 5), Array[Int]()) + (Array(1, 2, 3, 4, 5), Array.empty[Int]) ), 2) val eps = 1.0E-5 @@ -55,7 +55,7 @@ class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { val predictionAndLabels = sc.parallelize( Seq( (Array(1, 6, 2), Array(1, 2, 3, 4, 5)), - (Array[Int](), Array(1, 2, 3)) + (Array.empty[Int], Array(1, 2, 3)) ), 2) val eps = 1.0E-5 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index d0c4dd28e14ee..563756907d201 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -289,7 +289,7 @@ class MatricesSuite extends SparkFunSuite { val spHorz2 = Matrices.horzcat(Array(spMat1, deMat2)) val spHorz3 = Matrices.horzcat(Array(deMat1, spMat2)) val deHorz1 = Matrices.horzcat(Array(deMat1, deMat2)) - val deHorz2 = Matrices.horzcat(Array[Matrix]()) + val deHorz2 = Matrices.horzcat(Array.empty[Matrix]) assert(deHorz1.numRows === 3) assert(spHorz2.numRows === 3) @@ -343,7 +343,7 @@ class MatricesSuite extends SparkFunSuite { val deVert1 = Matrices.vertcat(Array(deMat1, deMat3)) val spVert2 = Matrices.vertcat(Array(spMat1, deMat3)) val spVert3 = Matrices.vertcat(Array(deMat1, spMat3)) - val deVert2 = Matrices.vertcat(Array[Matrix]()) + val deVert2 = Matrices.vertcat(Array.empty[Matrix]) assert(deVert1.numRows === 5) assert(spVert2.numRows === 5) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala index 1aff44480aac9..3fcf1cf2c2635 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala @@ -110,9 +110,9 @@ class TestingUtilsSuite extends SparkFunSuite { assert(!(Vectors.dense(Array(3.1, 3.5)) !~= Vectors.dense(Array(3.130, 3.534)) relTol 0.01)) assert(!(Vectors.dense(Array(3.1, 3.5)) ~= Vectors.dense(Array(3.135, 3.534)) relTol 0.01)) assert(Vectors.dense(Array(3.1)) !~= Vectors.dense(Array(3.130, 3.534)) relTol 0.01) - assert(Vectors.dense(Array[Double]()) !~= Vectors.dense(Array(3.130, 3.534)) relTol 0.01) + assert(Vectors.dense(Array.empty[Double]) !~= Vectors.dense(Array(3.130, 3.534)) relTol 0.01) assert(Vectors.dense(Array(3.1)) !~== Vectors.dense(Array(3.130, 3.534)) relTol 0.01) - assert(Vectors.dense(Array[Double]()) 
!~== Vectors.dense(Array(3.130, 3.534)) relTol 0.01) + assert(Vectors.dense(Array.empty[Double]) !~== Vectors.dense(Array(3.130, 3.534)) relTol 0.01) // Should throw exception with message when test fails. intercept[TestFailedException]( @@ -125,7 +125,7 @@ class TestingUtilsSuite extends SparkFunSuite { Vectors.dense(Array(3.1)) ~== Vectors.dense(Array(3.535, 3.534)) relTol 0.01) intercept[TestFailedException]( - Vectors.dense(Array[Double]()) ~== Vectors.dense(Array(3.135)) relTol 0.01) + Vectors.dense(Array.empty[Double]) ~== Vectors.dense(Array(3.135)) relTol 0.01) // Comparing against zero should fail the test and throw exception with message // saying that the relative error is meaningless in this situation. @@ -145,7 +145,7 @@ class TestingUtilsSuite extends SparkFunSuite { assert(Vectors.dense(Array(3.1)) !~== Vectors.sparse(2, Array(0, 1), Array(3.130, 3.534)) relTol 0.01) - assert(Vectors.dense(Array[Double]()) !~== + assert(Vectors.dense(Array.empty[Double]) !~== Vectors.sparse(2, Array(0, 1), Array(3.130, 3.534)) relTol 0.01) } @@ -176,14 +176,14 @@ class TestingUtilsSuite extends SparkFunSuite { assert(!(Vectors.dense(Array(3.1)) ~= Vectors.dense(Array(3.1 + 1E-6, 3.5 + 2E-7)) absTol 1E-5)) - assert(Vectors.dense(Array[Double]()) !~= + assert(Vectors.dense(Array.empty[Double]) !~= Vectors.dense(Array(3.1 + 1E-6, 3.5 + 2E-7)) absTol 1E-5) - assert(!(Vectors.dense(Array[Double]()) ~= + assert(!(Vectors.dense(Array.empty[Double]) ~= Vectors.dense(Array(3.1 + 1E-6, 3.5 + 2E-7)) absTol 1E-5)) - assert(Vectors.dense(Array[Double]()) ~= - Vectors.dense(Array[Double]()) absTol 1E-5) + assert(Vectors.dense(Array.empty[Double]) ~= + Vectors.dense(Array.empty[Double]) absTol 1E-5) // Should throw exception with message when test fails. intercept[TestFailedException](Vectors.dense(Array(3.1, 3.5, 0.0)) !~== @@ -195,7 +195,7 @@ class TestingUtilsSuite extends SparkFunSuite { intercept[TestFailedException](Vectors.dense(Array(3.1)) ~== Vectors.dense(Array(3.1 + 1E-5, 3.5 + 2E-7)) absTol 1E-6) - intercept[TestFailedException](Vectors.dense(Array[Double]()) ~== + intercept[TestFailedException](Vectors.dense(Array.empty[Double]) ~== Vectors.dense(Array(3.1 + 1E-5, 3.5 + 2E-7)) absTol 1E-6) // Comparisons of two sparse vectors @@ -214,7 +214,7 @@ class TestingUtilsSuite extends SparkFunSuite { assert(Vectors.sparse(3, Array(0, 2), Array(3.1 + 1E-6, 2.4)) !~== Vectors.sparse(1, Array(0), Array(3.1)) absTol 1E-3) - assert(Vectors.sparse(0, Array[Int](), Array[Double]()) !~== + assert(Vectors.sparse(0, Array.empty[Int], Array.empty[Double]) !~== Vectors.sparse(1, Array(0), Array(3.1)) absTol 1E-3) // Comparisons of a dense vector and a sparse vector @@ -230,14 +230,14 @@ class TestingUtilsSuite extends SparkFunSuite { assert(Vectors.sparse(3, Array(0, 2), Array(3.1, 2.4)) !~== Vectors.dense(Array(3.1)) absTol 1E-6) - assert(Vectors.dense(Array[Double]()) !~== + assert(Vectors.dense(Array.empty[Double]) !~== Vectors.sparse(3, Array(0, 2), Array(0, 2.4)) absTol 1E-6) assert(Vectors.sparse(1, Array(0), Array(3.1)) !~== Vectors.dense(Array(3.1, 3.2)) absTol 1E-6) assert(Vectors.dense(Array(3.1)) !~== - Vectors.sparse(0, Array[Int](), Array[Double]()) absTol 1E-6) + Vectors.sparse(0, Array.empty[Int], Array.empty[Double]) absTol 1E-6) } test("Comparing Matrices using absolute error.") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 
fdb9fa31f09c8..26978a0482fc7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -215,13 +215,13 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Substring(bytes, 2, 2), Array[Byte](2, 3)) checkEvaluation(Substring(bytes, 3, 2), Array[Byte](3, 4)) checkEvaluation(Substring(bytes, 4, 2), Array[Byte](4)) - checkEvaluation(Substring(bytes, 8, 2), Array[Byte]()) + checkEvaluation(Substring(bytes, 8, 2), Array.empty[Byte]) checkEvaluation(Substring(bytes, -1, 2), Array[Byte](4)) checkEvaluation(Substring(bytes, -2, 2), Array[Byte](3, 4)) checkEvaluation(Substring(bytes, -3, 2), Array[Byte](2, 3)) checkEvaluation(Substring(bytes, -4, 2), Array[Byte](1, 2)) checkEvaluation(Substring(bytes, -5, 2), Array[Byte](1)) - checkEvaluation(Substring(bytes, -8, 2), Array[Byte]()) + checkEvaluation(Substring(bytes, -8, 2), Array.empty[Byte]) } test("string substring_index function") { @@ -275,7 +275,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Base64(UnBase64(a)), "AQIDBA==", create_row("AQIDBA==")) checkEvaluation(Base64(b), "AQIDBA==", create_row(bytes)) - checkEvaluation(Base64(b), "", create_row(Array[Byte]())) + checkEvaluation(Base64(b), "", create_row(Array.empty[Byte])) checkEvaluation(Base64(b), null, create_row(null)) checkEvaluation(Base64(Literal.create(null, BinaryType)), null, create_row("abdef")) @@ -526,13 +526,13 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // non ascii characters are not allowed in the source code, so we disable the scalastyle. checkEvaluation(Length(Literal("a花花c")), 4, create_row(string)) // scalastyle:on - checkEvaluation(Length(Literal(bytes)), 5, create_row(Array[Byte]())) + checkEvaluation(Length(Literal(bytes)), 5, create_row(Array.empty[Byte])) checkEvaluation(Length(a), 5, create_row(string)) checkEvaluation(Length(b), 5, create_row(bytes)) checkEvaluation(Length(a), 0, create_row("")) - checkEvaluation(Length(b), 0, create_row(Array[Byte]())) + checkEvaluation(Length(b), 0, create_row(Array.empty[Byte])) checkEvaluation(Length(a), null, create_row(null)) checkEvaluation(Length(b), null, create_row(null)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 45db61515e9b6..586a0fffeb7a1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -273,7 +273,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { test("sort_array function") { val df = Seq( (Array[Int](2, 1, 3), Array("b", "c", "a")), - (Array[Int](), Array[String]()), + (Array.empty[Int], Array.empty[String]), (null, null) ).toDF("a", "b") checkAnswer( From 3a237512b162d192b5503c08d121134a2dac6ff1 Mon Sep 17 00:00:00 2001 From: Alex Bozarth Date: Fri, 21 Oct 2016 11:39:32 +0100 Subject: [PATCH 077/162] [SPARK-13275][WEB UI] Visually clarified executors start time in timeline ## What changes were proposed in this pull request? Updated the Executors added/removed bubble in the time line so it's clearer where it starts. Now the bubble is left justified on the start time (still also denoted by the line) rather than center justified. 
## How was this patch tested? Manually tested UI (screenshots attached to the PR) Author: Alex Bozarth Closes #15536 from ajbozarth/spark13275. --- .../main/resources/org/apache/spark/ui/static/timeline-view.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js index a6153ceda75e2..705a08f0293d3 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js +++ b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js @@ -24,6 +24,7 @@ function drawApplicationTimeline(groupArray, eventObjArray, startTime, offset) { return a.value - b.value }, editable: false, + align: 'left', showCurrentTime: false, min: startTime, zoomable: false, @@ -99,6 +100,7 @@ function drawJobTimeline(groupArray, eventObjArray, startTime, offset) { return a.value - b.value; }, editable: false, + align: 'left', showCurrentTime: false, min: startTime, zoomable: false, From b3b4b9542223de3495a7a7e0dd27634ddb9f929d Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 21 Oct 2016 11:25:01 -0700 Subject: [PATCH 078/162] [SPARK-18034] Upgrade to MiMa 0.1.11 to fix flakiness We should upgrade to the latest release of MiMa (0.1.11) in order to include a fix for a bug which led to flakiness in the MiMa checks (https://github.com/typesafehub/migration-manager/issues/115). Author: Josh Rosen Closes #15571 from JoshRosen/SPARK-18034. --- project/MimaExcludes.scala | 7 ++++++- project/plugins.sbt | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index facf034ea7e7d..350b144f8294b 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -81,7 +81,12 @@ object MimaExcludes { // [SPARK-17338][SQL] add global temp view ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.dropGlobalTempView"), ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.catalog.Catalog.dropTempView"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.dropTempView") + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.dropTempView"), + + // [SPARK-18034] Upgrade to MiMa 0.1.11 to fix flakiness. 
+ ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasAggregationDepth.aggregationDepth"), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasAggregationDepth.getAggregationDepth"), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasAggregationDepth.org$apache$spark$ml$param$shared$HasAggregationDepth$_setter_$aggregationDepth_=") ) } diff --git a/project/plugins.sbt b/project/plugins.sbt index 8bebd7bcac58c..76597d27292ea 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -6,7 +6,7 @@ addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2") addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0") -addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.9") +addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.11") addSbtPlugin("com.alpinenow" % "junit_xml_listener" % "0.5.1") From 4efdc764edfbc4971f0e863947258482ca2017df Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Fri, 21 Oct 2016 12:34:14 -0700 Subject: [PATCH 079/162] [SPARK-17674][SPARKR] check for warning in test output ## What changes were proposed in this pull request? testthat library we are using for testing R is redirecting warning (and disabling `options("warn" = 2)`), we need to have a way to detect any new warning and fail ## How was this patch tested? manual testing, Jenkins Author: Felix Cheung Closes #15576 from felixcheung/rtestwarning. --- R/run-tests.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/run-tests.sh b/R/run-tests.sh index 1a1e8ab9ffe18..5e4dafaf76f3d 100755 --- a/R/run-tests.sh +++ b/R/run-tests.sh @@ -26,6 +26,8 @@ rm -f $LOGFILE SPARK_TESTING=1 $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.default.name="file:///" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE FAILED=$((PIPESTATUS[0]||$FAILED)) +NUM_TEST_WARNING="$(grep -c -e 'Warnings ----------------' $LOGFILE)" + # Also run the documentation tests for CRAN CRAN_CHECK_LOG_FILE=$FWDIR/cran-check.out rm -f $CRAN_CHECK_LOG_FILE @@ -37,10 +39,10 @@ NUM_CRAN_WARNING="$(grep -c WARNING$ $CRAN_CHECK_LOG_FILE)" NUM_CRAN_ERROR="$(grep -c ERROR$ $CRAN_CHECK_LOG_FILE)" NUM_CRAN_NOTES="$(grep -c NOTE$ $CRAN_CHECK_LOG_FILE)" -if [[ $FAILED != 0 ]]; then +if [[ $FAILED != 0 || $NUM_TEST_WARNING != 0 ]]; then cat $LOGFILE echo -en "\033[31m" # Red - echo "Had test failures; see logs." + echo "Had test warnings or failures; see logs." echo -en "\033[0m" # No color exit -1 else From e21e1c946c4b7448fb150cfa2d9419864ae6f9b5 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Fri, 21 Oct 2016 12:35:37 -0700 Subject: [PATCH 080/162] [SPARK-18013][SPARKR] add crossJoin API ## What changes were proposed in this pull request? Add crossJoin and do not default to cross join if joinExpr is left out ## How was this patch tested? unit test Author: Felix Cheung Closes #15559 from felixcheung/rcrossjoin. 
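The SparkR crossJoin added in this patch delegates to the JVM-side Dataset.crossJoin (see the callJMethod in the DataFrame.R diff below). A minimal Scala sketch of the distinction the commit message describes, assuming a local SparkSession and made-up column names (illustrative only, not part of the patch):

import org.apache.spark.sql.SparkSession

object CrossJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("crossJoinSketch").getOrCreate()
    import spark.implicits._

    val left = Seq((1, "a"), (2, "b")).toDF("id", "l")
    val right = Seq((1, "x"), (3, "y")).toDF("id", "r")

    // Inner join on an expression: what SparkR join(df1, df2, df1$id == df2$id) maps to.
    left.join(right, left("id") === right("id")).show()

    // Explicit Cartesian product: what the new SparkR crossJoin(df1, df2) maps to (2 x 2 = 4 rows).
    left.crossJoin(right).show()

    // left.join(right) with no condition is planned as a Cartesian product and, at the time of
    // this patch series (Spark 2.1), fails unless spark.sql.crossJoin.enabled is set to true.
    spark.stop()
  }
}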
--- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 59 ++++++++++++++++++----- R/pkg/R/generics.R | 4 ++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 11 ++++- docs/sparkr.md | 4 ++ 5 files changed, 64 insertions(+), 15 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 5960c6206a6f1..87181851714e0 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -71,6 +71,7 @@ exportMethods("arrange", "covar_samp", "covar_pop", "createOrReplaceTempView", + "crossJoin", "crosstab", "dapply", "dapplyCollect", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 801d2ed4e7500..8910a4b138a37 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2271,12 +2271,13 @@ setMethod("dropDuplicates", #' Join #' -#' Join two SparkDataFrames based on the given join expression. +#' Joins two SparkDataFrames based on the given join expression. #' #' @param x A SparkDataFrame #' @param y A SparkDataFrame #' @param joinExpr (Optional) The expression used to perform the join. joinExpr must be a -#' Column expression. If joinExpr is omitted, join() will perform a Cartesian join +#' Column expression. If joinExpr is omitted, the default, inner join is attempted and an error is +#' thrown if it would be a Cartesian Product. For Cartesian join, use crossJoin instead. #' @param joinType The type of join to perform. The following join types are available: #' 'inner', 'outer', 'full', 'fullouter', leftouter', 'left_outer', 'left', #' 'right_outer', 'rightouter', 'right', and 'leftsemi'. The default joinType is "inner". @@ -2285,23 +2286,24 @@ setMethod("dropDuplicates", #' @aliases join,SparkDataFrame,SparkDataFrame-method #' @rdname join #' @name join -#' @seealso \link{merge} +#' @seealso \link{merge} \link{crossJoin} #' @export #' @examples #'\dontrun{ #' sparkR.session() #' df1 <- read.json(path) #' df2 <- read.json(path2) -#' join(df1, df2) # Performs a Cartesian #' join(df1, df2, df1$col1 == df2$col2) # Performs an inner join based on expression #' join(df1, df2, df1$col1 == df2$col2, "right_outer") +#' join(df1, df2) # Attempts an inner join #' } #' @note join since 1.4.0 setMethod("join", signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y, joinExpr = NULL, joinType = NULL) { if (is.null(joinExpr)) { - sdf <- callJMethod(x@sdf, "crossJoin", y@sdf) + # this may not fail until the planner checks for Cartesian join later on. + sdf <- callJMethod(x@sdf, "join", y@sdf) } else { if (class(joinExpr) != "Column") stop("joinExpr must be a Column") if (is.null(joinType)) { @@ -2322,22 +2324,52 @@ setMethod("join", dataFrame(sdf) }) +#' CrossJoin +#' +#' Returns Cartesian Product on two SparkDataFrames. +#' +#' @param x A SparkDataFrame +#' @param y A SparkDataFrame +#' @return A SparkDataFrame containing the result of the join operation. +#' @family SparkDataFrame functions +#' @aliases crossJoin,SparkDataFrame,SparkDataFrame-method +#' @rdname crossJoin +#' @name crossJoin +#' @seealso \link{merge} \link{join} +#' @export +#' @examples +#'\dontrun{ +#' sparkR.session() +#' df1 <- read.json(path) +#' df2 <- read.json(path2) +#' crossJoin(df1, df2) # Performs a Cartesian +#' } +#' @note crossJoin since 2.1.0 +setMethod("crossJoin", + signature(x = "SparkDataFrame", y = "SparkDataFrame"), + function(x, y) { + sdf <- callJMethod(x@sdf, "crossJoin", y@sdf) + dataFrame(sdf) + }) + #' Merges two data frames #' #' @name merge -#' @param x the first data frame to be joined -#' @param y the second data frame to be joined +#' @param x the first data frame to be joined. 
+#' @param y the second data frame to be joined. #' @param by a character vector specifying the join columns. If by is not #' specified, the common column names in \code{x} and \code{y} will be used. +#' If by or both by.x and by.y are explicitly set to NULL or of length 0, the Cartesian +#' Product of x and y will be returned. #' @param by.x a character vector specifying the joining columns for x. #' @param by.y a character vector specifying the joining columns for y. #' @param all a boolean value setting \code{all.x} and \code{all.y} #' if any of them are unset. #' @param all.x a boolean value indicating whether all the rows in x should -#' be including in the join +#' be including in the join. #' @param all.y a boolean value indicating whether all the rows in y should -#' be including in the join -#' @param sort a logical argument indicating whether the resulting columns should be sorted +#' be including in the join. +#' @param sort a logical argument indicating whether the resulting columns should be sorted. #' @param suffixes a string vector of length 2 used to make colnames of #' \code{x} and \code{y} unique. #' The first element is appended to each colname of \code{x}. @@ -2351,20 +2383,21 @@ setMethod("join", #' @family SparkDataFrame functions #' @aliases merge,SparkDataFrame,SparkDataFrame-method #' @rdname merge -#' @seealso \link{join} +#' @seealso \link{join} \link{crossJoin} #' @export #' @examples #'\dontrun{ #' sparkR.session() #' df1 <- read.json(path) #' df2 <- read.json(path2) -#' merge(df1, df2) # Performs a Cartesian +#' merge(df1, df2) # Performs an inner join by common columns #' merge(df1, df2, by = "col1") # Performs an inner join based on expression #' merge(df1, df2, by.x = "col1", by.y = "col2", all.y = TRUE) #' merge(df1, df2, by.x = "col1", by.y = "col2", all.x = TRUE) #' merge(df1, df2, by.x = "col1", by.y = "col2", all.x = TRUE, all.y = TRUE) #' merge(df1, df2, by.x = "col1", by.y = "col2", all = TRUE, sort = FALSE) #' merge(df1, df2, by = "col1", all = TRUE, suffixes = c("-X", "-Y")) +#' merge(df1, df2, by = NULL) # Performs a Cartesian join #' } #' @note merge since 1.5.0 setMethod("merge", @@ -2401,7 +2434,7 @@ setMethod("merge", joinY <- by } else { # if by or both by.x and by.y have length 0, use Cartesian Product - joinRes <- join(x, y) + joinRes <- crossJoin(x, y) return (joinRes) } diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 810aea9017743..5549cd7cac516 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -468,6 +468,10 @@ setGeneric("createOrReplaceTempView", standardGeneric("createOrReplaceTempView") }) +# @rdname crossJoin +# @export +setGeneric("crossJoin", function(x, y) { standardGeneric("crossJoin") }) + #' @rdname dapply #' @export setGeneric("dapply", function(x, func, schema) { standardGeneric("dapply") }) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 1c806869e9fbe..3a987cd86213f 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1572,7 +1572,7 @@ test_that("filter() on a DataFrame", { #expect_true(is.ts(filter(1:100, rep(1, 3)))) # nolint }) -test_that("join() and merge() on a DataFrame", { +test_that("join(), crossJoin() and merge() on a DataFrame", { df <- read.json(jsonPath) mockLines2 <- c("{\"name\":\"Michael\", \"test\": \"yes\"}", @@ -1583,7 +1583,14 @@ test_that("join() and merge() on a DataFrame", { writeLines(mockLines2, jsonPath2) df2 <- read.json(jsonPath2) - joined <- join(df, df2) + # inner 
join, not cartesian join + expect_equal(count(where(join(df, df2), df$name == df2$name)), 3) + # cartesian join + expect_error(tryCatch(count(join(df, df2)), error = function(e) { stop(e) }), + paste0(".*(org.apache.spark.sql.AnalysisException: Detected cartesian product for", + " INNER join between logical plans).*")) + + joined <- crossJoin(df, df2) expect_equal(names(joined), c("age", "name", "name", "test")) expect_equal(count(joined), 12) expect_equal(names(collect(joined)), c("age", "name", "name", "test")) diff --git a/docs/sparkr.md b/docs/sparkr.md index 340e7f7cb1a0b..c1829efd18f44 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -591,3 +591,7 @@ You can inspect the search path in R with [`search()`](https://stat.ethz.ch/R-ma - The method `registerTempTable` has been deprecated to be replaced by `createOrReplaceTempView`. - The method `dropTempTable` has been deprecated to be replaced by `dropTempView`. - The `sc` SparkContext parameter is no longer required for these functions: `setJobGroup`, `clearJobGroup`, `cancelJobGroup` + +## Upgrading to SparkR 2.1.0 + + - `join` no longer performs Cartesian Product by default, use `crossJoin` instead. From e371040a0150e4ed748a7c25465965840b61ca63 Mon Sep 17 00:00:00 2001 From: Hossein Date: Fri, 21 Oct 2016 12:38:52 -0700 Subject: [PATCH 081/162] [SPARK-17811] SparkR cannot parallelize data.frame with NA or NULL in Date columns ## What changes were proposed in this pull request? NA date values are serialized as "NA" and NA time values are serialized as NaN from R. In the backend we did not have proper logic to deal with them. As a result we got an IllegalArgumentException for Date and wrong value for time. This PR adds support for deserializing NA as Date and Time. ## How was this patch tested? * [x] TODO Author: Hossein Closes #15421 from falaki/SPARK-17811. 
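The rule the SerDe change below implements can be summarized in a small standalone Scala sketch (the helper names here are illustrative, not the actual SerDe methods): an "NA" date string and a NaN seconds value coming from R both map to SQL NULL.

```
import java.sql.{Date, Timestamp}

object RNullValueSketch {
  // "NA" serialized from R means a missing date.
  def toDate(serialized: String): Date =
    if (serialized == "NA") null else Date.valueOf(serialized)

  // NaN seconds serialized from R means a missing timestamp.
  def toTimestamp(seconds: Double): Timestamp =
    if (seconds.isNaN) {
      null
    } else {
      val sec = math.floor(seconds).toLong
      val ts = new Timestamp(sec * 1000L)
      ts.setNanos(((seconds - sec) * 1e9).toInt)
      ts
    }

  def main(args: Array[String]): Unit = {
    assert(toDate("NA") == null)
    assert(toDate("2016-10-01") == Date.valueOf("2016-10-01"))
    assert(toTimestamp(Double.NaN) == null)
  }
}
```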
--- R/pkg/inst/tests/testthat/test_sparkSQL.R | 13 ++++++++ .../scala/org/apache/spark/api/r/SerDe.scala | 31 +++++++++++++++---- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 3a987cd86213f..b4b43fdba42ce 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -390,6 +390,19 @@ test_that("create DataFrame with different data types", { expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE)) }) +test_that("SPARK-17811: can create DataFrame containing NA as date and time", { + df <- data.frame( + id = 1:2, + time = c(as.POSIXlt("2016-01-10"), NA), + date = c(as.Date("2016-10-01"), NA)) + + DF <- collect(createDataFrame(df)) + expect_true(is.na(DF$date[2])) + expect_equal(DF$date[1], as.Date("2016-10-01")) + expect_true(is.na(DF$time[2])) + expect_equal(DF$time[1], as.POSIXlt("2016-01-10")) +}) + test_that("create DataFrame with complex types", { e <- new.env() assign("n", 3L, envir = e) diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala index e4932a4192d39..550e075a95129 100644 --- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala +++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala @@ -125,15 +125,34 @@ private[spark] object SerDe { } def readDate(in: DataInputStream): Date = { - Date.valueOf(readString(in)) + try { + val inStr = readString(in) + if (inStr == "NA") { + null + } else { + Date.valueOf(inStr) + } + } catch { + // TODO: SPARK-18011 with some versions of R deserializing NA from R results in NASE + case _: NegativeArraySizeException => null + } } def readTime(in: DataInputStream): Timestamp = { - val seconds = in.readDouble() - val sec = Math.floor(seconds).toLong - val t = new Timestamp(sec * 1000L) - t.setNanos(((seconds - sec) * 1e9).toInt) - t + try { + val seconds = in.readDouble() + if (java.lang.Double.isNaN(seconds)) { + null + } else { + val sec = Math.floor(seconds).toLong + val t = new Timestamp(sec * 1000L) + t.setNanos(((seconds - sec) * 1e9).toInt) + t + } + } catch { + // TODO: SPARK-18011 with some versions of R deserializing NA from R results in NASE + case _: NegativeArraySizeException => null + } } def readBytesArr(in: DataInputStream): Array[Array[Byte]] = { From 7a531e3054f8d4820216ed379433559f57f571b8 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Fri, 21 Oct 2016 13:07:29 -0700 Subject: [PATCH 082/162] [SPARK-17926][SQL][STREAMING] Added json for statuses ## What changes were proposed in this pull request? StreamingQueryStatus exposed through StreamingQueryListener often needs to be recorded (similar to SparkListener events). This PR adds `.json` and `.prettyJson` to `StreamingQueryStatus`, `SourceStatus` and `SinkStatus`. ## How was this patch tested? New unit tests Author: Tathagata Das Closes #15476 from tdas/SPARK-17926. 
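A hedged sketch of how the new accessors might be used for recording (the recorder below is hypothetical and not part of the patch; it relies only on the `json` method added here, assuming a `StreamingQueryStatus` obtained from a running query or a listener):

```
import java.io.{FileWriter, PrintWriter}

import org.apache.spark.sql.streaming.StreamingQueryStatus

object StatusRecorder {
  // Appends one compact JSON record per status snapshot, similar to SparkListener event logs.
  def append(status: StreamingQueryStatus, path: String): Unit = {
    val out = new PrintWriter(new FileWriter(path, /* append = */ true))
    try out.println(status.json)   // compact JSON representation added by this patch
    finally out.close()
  }
}
```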
--- python/pyspark/sql/streaming.py | 11 +- .../spark/sql/streaming/SinkStatus.scala | 18 ++- .../spark/sql/streaming/SourceStatus.scala | 23 +++- .../sql/streaming/StreamingQueryStatus.scala | 55 ++++++--- .../streaming/StreamingQueryStatusSuite.scala | 105 ++++++++++++++++++ 5 files changed, 187 insertions(+), 25 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusSuite.scala diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index ce47bd1640fb1..35fc469291684 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -205,8 +205,7 @@ def __str__(self): Pretty string of this query status. >>> print(sqs) - StreamingQueryStatus: - Query name: query + Status of query 'query' Query id: 1 Status timestamp: 123 Input rate: 15.5 rows/sec @@ -220,7 +219,7 @@ def __str__(self): numRows.input.total: 100 triggerId: 5 Source statuses [1 source]: - Source 1: MySource1 + Source 1 - MySource1 Available offset: #0 Input rate: 15.5 rows/sec Processing rate: 23.5 rows/sec @@ -228,7 +227,7 @@ def __str__(self): numRows.input.source: 100 latency.getOffset.source: 10 latency.getBatch.source: 20 - Sink status: MySink + Sink status - MySink Committed offsets: [#1, -] """ return self._jsqs.toString() @@ -366,7 +365,7 @@ def __str__(self): Pretty string of this source status. >>> print(sqs.sourceStatuses[0]) - SourceStatus: MySource1 + Status of source MySource1 Available offset: #0 Input rate: 15.5 rows/sec Processing rate: 23.5 rows/sec @@ -457,7 +456,7 @@ def __str__(self): Pretty string of this source status. >>> print(sqs.sinkStatus) - SinkStatus: MySink + Status of sink MySink Committed offsets: [#1, -] """ return self._jss.toString() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala index c9911665f7d72..ab19602207ad8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/SinkStatus.scala @@ -17,6 +17,11 @@ package org.apache.spark.sql.streaming +import org.json4s._ +import org.json4s.JsonAST.JValue +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + import org.apache.spark.annotation.Experimental import org.apache.spark.sql.streaming.StreamingQueryStatus.indent @@ -34,8 +39,19 @@ class SinkStatus private( val description: String, val offsetDesc: String) { + /** The compact JSON representation of this status. */ + def json: String = compact(render(jsonValue)) + + /** The pretty (i.e. indented) JSON representation of this status. 
*/ + def prettyJson: String = pretty(render(jsonValue)) + override def toString: String = - "SinkStatus:" + indent(prettyString) + "Status of sink " + indent(prettyString).trim + + private[sql] def jsonValue: JValue = { + ("description" -> JString(description)) ~ + ("offsetDesc" -> JString(offsetDesc)) + } private[sql] def prettyString: String = { s"""$description diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/SourceStatus.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/SourceStatus.scala index 6ace4833be22f..cfdf11370e06d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/SourceStatus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/SourceStatus.scala @@ -21,8 +21,14 @@ import java.{util => ju} import scala.collection.JavaConverters._ +import org.json4s._ +import org.json4s.JsonAST.JValue +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + import org.apache.spark.annotation.Experimental import org.apache.spark.sql.streaming.StreamingQueryStatus.indent +import org.apache.spark.util.JsonProtocol /** * :: Experimental :: @@ -47,8 +53,22 @@ class SourceStatus private( val processingRate: Double, val triggerDetails: ju.Map[String, String]) { + /** The compact JSON representation of this status. */ + def json: String = compact(render(jsonValue)) + + /** The pretty (i.e. indented) JSON representation of this status. */ + def prettyJson: String = pretty(render(jsonValue)) + override def toString: String = - "SourceStatus:" + indent(prettyString) + "Status of source " + indent(prettyString).trim + + private[sql] def jsonValue: JValue = { + ("description" -> JString(description)) ~ + ("offsetDesc" -> JString(offsetDesc)) ~ + ("inputRate" -> JDouble(inputRate)) ~ + ("processingRate" -> JDouble(processingRate)) ~ + ("triggerDetails" -> JsonProtocol.mapToJson(triggerDetails.asScala)) + } private[sql] def prettyString: String = { val triggerDetailsLines = @@ -59,7 +79,6 @@ class SourceStatus private( |Processing rate: $processingRate rows/sec |Trigger details: |""".stripMargin + indent(triggerDetailsLines) - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryStatus.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryStatus.scala index 47689928730d0..a50b0d96c13f7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryStatus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryStatus.scala @@ -21,8 +21,14 @@ import java.{util => ju} import scala.collection.JavaConverters._ +import org.json4s._ +import org.json4s.JsonAST.JValue +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + import org.apache.spark.annotation.Experimental import org.apache.spark.sql.execution.streaming.{CompositeOffset, LongOffset} +import org.apache.spark.util.JsonProtocol /** * :: Experimental :: @@ -59,29 +65,46 @@ class StreamingQueryStatus private( import StreamingQueryStatus._ + /** The compact JSON representation of this status. */ + def json: String = compact(render(jsonValue)) + + /** The pretty (i.e. indented) JSON representation of this status. 
*/ + def prettyJson: String = pretty(render(jsonValue)) + override def toString: String = { val sourceStatusLines = sourceStatuses.zipWithIndex.map { case (s, i) => - s"Source ${i + 1}:" + indent(s.prettyString) + s"Source ${i + 1} - " + indent(s.prettyString).trim } - val sinkStatusLines = sinkStatus.prettyString + val sinkStatusLines = sinkStatus.prettyString.trim val triggerDetailsLines = triggerDetails.asScala.map { case (k, v) => s"$k: $v" }.toSeq.sorted val numSources = sourceStatuses.length val numSourcesString = s"$numSources source" + { if (numSources > 1) "s" else "" } - val allLines = s""" - |Query name: $name - |Query id: $id - |Status timestamp: $timestamp - |Input rate: $inputRate rows/sec - |Processing rate $processingRate rows/sec - |Latency: ${latency.getOrElse("-")} ms - |Trigger details: - |${indent(triggerDetailsLines)} - |Source statuses [$numSourcesString]: - |${indent(sourceStatusLines)} - |Sink status: ${indent(sinkStatusLines)}""".stripMargin - - s"StreamingQueryStatus:${indent(allLines)}" + val allLines = + s"""|Query id: $id + |Status timestamp: $timestamp + |Input rate: $inputRate rows/sec + |Processing rate $processingRate rows/sec + |Latency: ${latency.getOrElse("-")} ms + |Trigger details: + |${indent(triggerDetailsLines)} + |Source statuses [$numSourcesString]: + |${indent(sourceStatusLines)} + |Sink status - ${indent(sinkStatusLines).trim}""".stripMargin + + s"Status of query '$name'\n${indent(allLines)}" + } + + private[sql] def jsonValue: JValue = { + ("name" -> JString(name)) ~ + ("id" -> JInt(id)) ~ + ("timestamp" -> JInt(timestamp)) ~ + ("inputRate" -> JDouble(inputRate)) ~ + ("processingRate" -> JDouble(processingRate)) ~ + ("latency" -> latency.map(JDouble).getOrElse(JNothing)) ~ + ("triggerDetails" -> JsonProtocol.mapToJson(triggerDetails.asScala)) + ("sourceStatuses" -> JArray(sourceStatuses.map(_.jsonValue).toList)) ~ + ("sinkStatus" -> sinkStatus.jsonValue) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusSuite.scala new file mode 100644 index 0000000000000..1a98cf2ba74e6 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusSuite.scala @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.streaming + +import org.apache.spark.SparkFunSuite + +class StreamingQueryStatusSuite extends SparkFunSuite { + test("toString") { + assert(StreamingQueryStatus.testStatus.sourceStatuses(0).toString === + """ + |Status of source MySource1 + | Available offset: #0 + | Input rate: 15.5 rows/sec + | Processing rate: 23.5 rows/sec + | Trigger details: + | numRows.input.source: 100 + | latency.getOffset.source: 10 + | latency.getBatch.source: 20 + """.stripMargin.trim, "SourceStatus.toString does not match") + + assert(StreamingQueryStatus.testStatus.sinkStatus.toString === + """ + |Status of sink MySink + | Committed offsets: [#1, -] + """.stripMargin.trim, "SinkStatus.toString does not match") + + assert(StreamingQueryStatus.testStatus.toString === + """ + |Status of query 'query' + | Query id: 1 + | Status timestamp: 123 + | Input rate: 15.5 rows/sec + | Processing rate 23.5 rows/sec + | Latency: 345.0 ms + | Trigger details: + | isDataPresentInTrigger: true + | isTriggerActive: true + | latency.getBatch.total: 20 + | latency.getOffset.total: 10 + | numRows.input.total: 100 + | triggerId: 5 + | Source statuses [1 source]: + | Source 1 - MySource1 + | Available offset: #0 + | Input rate: 15.5 rows/sec + | Processing rate: 23.5 rows/sec + | Trigger details: + | numRows.input.source: 100 + | latency.getOffset.source: 10 + | latency.getBatch.source: 20 + | Sink status - MySink + | Committed offsets: [#1, -] + """.stripMargin.trim, "StreamingQueryStatus.toString does not match") + + } + + test("json") { + assert(StreamingQueryStatus.testStatus.json === + """ + |{"sourceStatuses":[{"description":"MySource1","offsetDesc":"#0","inputRate":15.5, + |"processingRate":23.5,"triggerDetails":{"numRows.input.source":"100", + |"latency.getOffset.source":"10","latency.getBatch.source":"20"}}], + |"sinkStatus":{"description":"MySink","offsetDesc":"[#1, -]"}} + """.stripMargin.replace("\n", "").trim) + } + + test("prettyJson") { + assert( + StreamingQueryStatus.testStatus.prettyJson === + """ + |{ + | "sourceStatuses" : [ { + | "description" : "MySource1", + | "offsetDesc" : "#0", + | "inputRate" : 15.5, + | "processingRate" : 23.5, + | "triggerDetails" : { + | "numRows.input.source" : "100", + | "latency.getOffset.source" : "10", + | "latency.getBatch.source" : "20" + | } + | } ], + | "sinkStatus" : { + | "description" : "MySink", + | "offsetDesc" : "[#1, -]" + | } + |} + """.stripMargin.trim) + } +} From c1f344f1a09b8834bec70c1ece30b9bff63e55ea Mon Sep 17 00:00:00 2001 From: w00228970 Date: Fri, 21 Oct 2016 14:43:55 -0700 Subject: [PATCH 083/162] [SPARK-17929][CORE] Fix deadlock when CoarseGrainedSchedulerBackend reset ## What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-17929 Now `CoarseGrainedSchedulerBackend` reset will get the lock, ``` protected def reset(): Unit = synchronized { numPendingExecutors = 0 executorsPendingToRemove.clear() // Remove all the lingering executors that should be removed but not yet. The reason might be // because (1) disconnected event is not yet received; (2) executors die silently. executorDataMap.toMap.foreach { case (eid, _) => driverEndpoint.askWithRetry[Boolean]( RemoveExecutor(eid, SlaveLost("Stale executor after cluster manager re-registered."))) } } ``` but on removeExecutor also need the lock "CoarseGrainedSchedulerBackend.this.synchronized", this will cause deadlock. 
``` private def removeExecutor(executorId: String, reason: ExecutorLossReason): Unit = { logDebug(s"Asked to remove executor $executorId with reason $reason") executorDataMap.get(executorId) match { case Some(executorInfo) => // This must be synchronized because variables mutated // in this block are read when requesting executors val killed = CoarseGrainedSchedulerBackend.this.synchronized { addressToExecutorId -= executorInfo.executorAddress executorDataMap -= executorId executorsPendingLossReason -= executorId executorsPendingToRemove.remove(executorId).getOrElse(false) } ... ## How was this patch tested? manual test. Author: w00228970 Closes #15481 from scwf/spark-17929. --- .../cluster/CoarseGrainedSchedulerBackend.scala | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 0dae0e614e17d..10d55c87fb8de 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -386,15 +386,17 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp * Reset the state of CoarseGrainedSchedulerBackend to the initial state. Currently it will only * be called in the yarn-client mode when AM re-registers after a failure. * */ - protected def reset(): Unit = synchronized { - numPendingExecutors = 0 - executorsPendingToRemove.clear() + protected def reset(): Unit = { + val executors = synchronized { + numPendingExecutors = 0 + executorsPendingToRemove.clear() + Set() ++ executorDataMap.keys + } // Remove all the lingering executors that should be removed but not yet. The reason might be // because (1) disconnected event is not yet received; (2) executors die silently. - executorDataMap.toMap.foreach { case (eid, _) => - driverEndpoint.askWithRetry[Boolean]( - RemoveExecutor(eid, SlaveLost("Stale executor after cluster manager re-registered."))) + executors.foreach { eid => + removeExecutor(eid, SlaveLost("Stale executor after cluster manager re-registered.")) } } From 140570252fd3739d6bdcadd6d4d5a180e480d3e0 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 21 Oct 2016 15:28:16 -0700 Subject: [PATCH 084/162] [SPARK-18044][STREAMING] FileStreamSource should not infer partitions in every batch ## What changes were proposed in this pull request? In `FileStreamSource.getBatch`, we will create a `DataSource` with specified schema, to avoid inferring the schema again and again. However, we don't pass the partition columns, and will infer the partition again and again. This PR fixes it by keeping the partition columns in `FileStreamSource`, like schema. ## How was this patch tested? N/A Author: Wenchen Fan Closes #15581 from cloud-fan/stream. 
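For context, a sketch of the kind of workload this change targets (paths, schema and sink options below are placeholders, not from the patch): a partitioned file stream, where before this change every micro-batch re-listed the input directories to re-infer the partition columns.

```
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

object PartitionedFileStreamSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("fileStream").getOrCreate()

    val schema = new StructType()
      .add("id", LongType)
      .add("value", StringType)
      .add("date", StringType)   // partition column, e.g. /tmp/input/date=2016-10-21/...

    // With this patch, the partition columns discovered when the source is created are
    // kept in FileStreamSource and reused by getBatch instead of being re-inferred.
    val stream = spark.readStream
      .schema(schema)
      .format("parquet")
      .load("/tmp/input")        // placeholder path containing date=... subdirectories

    val query = stream.writeStream
      .format("memory")
      .queryName("files")
      .option("checkpointLocation", "/tmp/checkpoint")   // placeholder
      .start()

    query.awaitTermination(10000)
    spark.stop()
  }
}
```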
--- .../execution/datasources/DataSource.scala | 26 +++++++++++++------ .../streaming/FileStreamSource.scala | 2 ++ .../streaming/FileStreamSourceSuite.scala | 2 +- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 92b1fff7d8127..17da606580eea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -75,7 +75,7 @@ case class DataSource( bucketSpec: Option[BucketSpec] = None, options: Map[String, String] = Map.empty) extends Logging { - case class SourceInfo(name: String, schema: StructType) + case class SourceInfo(name: String, schema: StructType, partitionColumns: Seq[String]) lazy val providingClass: Class[_] = lookupDataSource(className) lazy val sourceInfo = sourceSchema() @@ -186,8 +186,11 @@ case class DataSource( } } - private def inferFileFormatSchema(format: FileFormat): StructType = { - userSpecifiedSchema.orElse { + /** + * Infer the schema of the given FileFormat, returns a pair of schema and partition column names. + */ + private def inferFileFormatSchema(format: FileFormat): (StructType, Seq[String]) = { + userSpecifiedSchema.map(_ -> partitionColumns).orElse { val caseInsensitiveOptions = new CaseInsensitiveMap(options) val allPaths = caseInsensitiveOptions.get("path") val globbedPaths = allPaths.toSeq.flatMap { path => @@ -197,14 +200,14 @@ case class DataSource( SparkHadoopUtil.get.globPathIfNecessary(qualified) }.toArray val fileCatalog = new ListingFileCatalog(sparkSession, globbedPaths, options, None) - val partitionCols = fileCatalog.partitionSpec().partitionColumns.fields + val partitionSchema = fileCatalog.partitionSpec().partitionColumns val inferred = format.inferSchema( sparkSession, caseInsensitiveOptions, fileCatalog.allFiles()) inferred.map { inferredSchema => - StructType(inferredSchema ++ partitionCols) + StructType(inferredSchema ++ partitionSchema) -> partitionSchema.map(_.name) } }.getOrElse { throw new AnalysisException("Unable to infer schema. 
It must be specified manually.") @@ -217,7 +220,7 @@ case class DataSource( case s: StreamSourceProvider => val (name, schema) = s.sourceSchema( sparkSession.sqlContext, userSpecifiedSchema, className, options) - SourceInfo(name, schema) + SourceInfo(name, schema, Nil) case format: FileFormat => val caseInsensitiveOptions = new CaseInsensitiveMap(options) @@ -246,7 +249,8 @@ case class DataSource( "you may be able to create a static DataFrame on that directory with " + "'spark.read.load(directory)' and infer schema from it.") } - SourceInfo(s"FileSource[$path]", inferFileFormatSchema(format)) + val (schema, partCols) = inferFileFormatSchema(format) + SourceInfo(s"FileSource[$path]", schema, partCols) case _ => throw new UnsupportedOperationException( @@ -266,7 +270,13 @@ case class DataSource( throw new IllegalArgumentException("'path' is not specified") }) new FileStreamSource( - sparkSession, path, className, sourceInfo.schema, metadataPath, options) + sparkSession = sparkSession, + path = path, + fileFormatClassName = className, + schema = sourceInfo.schema, + partitionColumns = sourceInfo.partitionColumns, + metadataPath = metadataPath, + options = options) case _ => throw new UnsupportedOperationException( s"Data source $className does not support streamed reading") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala index 614a6261e7c28..115edf7ab2b61 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala @@ -35,6 +35,7 @@ class FileStreamSource( path: String, fileFormatClassName: String, override val schema: StructType, + partitionColumns: Seq[String], metadataPath: String, options: Map[String, String]) extends Source with Logging { @@ -142,6 +143,7 @@ class FileStreamSource( sparkSession, paths = files.map(_.path), userSpecifiedSchema = Some(schema), + partitionColumns = partitionColumns, className = fileFormatClassName, options = optionsWithPartitionBasePath) Dataset.ofRows(sparkSession, LogicalRelation(newDataSource.resolveRelation( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceSuite.scala index 3e1e1126f9e6b..4a47c04d3f084 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceSuite.scala @@ -94,7 +94,7 @@ class FileStreamSourceSuite extends SparkFunSuite with SharedSQLContext { new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, dir.getAbsolutePath) assert(metadataLog.add(0, Array(FileEntry(s"$scheme:///file1", 100L, 0)))) - val newSource = new FileStreamSource(spark, s"$scheme:///", "parquet", StructType(Nil), + val newSource = new FileStreamSource(spark, s"$scheme:///", "parquet", StructType(Nil), Nil, dir.getAbsolutePath, Map.empty) // this method should throw an exception if `fs.exists` is called during resolveRelation newSource.getBatch(None, LongOffset(1)) From 268ccb9a48dfefc4d7bc85155e7e20a2dfe89307 Mon Sep 17 00:00:00 2001 From: cody koeninger Date: Fri, 21 Oct 2016 15:55:04 -0700 Subject: [PATCH 085/162] [SPARK-17812][SQL][KAFKA] Assign and specific startingOffsets for structured stream ## What changes were 
proposed in this pull request? startingOffsets takes specific per-topicpartition offsets as a json argument, usable with any consumer strategy assign with specific topicpartitions as a consumer strategy ## How was this patch tested? Unit tests Author: cody koeninger Closes #15504 from koeninger/SPARK-17812. --- .../structured-streaming-kafka-integration.md | 38 ++++-- .../apache/spark/sql/kafka010/JsonUtils.scala | 93 ++++++++++++++ .../spark/sql/kafka010/KafkaSource.scala | 64 ++++++++-- .../sql/kafka010/KafkaSourceProvider.scala | 52 ++++---- .../spark/sql/kafka010/StartingOffsets.scala | 32 +++++ .../spark/sql/kafka010/JsonUtilsSuite.scala | 45 +++++++ .../spark/sql/kafka010/KafkaSourceSuite.scala | 114 ++++++++++++++++-- .../spark/sql/kafka010/KafkaTestUtils.scala | 14 ++- 8 files changed, 395 insertions(+), 57 deletions(-) create mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/JsonUtils.scala create mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/StartingOffsets.scala create mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/JsonUtilsSuite.scala diff --git a/docs/structured-streaming-kafka-integration.md b/docs/structured-streaming-kafka-integration.md index 668489addf82c..e851f210c92c4 100644 --- a/docs/structured-streaming-kafka-integration.md +++ b/docs/structured-streaming-kafka-integration.md @@ -150,16 +150,25 @@ The following options must be set for the Kafka source. + + + + + - + - @@ -174,16 +183,21 @@ The following configurations are optional:
<tr><th>Option</th><th>value</th><th>meaning</th></tr>
<tr>
+  <td>assign</td>
+  <td>json string {"topicA":[0,1],"topicB":[2,4]}</td>
+  <td>Specific TopicPartitions to consume.
+  Only one of "assign", "subscribe" or "subscribePattern"
+  options can be specified for Kafka source.</td>
+</tr>
+<tr>
  <td>subscribe</td>
  <td>A comma-separated list of topics</td>
-  <td>The topic list to subscribe. Only one of "subscribe" and "subscribePattern" options can be
-  specified for Kafka source.</td>
+  <td>The topic list to subscribe.
+  Only one of "assign", "subscribe" or "subscribePattern"
+  options can be specified for Kafka source.</td>
</tr>
<tr>
  <td>subscribePattern</td>
  <td>Java regex string</td>
-  <td>The pattern used to subscribe the topic. Only one of "subscribe" and "subscribePattern"
+  <td>The pattern used to subscribe to topic(s).
+  Only one of "assign", "subscribe" or "subscribePattern"
  options can be specified for Kafka source.</td>
</tr>
    - - - - + + + + - + + + + + + From eff4aed1ac1e500d4aa40665dd06b527dffbc111 Mon Sep 17 00:00:00 2001 From: Tejas Patil Date: Sat, 22 Oct 2016 20:43:43 -0700 Subject: [PATCH 096/162] [SPARK-18035][SQL] Introduce performant and memory efficient APIs to create ArrayBasedMapData ## What changes were proposed in this pull request? Jira: https://issues.apache.org/jira/browse/SPARK-18035 In HiveInspectors, I saw that converting Java map to Spark's `ArrayBasedMapData` spent quite sometime in buffer copying : https://github.com/apache/spark/blob/master/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala#L658 The reason being `map.toSeq` allocates a new buffer and copies the map entries to it: https://github.com/scala/scala/blob/2.11.x/src/library/scala/collection/MapLike.scala#L323 This copy is not needed as we get rid of it once we extract the key and value arrays. Here is the call trace: ``` org.apache.spark.sql.hive.HiveInspectors$$anonfun$unwrapperFor$41.apply(HiveInspectors.scala:664) scala.collection.AbstractMap.toSeq(Map.scala:59) scala.collection.MapLike$class.toSeq(MapLike.scala:323) scala.collection.AbstractMap.toBuffer(Map.scala:59) scala.collection.MapLike$class.toBuffer(MapLike.scala:326) scala.collection.AbstractTraversable.copyToBuffer(Traversable.scala:104) scala.collection.TraversableOnce$class.copyToBuffer(TraversableOnce.scala:275) scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48) scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104) scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59) scala.collection.AbstractIterable.foreach(Iterable.scala:54) scala.collection.IterableLike$class.foreach(IterableLike.scala:72) scala.collection.AbstractIterator.foreach(Iterator.scala:1336) scala.collection.Iterator$class.foreach(Iterator.scala:893) scala.collection.generic.Growable$$anonfun$$plus$plus$eq$1.apply(Growable.scala:59) scala.collection.generic.Growable$$anonfun$$plus$plus$eq$1.apply(Growable.scala:59) ``` Also, earlier code was populating keys and values arrays separately by iterating twice. The PR avoids double iteration of the map and does it in one iteration. EDIT: During code review, there were several more places in the code which were found to do similar thing. The PR dedupes those instances and introduces convenient APIs which are performant and memory efficient ## Performance gains The number is subjective and depends on how many map columns are accessed in the query and average entries per map. For one the queries that I tried out, I saw 3% CPU savings (end-to-end) for the query. ## How was this patch tested? This does not change the end result produced so relying on existing tests. Author: Tejas Patil Closes #15573 from tejasapatil/SPARK-18035_avoid_toSeq. 
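A small usage sketch of the new single-pass constructors (internal catalyst API; the data below is made up). The converters are applied while iterating the input map, so no intermediate `toSeq` buffer is allocated:

```
import scala.collection.JavaConverters._

import org.apache.spark.sql.catalyst.util.ArrayBasedMapData
import org.apache.spark.unsafe.types.UTF8String

object MapDataSketch {
  def main(args: Array[String]): Unit = {
    // Scala map: key and value converters are applied in a single pass.
    val scalaMap = Map("a" -> 1, "b" -> 2)
    val fromScala = ArrayBasedMapData(
      scalaMap,
      (key: Any) => UTF8String.fromString(key.asInstanceOf[String]),
      (value: Any) => value)
    assert(fromScala.numElements() == 2)

    // Java map: same converters, iterated directly over the entry set.
    val javaMap = Map("c" -> 3).asJava
    val fromJava = ArrayBasedMapData(
      javaMap,
      (key: Any) => UTF8String.fromString(key.asInstanceOf[String]),
      (value: Any) => value)
    assert(fromJava.numElements() == 1)
  }
}
```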
--- .../sql/catalyst/CatalystTypeConverters.scala | 53 +++--------- .../expressions/complexTypeCreator.scala | 32 +++++--- .../sql/catalyst/util/ArrayBasedMapData.scala | 81 ++++++++++++++++++- .../sql/execution/python/EvaluatePython.scala | 10 +-- .../spark/sql/hive/HiveInspectors.scala | 11 +-- 5 files changed, 119 insertions(+), 68 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala index f542f5cf40506..5b9161551a7af 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala @@ -199,34 +199,14 @@ object CatalystTypeConverters { private[this] val keyConverter = getConverterForType(keyType) private[this] val valueConverter = getConverterForType(valueType) - override def toCatalystImpl(scalaValue: Any): MapData = scalaValue match { - case m: Map[_, _] => - val length = m.size - val convertedKeys = new Array[Any](length) - val convertedValues = new Array[Any](length) - - var i = 0 - for ((key, value) <- m) { - convertedKeys(i) = keyConverter.toCatalyst(key) - convertedValues(i) = valueConverter.toCatalyst(value) - i += 1 - } - ArrayBasedMapData(convertedKeys, convertedValues) - - case jmap: JavaMap[_, _] => - val length = jmap.size() - val convertedKeys = new Array[Any](length) - val convertedValues = new Array[Any](length) - - var i = 0 - val iter = jmap.entrySet.iterator - while (iter.hasNext) { - val entry = iter.next() - convertedKeys(i) = keyConverter.toCatalyst(entry.getKey) - convertedValues(i) = valueConverter.toCatalyst(entry.getValue) - i += 1 - } - ArrayBasedMapData(convertedKeys, convertedValues) + override def toCatalystImpl(scalaValue: Any): MapData = { + val keyFunction = (k: Any) => keyConverter.toCatalyst(k) + val valueFunction = (k: Any) => valueConverter.toCatalyst(k) + + scalaValue match { + case map: Map[_, _] => ArrayBasedMapData(map, keyFunction, valueFunction) + case javaMap: JavaMap[_, _] => ArrayBasedMapData(javaMap, keyFunction, valueFunction) + } } override def toScala(catalystValue: MapData): Map[Any, Any] = { @@ -433,18 +413,11 @@ object CatalystTypeConverters { case seq: Seq[Any] => new GenericArrayData(seq.map(convertToCatalyst).toArray) case r: Row => InternalRow(r.toSeq.map(convertToCatalyst): _*) case arr: Array[Any] => new GenericArrayData(arr.map(convertToCatalyst)) - case m: Map[_, _] => - val length = m.size - val convertedKeys = new Array[Any](length) - val convertedValues = new Array[Any](length) - - var i = 0 - for ((key, value) <- m) { - convertedKeys(i) = convertToCatalyst(key) - convertedValues(i) = convertToCatalyst(value) - i += 1 - } - ArrayBasedMapData(convertedKeys, convertedValues) + case map: Map[_, _] => + ArrayBasedMapData( + map, + (key: Any) => convertToCatalyst(key), + (value: Any) => convertToCatalyst(value)) case other => other } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 09e22aaf3e3d8..917aa0873130b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -427,18 +427,28 @@ case class StringToMap(text: Expression, pairDelim: Expression, 
keyValueDelim: E } } - override def nullSafeEval(str: Any, delim1: Any, delim2: Any): Any = { - val array = str.asInstanceOf[UTF8String] - .split(delim1.asInstanceOf[UTF8String], -1) - .map { kv => - val arr = kv.split(delim2.asInstanceOf[UTF8String], 2) - if (arr.length < 2) { - Array(arr(0), null) - } else { - arr - } + override def nullSafeEval( + inputString: Any, + stringDelimiter: Any, + keyValueDelimiter: Any): Any = { + val keyValues = + inputString.asInstanceOf[UTF8String].split(stringDelimiter.asInstanceOf[UTF8String], -1) + + val iterator = new Iterator[(UTF8String, UTF8String)] { + var index = 0 + val keyValueDelimiterUTF8String = keyValueDelimiter.asInstanceOf[UTF8String] + + override def hasNext: Boolean = { + keyValues.length > index } - ArrayBasedMapData(array.map(_ (0)), array.map(_ (1))) + + override def next(): (UTF8String, UTF8String) = { + val keyValueArray = keyValues(index).split(keyValueDelimiterUTF8String, 2) + index += 1 + (keyValueArray(0), if (keyValueArray.length < 2) null else keyValueArray(1)) + } + } + ArrayBasedMapData(iterator, keyValues.size, identity, identity) } override def prettyName: String = "str_to_map" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala index 4449da13c083c..91b3139443696 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.util +import java.util.{Map => JavaMap} + class ArrayBasedMapData(val keyArray: ArrayData, val valueArray: ArrayData) extends MapData { require(keyArray.numElements() == valueArray.numElements()) @@ -30,12 +32,83 @@ class ArrayBasedMapData(val keyArray: ArrayData, val valueArray: ArrayData) exte } object ArrayBasedMapData { - def apply(map: Map[Any, Any]): ArrayBasedMapData = { - val array = map.toArray - ArrayBasedMapData(array.map(_._1), array.map(_._2)) + /** + * Creates a [[ArrayBasedMapData]] by applying the given converters over + * each (key -> value) pair of the input [[java.util.Map]] + * + * @param javaMap Input map + * @param keyConverter This function is applied over all the keys of the input map to + * obtain the output map's keys + * @param valueConverter This function is applied over all the values of the input map to + * obtain the output map's values + */ + def apply( + javaMap: JavaMap[_, _], + keyConverter: (Any) => Any, + valueConverter: (Any) => Any): ArrayBasedMapData = { + import scala.language.existentials + + val keys: Array[Any] = new Array[Any](javaMap.size()) + val values: Array[Any] = new Array[Any](javaMap.size()) + + var i: Int = 0 + val iterator = javaMap.entrySet().iterator() + while (iterator.hasNext) { + val entry = iterator.next() + keys(i) = keyConverter(entry.getKey) + values(i) = valueConverter(entry.getValue) + i += 1 + } + ArrayBasedMapData(keys, values) + } + + /** + * Creates a [[ArrayBasedMapData]] by applying the given converters over + * each (key -> value) pair of the input map + * + * @param map Input map + * @param keyConverter This function is applied over all the keys of the input map to + * obtain the output map's keys + * @param valueConverter This function is applied over all the values of the input map to + * obtain the output map's values + */ + def apply( + map: scala.collection.Map[_, _], + keyConverter: (Any) => Any = identity, 
+ valueConverter: (Any) => Any = identity): ArrayBasedMapData = { + ArrayBasedMapData(map.iterator, map.size, keyConverter, valueConverter) + } + + /** + * Creates a [[ArrayBasedMapData]] by applying the given converters over + * each (key -> value) pair from the given iterator + * + * @param iterator Input iterator + * @param size Number of elements + * @param keyConverter This function is applied over all the keys extracted from the + * given iterator to obtain the output map's keys + * @param valueConverter This function is applied over all the values extracted from the + * given iterator to obtain the output map's values + */ + def apply( + iterator: Iterator[(_, _)], + size: Int, + keyConverter: (Any) => Any, + valueConverter: (Any) => Any): ArrayBasedMapData = { + + val keys: Array[Any] = new Array[Any](size) + val values: Array[Any] = new Array[Any](size) + + var i = 0 + for ((key, value) <- iterator) { + keys(i) = keyConverter(key) + values(i) = valueConverter(value) + i += 1 + } + ArrayBasedMapData(keys, values) } - def apply(keys: Array[Any], values: Array[Any]): ArrayBasedMapData = { + def apply(keys: Array[_], values: Array[_]): ArrayBasedMapData = { new ArrayBasedMapData(new GenericArrayData(keys), new GenericArrayData(values)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala index 724025b4647f4..46fd54e5c7420 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala @@ -124,11 +124,11 @@ object EvaluatePython { case (c, ArrayType(elementType, _)) if c.getClass.isArray => new GenericArrayData(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) - case (c: java.util.Map[_, _], MapType(keyType, valueType, _)) => - val keyValues = c.asScala.toSeq - val keys = keyValues.map(kv => fromJava(kv._1, keyType)).toArray - val values = keyValues.map(kv => fromJava(kv._2, valueType)).toArray - ArrayBasedMapData(keys, values) + case (javaMap: java.util.Map[_, _], MapType(keyType, valueType, _)) => + ArrayBasedMapData( + javaMap, + (key: Any) => fromJava(key, keyType), + (value: Any) => fromJava(value, valueType)) case (c, StructType(fields)) if c.getClass.isArray => val array = c.asInstanceOf[Array[_]] diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 1625116803505..e303065127c3b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -473,10 +473,8 @@ private[hive] trait HiveInspectors { case mi: StandardConstantMapObjectInspector => val keyUnwrapper = unwrapperFor(mi.getMapKeyObjectInspector) val valueUnwrapper = unwrapperFor(mi.getMapValueObjectInspector) - val keyValues = mi.getWritableConstantValue.asScala.toSeq - val keys = keyValues.map(kv => keyUnwrapper(kv._1)).toArray - val values = keyValues.map(kv => valueUnwrapper(kv._2)).toArray - val constant = ArrayBasedMapData(keys, values) + val keyValues = mi.getWritableConstantValue + val constant = ArrayBasedMapData(keyValues, keyUnwrapper, valueUnwrapper) _ => constant case li: StandardConstantListObjectInspector => val unwrapper = unwrapperFor(li.getListElementObjectInspector) @@ -655,10 +653,7 @@ private[hive] trait HiveInspectors { if (map == null) { 
null } else { - val keyValues = map.asScala.toSeq - val keys = keyValues.map(kv => keyUnwrapper(kv._1)).toArray - val values = keyValues.map(kv => valueUnwrapper(kv._2)).toArray - ArrayBasedMapData(keys, values) + ArrayBasedMapData(map, keyUnwrapper, valueUnwrapper) } } else { null From 21c7539a5274a7e77686d17a6261d56592b85c2d Mon Sep 17 00:00:00 2001 From: Tejas Patil Date: Sun, 23 Oct 2016 13:25:47 +0200 Subject: [PATCH 097/162] [SPARK-18038][SQL] Move output partitioning definition from UnaryNodeExec to its children ## What changes were proposed in this pull request? Jira : https://issues.apache.org/jira/browse/SPARK-18038 This was a suggestion by rxin over one of the dev list discussion : http://apache-spark-developers-list.1001551.n3.nabble.com/Project-not-preserving-child-partitioning-td19417.html His words: >> It would be better (safer) to move the output partitioning definition into each of the operator and remove it from UnaryExecNode. With this PR, following is the output partitioning and ordering for all the impls of `UnaryExecNode`. UnaryExecNode's impl | outputPartitioning | outputOrdering | comment ------------ | ------------- | ------------ | ------------ AppendColumnsExec | child's | Nil | child's ordering can be used AppendColumnsWithObjectExec | child's | Nil | child's ordering can be used BroadcastExchangeExec | BroadcastPartitioning | Nil | - CoalesceExec | UnknownPartitioning | Nil | - CollectLimitExec | SinglePartition | Nil | - DebugExec | child's | Nil | child's ordering can be used DeserializeToObjectExec | child's | Nil | child's ordering can be used ExpandExec | UnknownPartitioning | Nil | - FilterExec | child's | child's | - FlatMapGroupsInRExec | child's | Nil | child's ordering can be used GenerateExec | child's | Nil | need to dig more GlobalLimitExec | child's | child's | - HashAggregateExec | child's | Nil | - InputAdapter | child's | child's | - InsertIntoHiveTable | child's | Nil | terminal node, doesn't need partitioning LocalLimitExec | child's | child's | - MapElementsExec | child's | child's | - MapGroupsExec | child's | Nil | child's ordering can be used MapPartitionsExec | child's | Nil | child's ordering can be used ProjectExec | child's | child's | - SampleExec | child's | Nil | child's ordering can be used ScriptTransformation | child's | Nil | child's ordering can be used SerializeFromObjectExec | child's | Nil | child's ordering can be used ShuffleExchange | custom | Nil | - SortAggregateExec | child's | sort over grouped exprs | - SortExec | child's | custom | - StateStoreRestoreExec | child's | Nil | child's ordering can be used StateStoreSaveExec | child's | Nil | child's ordering can be used SubqueryExec | child's | child's | - TakeOrderedAndProjectExec | SinglePartition | custom | - WholeStageCodegenExec | child's | child's | - WindowExec | child's | child's | - ## How was this patch tested? This does NOT change any existing functionality so relying on existing tests Author: Tejas Patil Closes #15575 from tejasapatil/SPARK-18038_UnaryNodeExec_output_partitioning. 
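The convention after this change can be illustrated with a hypothetical pass-through operator (not part of the patch): every unary physical operator now declares its own `outputPartitioning` instead of inheriting a blanket child-forwarding default from `UnaryExecNode`.

```
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}

// A do-nothing unary operator that forwards rows unchanged.
case class PassThroughExec(child: SparkPlan) extends UnaryExecNode {
  override def output: Seq[Attribute] = child.output

  // No longer provided by UnaryExecNode: each operator states its own partitioning.
  override def outputPartitioning: Partitioning = child.outputPartitioning

  // Rows are not reordered, so the child's ordering is preserved as well.
  override def outputOrdering: Seq[SortOrder] = child.outputOrdering

  override protected def doExecute(): RDD[InternalRow] = child.execute()
}
```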
--- .../spark/sql/execution/GenerateExec.scala | 3 +++ .../apache/spark/sql/execution/SortExec.scala | 6 +++++- .../apache/spark/sql/execution/SparkPlan.scala | 2 -- .../sql/execution/WholeStageCodegenExec.scala | 4 ++++ .../execution/aggregate/HashAggregateExec.scala | 2 ++ .../execution/aggregate/SortAggregateExec.scala | 4 +++- .../sql/execution/basicPhysicalOperators.scala | 8 ++++++++ .../spark/sql/execution/debug/package.scala | 4 +++- .../org/apache/spark/sql/execution/limit.scala | 16 +++++++++++----- .../apache/spark/sql/execution/objects.scala | 17 +++++++++++++++++ .../execution/streaming/StatefulAggregate.scala | 6 ++++++ .../spark/sql/execution/window/WindowExec.scala | 2 ++ .../spark/sql/execution/ReferenceSort.scala | 2 ++ .../hive/execution/InsertIntoHiveTable.scala | 4 +++- .../hive/execution/ScriptTransformation.scala | 3 +++ .../execution/ScriptTransformationSuite.scala | 4 ++++ 16 files changed, 76 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GenerateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GenerateExec.scala index 39189a2b0c72c..2663129562660 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GenerateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GenerateExec.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.metric.SQLMetrics /** @@ -60,6 +61,8 @@ case class GenerateExec( override def producedAttributes: AttributeSet = AttributeSet(output) + override def outputPartitioning: Partitioning = child.outputPartitioning + val boundGenerator = BindReferences.bindReference(generator, child.output) protected override def doExecute(): RDD[InternalRow] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala index d8e0675e3eb65..cc576bbc4c802 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala @@ -23,7 +23,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} -import org.apache.spark.sql.catalyst.plans.physical.{Distribution, OrderedDistribution, UnspecifiedDistribution} +import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.metric.SQLMetrics /** @@ -45,6 +45,10 @@ case class SortExec( override def outputOrdering: Seq[SortOrder] = sortOrder + // sort performed is local within a given partition so will retain + // child operator's partitioning + override def outputPartitioning: Partitioning = child.outputPartitioning + override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 48d6ef6dcd44a..24d0cffef82a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ 
-395,8 +395,6 @@ trait UnaryExecNode extends SparkPlan { def child: SparkPlan override final def children: Seq[SparkPlan] = child :: Nil - - override def outputPartitioning: Partitioning = child.outputPartitioning } trait BinaryExecNode extends SparkPlan { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 62bf6f4a81eec..6303483f22fd3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -218,7 +218,9 @@ trait CodegenSupport extends SparkPlan { case class InputAdapter(child: SparkPlan) extends UnaryExecNode with CodegenSupport { override def output: Seq[Attribute] = child.output + override def outputPartitioning: Partitioning = child.outputPartitioning + override def outputOrdering: Seq[SortOrder] = child.outputOrdering override def doExecute(): RDD[InternalRow] = { @@ -292,7 +294,9 @@ object WholeStageCodegenExec { case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with CodegenSupport { override def output: Seq[Attribute] = child.output + override def outputPartitioning: Partitioning = child.outputPartitioning + override def outputOrdering: Seq[SortOrder] = child.outputOrdering override lazy val metrics = Map( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index 06199ef3e8243..4529ed067e565 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -63,6 +63,8 @@ case class HashAggregateExec( override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute) + override def outputPartitioning: Partitioning = child.outputPartitioning + override def producedAttributes: AttributeSet = AttributeSet(aggregateAttributes) ++ AttributeSet(resultExpressions.diff(groupingExpressions).map(_.toAttribute)) ++ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala index 2a81a823c44b3..be3198b8e7d82 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, UnspecifiedDistribution} +import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.util.Utils @@ -66,6 +66,8 @@ case class SortAggregateExec( groupingExpressions.map(SortOrder(_, Ascending)) :: Nil } + override def outputPartitioning: Partitioning = child.outputPartitioning + override def outputOrdering: Seq[SortOrder] = { groupingExpressions.map(SortOrder(_, Ascending)) } diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index dd78a784915d2..37d750e621c25 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -78,6 +78,8 @@ case class ProjectExec(projectList: Seq[NamedExpression], child: SparkPlan) } override def outputOrdering: Seq[SortOrder] = child.outputOrdering + + override def outputPartitioning: Partitioning = child.outputPartitioning } @@ -214,6 +216,8 @@ case class FilterExec(condition: Expression, child: SparkPlan) } override def outputOrdering: Seq[SortOrder] = child.outputOrdering + + override def outputPartitioning: Partitioning = child.outputPartitioning } /** @@ -234,6 +238,8 @@ case class SampleExec( child: SparkPlan) extends UnaryExecNode with CodegenSupport { override def output: Seq[Attribute] = child.output + override def outputPartitioning: Partitioning = child.outputPartitioning + override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) @@ -517,7 +523,9 @@ case class SubqueryExec(name: String, child: SparkPlan) extends UnaryExecNode { "collectTime" -> SQLMetrics.createMetric(sparkContext, "time to collect (ms)")) override def output: Seq[Attribute] = child.output + override def outputPartitioning: Partitioning = child.outputPartitioning + override def outputOrdering: Seq[SortOrder] = child.outputOrdering override def sameResult(o: SparkPlan): Boolean = o match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index dd9d83767e221..0395c43ba2cbc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -27,8 +27,8 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, ExprCode} +import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.trees.TreeNodeRef -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.{AccumulatorV2, LongAccumulator} /** @@ -162,6 +162,8 @@ package object debug { } } + override def outputPartitioning: Partitioning = child.outputPartitioning + override def inputRDDs(): Seq[RDD[InternalRow]] = { child.asInstanceOf[CodegenSupport].inputRDDs() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala index 86a8770715600..9918ac327f2dd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala @@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.exchange.ShuffleExchange import org.apache.spark.util.Utils - /** * Take the first `limit` elements and collect them to a single partition. 
* @@ -54,8 +53,7 @@ case class CollectLimitExec(limit: Int, child: SparkPlan) extends UnaryExecNode trait BaseLimitExec extends UnaryExecNode with CodegenSupport { val limit: Int override def output: Seq[Attribute] = child.output - override def outputOrdering: Seq[SortOrder] = child.outputOrdering - override def outputPartitioning: Partitioning = child.outputPartitioning + protected override def doExecute(): RDD[InternalRow] = child.execute().mapPartitions { iter => iter.take(limit) } @@ -95,14 +93,22 @@ trait BaseLimitExec extends UnaryExecNode with CodegenSupport { * Take the first `limit` elements of each child partition, but do not collect or shuffle them. */ case class LocalLimitExec(limit: Int, child: SparkPlan) extends BaseLimitExec { + override def outputOrdering: Seq[SortOrder] = child.outputOrdering + + override def outputPartitioning: Partitioning = child.outputPartitioning } /** * Take the first `limit` elements of the child's single output partition. */ case class GlobalLimitExec(limit: Int, child: SparkPlan) extends BaseLimitExec { + override def requiredChildDistribution: List[Distribution] = AllTuples :: Nil + + override def outputPartitioning: Partitioning = child.outputPartitioning + + override def outputOrdering: Seq[SortOrder] = child.outputOrdering } /** @@ -122,8 +128,6 @@ case class TakeOrderedAndProjectExec( projectList.map(_.toAttribute) } - override def outputPartitioning: Partitioning = SinglePartition - override def executeCollect(): Array[InternalRow] = { val ord = new LazilyGeneratedOrdering(sortOrder, child.output) val data = child.execute().map(_.copy()).takeOrdered(limit)(ord) @@ -160,6 +164,8 @@ case class TakeOrderedAndProjectExec( override def outputOrdering: Seq[SortOrder] = sortOrder + override def outputPartitioning: Partitioning = SinglePartition + override def simpleString: String = { val orderByString = Utils.truncatedString(sortOrder, "[", ",", "]") val outputString = Utils.truncatedString(output, "[", ",", "]") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala index 2acc5110e8950..9df56bbf1ef87 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala @@ -68,6 +68,8 @@ case class DeserializeToObjectExec( outputObjAttr: Attribute, child: SparkPlan) extends UnaryExecNode with ObjectProducerExec with CodegenSupport { + override def outputPartitioning: Partitioning = child.outputPartitioning + override def inputRDDs(): Seq[RDD[InternalRow]] = { child.asInstanceOf[CodegenSupport].inputRDDs() } @@ -102,6 +104,8 @@ case class SerializeFromObjectExec( override def output: Seq[Attribute] = serializer.map(_.toAttribute) + override def outputPartitioning: Partitioning = child.outputPartitioning + override def inputRDDs(): Seq[RDD[InternalRow]] = { child.asInstanceOf[CodegenSupport].inputRDDs() } @@ -171,6 +175,8 @@ case class MapPartitionsExec( child: SparkPlan) extends ObjectConsumerExec with ObjectProducerExec { + override def outputPartitioning: Partitioning = child.outputPartitioning + override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitionsInternal { iter => val getObject = ObjectOperator.unwrapObjectFromRow(child.output.head.dataType) @@ -231,6 +237,8 @@ case class MapElementsExec( } override def outputOrdering: Seq[SortOrder] = child.outputOrdering + + override def outputPartitioning: Partitioning = 
child.outputPartitioning } /** @@ -244,6 +252,8 @@ case class AppendColumnsExec( override def output: Seq[Attribute] = child.output ++ serializer.map(_.toAttribute) + override def outputPartitioning: Partitioning = child.outputPartitioning + private def newColumnSchema = serializer.map(_.toAttribute).toStructType override protected def doExecute(): RDD[InternalRow] = { @@ -272,6 +282,8 @@ case class AppendColumnsWithObjectExec( override def output: Seq[Attribute] = (inputSerializer ++ newColumnsSerializer).map(_.toAttribute) + override def outputPartitioning: Partitioning = child.outputPartitioning + private def inputSchema = inputSerializer.map(_.toAttribute).toStructType private def newColumnSchema = newColumnsSerializer.map(_.toAttribute).toStructType @@ -304,6 +316,8 @@ case class MapGroupsExec( outputObjAttr: Attribute, child: SparkPlan) extends UnaryExecNode with ObjectProducerExec { + override def outputPartitioning: Partitioning = child.outputPartitioning + override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(groupingAttributes) :: Nil @@ -347,6 +361,9 @@ case class FlatMapGroupsInRExec( child: SparkPlan) extends UnaryExecNode with ObjectProducerExec { override def output: Seq[Attribute] = outputObjAttr :: Nil + + override def outputPartitioning: Partitioning = child.outputPartitioning + override def producedAttributes: AttributeSet = AttributeSet(outputObjAttr) override def requiredChildDistribution: Seq[Distribution] = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulAggregate.scala index 587ea7d02acab..ad8238f189c64 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulAggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulAggregate.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.streaming.state._ @@ -80,7 +81,10 @@ case class StateStoreRestoreExec( } } } + override def output: Seq[Attribute] = child.output + + override def outputPartitioning: Partitioning = child.outputPartitioning } /** @@ -116,6 +120,8 @@ case class StateStoreSaveExec( override def output: Seq[Attribute] = child.output + override def outputPartitioning: Partitioning = child.outputPartitioning + /** * Save all the rows to the state store, and return all the rows in the state store. 
* Note that this returns an iterator that pipelines the saving to store with downstream diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala index 1dd281ebf1034..80b87d5ffa797 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala @@ -103,6 +103,8 @@ case class WindowExec( override def outputOrdering: Seq[SortOrder] = child.outputOrdering + override def outputPartitioning: Partitioning = child.outputPartitioning + /** * Create a bound ordering object for a given frame type and offset. A bound ordering object is * used to determine which input row lies within the frame boundaries of an output row. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ReferenceSort.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ReferenceSort.scala index a19ea51af7c01..6abcb1f067968 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ReferenceSort.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ReferenceSort.scala @@ -57,4 +57,6 @@ case class ReferenceSort( override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder + + override def outputPartitioning: Partitioning = child.outputPartitioning } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 53bb3b93db738..c3c4e2925b90c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.hive.execution import java.io.IOException import java.net.URI import java.text.SimpleDateFormat -import java.util import java.util.{Date, Random} import scala.collection.JavaConverters._ @@ -36,6 +35,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} @@ -291,6 +291,8 @@ case class InsertIntoHiveTable( Seq.empty[InternalRow] } + override def outputPartitioning: Partitioning = child.outputPartitioning + override def executeCollect(): Array[InternalRow] = sideEffectResult.toArray protected override def doExecute(): RDD[InternalRow] = { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala index 1025b8f70d9ff..50855e48bc8fe 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala @@ -38,6 +38,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.ScriptInputOutputSchema +import 
org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution._ import org.apache.spark.sql.hive.HiveInspectors import org.apache.spark.sql.hive.HiveShim._ @@ -61,6 +62,8 @@ case class ScriptTransformation( override def producedAttributes: AttributeSet = outputSet -- inputSet + override def outputPartitioning: Partitioning = child.outputPartitioning + protected override def doExecute(): RDD[InternalRow] = { def processIterator(inputIterator: Iterator[InternalRow], hadoopConf: Configuration) : Iterator[InternalRow] = { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala index a8e81d7a3c42a..0e837766e2ea4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest, UnaryExecNode} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.types.StringType @@ -135,5 +136,8 @@ private case class ExceptionInjectingOperator(child: SparkPlan) extends UnaryExe throw new IllegalArgumentException("intentional exception") } } + override def output: Seq[Attribute] = child.output + + override def outputPartitioning: Partitioning = child.outputPartitioning } From b158256c2e719edde3dbdfe27a9a65cd3b3039f4 Mon Sep 17 00:00:00 2001 From: jiangxingbo Date: Sun, 23 Oct 2016 13:28:35 +0200 Subject: [PATCH 098/162] [SPARK-18045][SQL][TESTS] Move `HiveDataFrameAnalyticsSuite` to package `sql` ## What changes were proposed in this pull request? The test suite `HiveDataFrameAnalyticsSuite` has nothing to do with Hive, so we should move it to package `sql`. The original test cases in that suite are split into two existing test suites: `DataFrameAggregateSuite` tests the functions and ~~`SQLQuerySuite`~~`SQLQueryTestSuite` tests the SQL statements. ## How was this patch tested? ~~Modified `SQLQuerySuite` in package `sql`.~~ Added a query file for `SQLQueryTestSuite`. Author: jiangxingbo Closes #15582 from jiangxb1987/group-analytics-test.
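As a rough illustration of where these checks now live (not the exact code added by this patch), a DataFrame-side test of the kind that belongs in `DataFrameAggregateSuite` could look like the sketch below. It assumes it runs inside a `QueryTest` with a shared `SparkSession` and `testImplicits` imported, and reuses the same data as the new `group-analytics.sql` input file.

```scala
// Hypothetical sketch, mirroring the style of the deleted HiveDataFrameAnalyticsSuite:
// verify that the DataFrame cube()/rollup() API agrees with the equivalent
// GROUP BY ... WITH CUBE / WITH ROLLUP SQL that SQLQueryTestSuite now covers.
import org.apache.spark.sql.functions.sum

val testData = Seq((1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2)).toDF("a", "b")
testData.createOrReplaceTempView("testData")

checkAnswer(
  testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")),
  sql("SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH CUBE").collect())

checkAnswer(
  testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b")),
  sql("SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP").collect())
```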
--- .../sql-tests/inputs/group-analytics.sql | 13 +++ .../sql-tests/results/group-analytics.sql.out | 87 +++++++++++++++++++ .../hive/HiveDataFrameAnalyticsSuite.scala | 72 --------------- 3 files changed, 100 insertions(+), 72 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out delete mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql new file mode 100644 index 0000000000000..2f783495ddf96 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql @@ -0,0 +1,13 @@ +CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2) +AS testData(a, b); + +-- CUBE on overlapping columns +SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH CUBE; + +SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH CUBE; + +-- ROLLUP on overlapping columns +SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP; + +SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH ROLLUP; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out new file mode 100644 index 0000000000000..8ea7de809d19d --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out @@ -0,0 +1,87 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 5 + + +-- !query 0 +CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2) +AS testData(a, b) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH CUBE +-- !query 1 schema +struct<(a + b):int,b:int,sum((a - b)):bigint> +-- !query 1 output +2 1 0 +2 NULL 0 +3 1 1 +3 2 -1 +3 NULL 0 +4 1 2 +4 2 0 +4 NULL 2 +5 2 1 +5 NULL 1 +NULL 1 3 +NULL 2 0 +NULL NULL 3 + + + +-- !query 2 +SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH CUBE +-- !query 2 schema +struct +-- !query 2 output +1 1 1 +1 2 2 +1 NULL 3 +2 1 1 +2 2 2 +2 NULL 3 +3 1 1 +3 2 2 +3 NULL 3 +NULL 1 3 +NULL 2 6 +NULL NULL 9 + + +-- !query 3 +SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP +-- !query 3 schema +struct<(a + b):int,b:int,sum((a - b)):bigint> +-- !query 3 output +2 1 0 +2 NULL 0 +3 1 1 +3 2 -1 +3 NULL 0 +4 1 2 +4 2 0 +4 NULL 2 +5 2 1 +5 NULL 1 +NULL NULL 3 + + +-- !query 4 +SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH ROLLUP +-- !query 4 schema +struct +-- !query 4 output +1 1 1 +1 2 2 +1 NULL 3 +2 1 1 +2 2 2 +2 NULL 3 +3 1 1 +3 2 2 +3 NULL 3 +NULL NULL 9 diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala deleted file mode 100644 index 6477974fe713a..0000000000000 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.hive - -import org.scalatest.BeforeAndAfterAll - -import org.apache.spark.sql.{DataFrame, QueryTest, Row} -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.test.TestHiveSingleton - -// TODO ideally we should put the test suite into the package `sql`, as -// `hive` package is optional in compiling, however, `SQLContext.sql` doesn't -// support the `cube` or `rollup` yet. -class HiveDataFrameAnalyticsSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll { - import spark.implicits._ - import spark.sql - - private var testData: DataFrame = _ - - override def beforeAll() { - super.beforeAll() - testData = Seq((1, 2), (2, 2), (3, 4)).toDF("a", "b") - testData.createOrReplaceTempView("mytable") - } - - override def afterAll(): Unit = { - try { - spark.catalog.dropTempView("mytable") - } finally { - super.afterAll() - } - } - - test("rollup") { - checkAnswer( - testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b")), - sql("select a + b, b, sum(a - b) from mytable group by a + b, b with rollup").collect() - ) - - checkAnswer( - testData.rollup("a", "b").agg(sum("b")), - sql("select a, b, sum(b) from mytable group by a, b with rollup").collect() - ) - } - - test("cube") { - checkAnswer( - testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")), - sql("select a + b, b, sum(a - b) from mytable group by a + b, b with cube").collect() - ) - - checkAnswer( - testData.cube("a", "b").agg(sum("b")), - sql("select a, b, sum(b) from mytable group by a, b with cube").collect() - ) - } -} From a81fba048fabcd413730548ab65955802508d4e4 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sun, 23 Oct 2016 19:42:11 +0200 Subject: [PATCH 099/162] [SPARK-18058][SQL] Comparing column types ignoring Nullability in Union and SetOperation ## What changes were proposed in this pull request? This PR fixes [SPARK-18058](https://issues.apache.org/jira/browse/SPARK-18058), a bug where the column types in Union and SetOperation were compared with nullability taken into account. The PR converts the column types by setting all fields as nullable before the comparison. ## How was this patch tested? Regular unit test cases. Author: CodingCat Closes #15595 from CodingCat/SPARK-18058.
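The core of the change is that two column types are now considered compatible when they are equal after every nested field is marked nullable. Below is a minimal sketch of that comparison; note that `DataType.asNullable` is `private[spark]`, so this only compiles inside Spark's own source tree (as in the catalyst files this patch touches), and the helper name is made up for illustration.

```scala
import org.apache.spark.sql.types._

// Illustrative helper (hypothetical name): two types are "compatible" for
// Union/Intersect/Except when they agree after nullability is erased.
def sameTypeIgnoringNullability(dt1: DataType, dt2: DataType): Boolean =
  dt1.asNullable == dt2.asNullable

val left = StructType(Seq(StructField("a", IntegerType, nullable = true)))
val right = StructType(Seq(StructField("a", IntegerType, nullable = false)))

assert(left != right)                             // plain equality still sees the nullability difference
assert(sameTypeIgnoringNullability(left, right))  // the patched analysis/resolution checks accept this pair
```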
--- .../sql/catalyst/analysis/CheckAnalysis.scala | 3 +- .../plans/logical/basicLogicalOperators.scala | 30 +++++++------------ .../sql/catalyst/analysis/AnalysisSuite.scala | 19 ++++++++++++ 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 9c06069f24f76..9a7c2a944b588 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -287,7 +287,8 @@ trait CheckAnalysis extends PredicateHelper { } // Check if the data types match. dataTypes(child).zip(ref).zipWithIndex.foreach { case ((dt1, dt2), ci) => - if (dt1 != dt2) { + // SPARK-18058: we shall not care about the nullability of columns + if (dt1.asNullable != dt2.asNullable) { failAnalysis( s""" |${operator.nodeName} can only be performed on tables with the compatible diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index d2d33e40a8c8f..64a787a7ae351 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -117,6 +117,8 @@ case class Filter(condition: Expression, child: LogicalPlan) abstract class SetOperation(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { + def duplicateResolved: Boolean = left.outputSet.intersect(right.outputSet).isEmpty + protected def leftConstraints: Set[Expression] = left.constraints protected def rightConstraints: Set[Expression] = { @@ -126,6 +128,13 @@ abstract class SetOperation(left: LogicalPlan, right: LogicalPlan) extends Binar case a: Attribute => attributeRewrites(a) }) } + + override lazy val resolved: Boolean = + childrenResolved && + left.output.length == right.output.length && + left.output.zip(right.output).forall { case (l, r) => + l.dataType.asNullable == r.dataType.asNullable + } && duplicateResolved } object SetOperation { @@ -134,8 +143,6 @@ object SetOperation { case class Intersect(left: LogicalPlan, right: LogicalPlan) extends SetOperation(left, right) { - def duplicateResolved: Boolean = left.outputSet.intersect(right.outputSet).isEmpty - override def output: Seq[Attribute] = left.output.zip(right.output).map { case (leftAttr, rightAttr) => leftAttr.withNullability(leftAttr.nullable && rightAttr.nullable) @@ -144,14 +151,6 @@ case class Intersect(left: LogicalPlan, right: LogicalPlan) extends SetOperation override protected def validConstraints: Set[Expression] = leftConstraints.union(rightConstraints) - // Intersect are only resolved if they don't introduce ambiguous expression ids, - // since the Optimizer will convert Intersect to Join. 
- override lazy val resolved: Boolean = - childrenResolved && - left.output.length == right.output.length && - left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType } && - duplicateResolved - override def maxRows: Option[Long] = { if (children.exists(_.maxRows.isEmpty)) { None @@ -172,19 +171,11 @@ case class Intersect(left: LogicalPlan, right: LogicalPlan) extends SetOperation case class Except(left: LogicalPlan, right: LogicalPlan) extends SetOperation(left, right) { - def duplicateResolved: Boolean = left.outputSet.intersect(right.outputSet).isEmpty - /** We don't use right.output because those rows get excluded from the set. */ override def output: Seq[Attribute] = left.output override protected def validConstraints: Set[Expression] = leftConstraints - override lazy val resolved: Boolean = - childrenResolved && - left.output.length == right.output.length && - left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType } && - duplicateResolved - override lazy val statistics: Statistics = { left.statistics.copy() } @@ -219,9 +210,8 @@ case class Union(children: Seq[LogicalPlan]) extends LogicalPlan { child.output.length == children.head.output.length && // compare the data types with the first child child.output.zip(children.head.output).forall { - case (l, r) => l.dataType == r.dataType } + case (l, r) => l.dataType.asNullable == r.dataType.asNullable } ) - children.length > 1 && childrenResolved && allChildrenCompatible } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 50ebad25cd258..590774c043040 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -377,4 +377,23 @@ class AnalysisSuite extends AnalysisTest { assertExpressionType(sum(Divide(Decimal(1), 2.0)), DoubleType) assertExpressionType(sum(Divide(1.0, Decimal(2.0))), DoubleType) } + + test("SPARK-18058: union and set operations shall not care about the nullability" + + " when comparing column types") { + val firstTable = LocalRelation( + AttributeReference("a", + StructType(Seq(StructField("a", IntegerType, nullable = true))), nullable = false)()) + val secondTable = LocalRelation( + AttributeReference("a", + StructType(Seq(StructField("a", IntegerType, nullable = false))), nullable = false)()) + + val unionPlan = Union(firstTable, secondTable) + assertAnalysisSuccess(unionPlan) + + val r1 = Except(firstTable, secondTable) + val r2 = Intersect(firstTable, secondTable) + + assertAnalysisSuccess(r1) + assertAnalysisSuccess(r2) + } } From 3a423f5a0373de87ddfb4744852b2fda14fcc3cb Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 23 Oct 2016 10:53:27 -0700 Subject: [PATCH 100/162] [SPARKR][BRANCH-2.0] R merge API doc and example fix ## What changes were proposed in this pull request? Fixes for R doc ## How was this patch tested? N/A Author: Felix Cheung Closes #15589 from felixcheung/rdocmergefix. 
(cherry picked from commit 0e0d83a597885ab1773cb69d6dcc10346d6976a3) Signed-off-by: Felix Cheung --- R/pkg/R/DataFrame.R | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 8910a4b138a37..b6ce838969a44 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -365,7 +365,7 @@ setMethod("colnames<-", # Check if the column names have . in it if (any(regexec(".", value, fixed = TRUE)[[1]][1] != -1)) { - stop("Colum names cannot contain the '.' symbol.") + stop("Column names cannot contain the '.' symbol.") } sdf <- callJMethod(x@sdf, "toDF", as.list(value)) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index b4b43fdba42ce..e77dbde44ee66 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -845,7 +845,7 @@ test_that("names() colnames() set the column names", { expect_equal(names(df)[1], "col3") expect_error(colnames(df) <- c("sepal.length", "sepal_width"), - "Colum names cannot contain the '.' symbol.") + "Column names cannot contain the '.' symbol.") expect_error(colnames(df) <- c(1, 2), "Invalid column names.") expect_error(colnames(df) <- c("a"), "Column names must have the same length as the number of columns in the dataset.") From c64a8ff39794d60c596c0d34130019c09c9c8012 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 24 Oct 2016 10:25:24 +0100 Subject: [PATCH 101/162] [SPARK-18049][MLLIB][TEST] Add missing tests for truePositiveRate and weightedTruePositiveRate ## What changes were proposed in this pull request? Add missing tests for `truePositiveRate` and `weightedTruePositiveRate` in `MulticlassMetricsSuite` ## How was this patch tested? added testing Author: Zheng RuiFeng Closes #15585 from zhengruifeng/mc_missing_test. 
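For readers unfamiliar with the metric, the new assertions boil down to: for each label L, truePositiveRate(L) = TP(L) / (TP(L) + FN(L)), i.e. that label's recall, and weightedTruePositiveRate is the label-frequency-weighted average of those rates. The sketch below is an editorial illustration, not part of the patch; the prediction/label pairs are chosen to reproduce the 2/4, 3/4 and 1/1 rates asserted in the diff.

```scala
// (prediction, label) pairs; label 0.0 occurs 4 times (2 predicted correctly),
// label 1.0 occurs 4 times (3 correct), label 2.0 occurs once (correct).
val predictionAndLabels = Seq(
  (0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0),
  (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0))

val labelCounts = predictionAndLabels.groupBy(_._2).mapValues(_.size)
val truePositives = predictionAndLabels.filter { case (p, l) => p == l }.groupBy(_._2).mapValues(_.size)

// Per-label true positive rate: TP / (TP + FN) == TP / (number of examples with that label).
val tpRate = labelCounts.map { case (label, n) =>
  label -> truePositives.getOrElse(label, 0).toDouble / n
}
// tpRate: Map(0.0 -> 0.5, 1.0 -> 0.75, 2.0 -> 1.0), matching tpRate0/1/2 in the suite.

// Weighted TPR: per-label rates weighted by label frequency.
val total = predictionAndLabels.size.toDouble
val weightedTpRate = tpRate.map { case (label, rate) => (labelCounts(label) / total) * rate }.sum
// = (4.0 / 9) * 0.5 + (4.0 / 9) * 0.75 + (1.0 / 9) * 1.0
```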
--- .../api/python/WriteInputFormatTestDataGenerator.scala | 2 +- .../main/scala/org/apache/spark/ml/util/ReadWrite.scala | 2 +- .../apache/spark/mllib/evaluation/RegressionMetrics.scala | 2 +- .../spark/mllib/linalg/distributed/BlockMatrix.scala | 4 ++-- .../spark/mllib/evaluation/MulticlassMetricsSuite.scala | 8 ++++++++ .../spark/mllib/evaluation/MultilabelMetricsSuite.scala | 2 +- 6 files changed, 14 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala b/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala index 34cb7c61d7034..86965dbc2e778 100644 --- a/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala +++ b/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala @@ -144,7 +144,7 @@ object WriteInputFormatTestDataGenerator { // Create test data for ArrayWritable val data = Seq( - (1, Array()), + (1, Array.empty[Double]), (2, Array(3.0, 4.0, 5.0)), (3, Array(4.0, 5.0, 6.0)) ) diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala index 4413fefdea3ca..bc4f9e6716ee8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala @@ -474,7 +474,7 @@ private[ml] object MetaAlgorithmReadWrite { case ovr: OneVsRest => Array(ovr.getClassifier) case ovrModel: OneVsRestModel => Array(ovrModel.getClassifier) ++ ovrModel.models case rformModel: RFormulaModel => Array(rformModel.pipelineModel) - case _: Params => Array() + case _: Params => Array.empty[Params] } val subStageMaps = subStages.flatMap(getUidMapImpl) List((instance.uid, instance)) ++ subStageMaps diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala index ce4421515126c..8f777cc35b93f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala @@ -73,7 +73,7 @@ class RegressionMetrics @Since("2.0.0") ( /** * Returns the variance explained by regression. 
- * explainedVariance = $\sum_i (\hat{y_i} - \bar{y})^2 / n$ + * explainedVariance = $\sum_i (\hat{y_i} - \bar{y})^2^ / n$ * @see [[https://en.wikipedia.org/wiki/Fraction_of_variance_unexplained]] */ @Since("1.2.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index ff1068417d94f..377be6bfb9886 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -429,14 +429,14 @@ class BlockMatrix @Since("1.3.0") ( val rightCounterpartsHelper = rightMatrix.groupBy(_._1).mapValues(_.map(_._2)) val leftDestinations = leftMatrix.map { case (rowIndex, colIndex) => - val rightCounterparts = rightCounterpartsHelper.getOrElse(colIndex, Array()) + val rightCounterparts = rightCounterpartsHelper.getOrElse(colIndex, Array.empty[Int]) val partitions = rightCounterparts.map(b => partitioner.getPartition((rowIndex, b))) ((rowIndex, colIndex), partitions.toSet) }.toMap val leftCounterpartsHelper = leftMatrix.groupBy(_._2).mapValues(_.map(_._1)) val rightDestinations = rightMatrix.map { case (rowIndex, colIndex) => - val leftCounterparts = leftCounterpartsHelper.getOrElse(rowIndex, Array()) + val leftCounterparts = leftCounterpartsHelper.getOrElse(rowIndex, Array.empty[Int]) val partitions = leftCounterparts.map(b => partitioner.getPartition((b, colIndex))) ((rowIndex, colIndex), partitions.toSet) }.toMap diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala index f316c67234f18..142d1e9812ef1 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala @@ -36,6 +36,9 @@ class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)), 2) val metrics = new MulticlassMetrics(predictionAndLabels) val delta = 0.0000001 + val tpRate0 = 2.0 / (2 + 2) + val tpRate1 = 3.0 / (3 + 1) + val tpRate2 = 1.0 / (1 + 0) val fpRate0 = 1.0 / (9 - 4) val fpRate1 = 1.0 / (9 - 4) val fpRate2 = 1.0 / (9 - 1) @@ -53,6 +56,9 @@ class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { val f2measure2 = (1 + 2 * 2) * precision2 * recall2 / (2 * 2 * precision2 + recall2) assert(metrics.confusionMatrix.toArray.sameElements(confusionMatrix.toArray)) + assert(math.abs(metrics.truePositiveRate(0.0) - tpRate0) < delta) + assert(math.abs(metrics.truePositiveRate(1.0) - tpRate1) < delta) + assert(math.abs(metrics.truePositiveRate(2.0) - tpRate2) < delta) assert(math.abs(metrics.falsePositiveRate(0.0) - fpRate0) < delta) assert(math.abs(metrics.falsePositiveRate(1.0) - fpRate1) < delta) assert(math.abs(metrics.falsePositiveRate(2.0) - fpRate2) < delta) @@ -75,6 +81,8 @@ class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { assert(math.abs(metrics.accuracy - metrics.recall) < delta) assert(math.abs(metrics.accuracy - metrics.fMeasure) < delta) assert(math.abs(metrics.accuracy - metrics.weightedRecall) < delta) + assert(math.abs(metrics.weightedTruePositiveRate - + ((4.0 / 9) * tpRate0 + (4.0 / 9) * tpRate1 + (1.0 / 9) * tpRate2)) < delta) assert(math.abs(metrics.weightedFalsePositiveRate - ((4.0 / 9) * fpRate0 + (4.0 / 9) * fpRate1 + (1.0 
/ 9) * fpRate2)) < delta) assert(math.abs(metrics.weightedPrecision - diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala index f3b19aeb42f84..a660492c7ae59 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala @@ -47,7 +47,7 @@ class MultilabelMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize( Seq((Array(0.0, 1.0), Array(0.0, 2.0)), (Array(0.0, 2.0), Array(0.0, 1.0)), - (Array(), Array(0.0)), + (Array.empty[Double], Array(0.0)), (Array(2.0), Array(2.0)), (Array(2.0, 0.0), Array(2.0, 0.0)), (Array(0.0, 1.0, 2.0), Array(0.0, 1.0)), From 4ecbe1b92f4c4c5b2d734895c09d8ded0ed48d4d Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 24 Oct 2016 10:44:45 +0100 Subject: [PATCH 102/162] [SPARK-17810][SQL] Default spark.sql.warehouse.dir is relative to local FS but can resolve as HDFS path ## What changes were proposed in this pull request? Always resolve spark.sql.warehouse.dir as a local path, and as relative to working dir not home dir ## How was this patch tested? Existing tests. Author: Sean Owen Closes #15382 from srowen/SPARK-17810. --- docs/sql-programming-guide.md | 33 +++---------------- .../sql/hive/JavaSparkHiveExample.java | 2 +- examples/src/main/python/sql/hive.py | 2 +- .../examples/sql/hive/SparkHiveExample.scala | 2 +- .../apache/spark/sql/internal/SQLConf.scala | 3 +- .../sql/execution/command/DDLSuite.scala | 23 ++++++------- .../spark/sql/internal/SQLConfSuite.scala | 6 ++-- .../sql/hive/execution/HiveQuerySuite.scala | 4 ++- .../spark/sql/sources/BucketedReadSuite.scala | 5 +-- 9 files changed, 29 insertions(+), 51 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index d334a86bc73d7..064af41965b70 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -904,50 +904,27 @@ access data stored in Hive. Configuration of Hive is done by placing your `hive-site.xml`, `core-site.xml` (for security configuration), and `hdfs-site.xml` (for HDFS configuration) file in `conf/`. -
    - -
    - When working with Hive, one must instantiate `SparkSession` with Hive support, including connectivity to a persistent Hive metastore, support for Hive serdes, and Hive user-defined functions. Users who do not have an existing Hive deployment can still enable Hive support. When not configured by the `hive-site.xml`, the context automatically creates `metastore_db` in the current directory and creates a directory configured by `spark.sql.warehouse.dir`, which defaults to the directory -`spark-warehouse` in the current directory that the spark application is started. Note that +`spark-warehouse` in the current directory that the Spark application is started. Note that the `hive.metastore.warehouse.dir` property in `hive-site.xml` is deprecated since Spark 2.0.0. Instead, use `spark.sql.warehouse.dir` to specify the default location of database in warehouse. -You may need to grant write privilege to the user who starts the spark application. +You may need to grant write privilege to the user who starts the Spark application. +
    + +
    {% include_example spark_hive scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala %}
    - -When working with Hive, one must instantiate `SparkSession` with Hive support, including -connectivity to a persistent Hive metastore, support for Hive serdes, and Hive user-defined functions. -Users who do not have an existing Hive deployment can still enable Hive support. When not configured -by the `hive-site.xml`, the context automatically creates `metastore_db` in the current directory and -creates a directory configured by `spark.sql.warehouse.dir`, which defaults to the directory -`spark-warehouse` in the current directory that the spark application is started. Note that -the `hive.metastore.warehouse.dir` property in `hive-site.xml` is deprecated since Spark 2.0.0. -Instead, use `spark.sql.warehouse.dir` to specify the default location of database in warehouse. -You may need to grant write privilege to the user who starts the spark application. - {% include_example spark_hive java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java %}
    - -When working with Hive, one must instantiate `SparkSession` with Hive support, including -connectivity to a persistent Hive metastore, support for Hive serdes, and Hive user-defined functions. -Users who do not have an existing Hive deployment can still enable Hive support. When not configured -by the `hive-site.xml`, the context automatically creates `metastore_db` in the current directory and -creates a directory configured by `spark.sql.warehouse.dir`, which defaults to the directory -`spark-warehouse` in the current directory that the spark application is started. Note that -the `hive.metastore.warehouse.dir` property in `hive-site.xml` is deprecated since Spark 2.0.0. -Instead, use `spark.sql.warehouse.dir` to specify the default location of database in warehouse. -You may need to grant write privilege to the user who starts the spark application. - {% include_example spark_hive python/sql/hive.py %}
    diff --git a/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java b/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java index 76dd160d5568b..052153c9e9736 100644 --- a/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java +++ b/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java @@ -56,7 +56,7 @@ public void setValue(String value) { public static void main(String[] args) { // $example on:spark_hive$ // warehouseLocation points to the default location for managed databases and tables - String warehouseLocation = "file:" + System.getProperty("user.dir") + "spark-warehouse"; + String warehouseLocation = "spark-warehouse"; SparkSession spark = SparkSession .builder() .appName("Java Spark Hive Example") diff --git a/examples/src/main/python/sql/hive.py b/examples/src/main/python/sql/hive.py index 98b48908b5a12..ad83fe1cf14b5 100644 --- a/examples/src/main/python/sql/hive.py +++ b/examples/src/main/python/sql/hive.py @@ -34,7 +34,7 @@ if __name__ == "__main__": # $example on:spark_hive$ # warehouse_location points to the default location for managed databases and tables - warehouse_location = 'file:${system:user.dir}/spark-warehouse' + warehouse_location = 'spark-warehouse' spark = SparkSession \ .builder \ diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala index 11e84c0e45632..ded18dacf1fe3 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala @@ -38,7 +38,7 @@ object SparkHiveExample { // $example on:spark_hive$ // warehouseLocation points to the default location for managed databases and tables - val warehouseLocation = "file:${system:user.dir}/spark-warehouse" + val warehouseLocation = "spark-warehouse" val spark = SparkSession .builder() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index a6e2fa26cb5ef..f47ec7f3963a4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -30,6 +30,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.network.util.ByteUnit import org.apache.spark.sql.catalyst.CatalystConf +import org.apache.spark.util.Utils //////////////////////////////////////////////////////////////////////////////////////////////////// // This file defines the configuration options for Spark SQL. 
@@ -56,7 +57,7 @@ object SQLConf { val WAREHOUSE_PATH = SQLConfigBuilder("spark.sql.warehouse.dir") .doc("The default location for managed databases and tables.") .stringConf - .createWithDefault("${system:user.dir}/spark-warehouse") + .createWithDefault(Utils.resolveURI("spark-warehouse").toString) val OPTIMIZER_MAX_ITERATIONS = SQLConfigBuilder("spark.sql.optimizer.maxIterations") .internal() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index a6da8a86c1623..d593bfb4ce19a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -43,8 +43,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { // drop all databases, tables and functions after each test spark.sessionState.catalog.reset() } finally { - val path = System.getProperty("user.dir") + "/spark-warehouse" - Utils.deleteRecursively(new File(path)) + Utils.deleteRecursively(new File("spark-warehouse")) super.afterEach() } } @@ -116,7 +115,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { val catalog = spark.sessionState.catalog withTempDir { tmpDir => - val path = tmpDir.toString + val path = tmpDir.getCanonicalPath // The generated temp path is not qualified. assert(!path.startsWith("file:/")) val uri = tmpDir.toURI @@ -148,7 +147,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { test("Create/Drop Database") { withTempDir { tmpDir => - val path = tmpDir.toString + val path = tmpDir.getCanonicalPath withSQLConf(SQLConf.WAREHOUSE_PATH.key -> path) { val catalog = spark.sessionState.catalog val databaseNames = Seq("db1", "`database`") @@ -159,7 +158,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { sql(s"CREATE DATABASE $dbName") val db1 = catalog.getDatabaseMetadata(dbNameWithoutBackTicks) - val expectedLocation = makeQualifiedPath(path + "/" + s"$dbNameWithoutBackTicks.db") + val expectedLocation = makeQualifiedPath(s"$path/$dbNameWithoutBackTicks.db") assert(db1 == CatalogDatabase( dbNameWithoutBackTicks, "", @@ -184,9 +183,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { try { sql(s"CREATE DATABASE $dbName") val db1 = catalog.getDatabaseMetadata(dbName) - val expectedLocation = - makeQualifiedPath(s"${System.getProperty("user.dir")}/spark-warehouse" + - "/" + s"$dbName.db") + val expectedLocation = makeQualifiedPath(s"spark-warehouse/$dbName.db") assert(db1 == CatalogDatabase( dbName, "", @@ -204,7 +201,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { val catalog = spark.sessionState.catalog val databaseNames = Seq("db1", "`database`") withTempDir { tmpDir => - val path = new Path(tmpDir.toString).toUri.toString + val path = new Path(tmpDir.getCanonicalPath).toUri databaseNames.foreach { dbName => try { val dbNameWithoutBackTicks = cleanIdentifier(dbName) @@ -227,7 +224,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { test("Create Database - database already exists") { withTempDir { tmpDir => - val path = tmpDir.toString + val path = tmpDir.getCanonicalPath withSQLConf(SQLConf.WAREHOUSE_PATH.key -> path) { val catalog = spark.sessionState.catalog val databaseNames = Seq("db1", "`database`") @@ -237,7 +234,7 @@ class DDLSuite extends QueryTest 
with SharedSQLContext with BeforeAndAfterEach { val dbNameWithoutBackTicks = cleanIdentifier(dbName) sql(s"CREATE DATABASE $dbName") val db1 = catalog.getDatabaseMetadata(dbNameWithoutBackTicks) - val expectedLocation = makeQualifiedPath(path + "/" + s"$dbNameWithoutBackTicks.db") + val expectedLocation = makeQualifiedPath(s"$path/$dbNameWithoutBackTicks.db") assert(db1 == CatalogDatabase( dbNameWithoutBackTicks, "", @@ -476,7 +473,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { test("Alter/Describe Database") { withTempDir { tmpDir => - val path = tmpDir.toString + val path = tmpDir.getCanonicalPath withSQLConf(SQLConf.WAREHOUSE_PATH.key -> path) { val catalog = spark.sessionState.catalog val databaseNames = Seq("db1", "`database`") @@ -484,7 +481,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { databaseNames.foreach { dbName => try { val dbNameWithoutBackTicks = cleanIdentifier(dbName) - val location = makeQualifiedPath(path + "/" + s"$dbNameWithoutBackTicks.db") + val location = makeQualifiedPath(s"$path/$dbNameWithoutBackTicks.db") sql(s"CREATE DATABASE $dbName") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index df640ffab91de..a89a43fa1e777 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -19,11 +19,11 @@ package org.apache.spark.sql.internal import org.apache.hadoop.fs.Path -import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.execution.WholeStageCodegenExec import org.apache.spark.sql.internal.StaticSQLConf._ import org.apache.spark.sql.test.{SharedSQLContext, TestSQLContext} +import org.apache.spark.util.Utils class SQLConfSuite extends QueryTest with SharedSQLContext { import testImplicits._ @@ -219,8 +219,8 @@ class SQLConfSuite extends QueryTest with SharedSQLContext { try { // to get the default value, always unset it spark.conf.unset(SQLConf.WAREHOUSE_PATH.key) - assert(spark.sessionState.conf.warehousePath - === new Path(s"${System.getProperty("user.dir")}/spark-warehouse").toString) + assert(new Path(Utils.resolveURI("spark-warehouse")).toString === + spark.sessionState.conf.warehousePath + "/") } finally { sql(s"set ${SQLConf.WAREHOUSE_PATH}=$original") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 2b945dbbe03dd..6fbbed1d47e04 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hive.execution import java.io.File +import java.net.URI import java.sql.Timestamp import java.util.{Locale, TimeZone} @@ -954,7 +955,8 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd .mkString("/") // Loads partition data to a temporary table to verify contents - val path = s"${sparkSession.getWarehousePath}/dynamic_part_table/$partFolder/part-00000" + val warehousePathFile = new URI(sparkSession.getWarehousePath()).getPath + val path = s"$warehousePathFile/dynamic_part_table/$partFolder/part-00000" sql("DROP TABLE IF EXISTS dp_verify") sql("CREATE TABLE dp_verify(intcol INT)") diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index 9ed454e578d69..d9ddcbd57ca83 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.sources import java.io.File +import java.net.URI import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.BucketSpec @@ -489,8 +490,8 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet test("error if there exists any malformed bucket files") { withTable("bucketed_table") { df1.write.format("parquet").bucketBy(8, "i").saveAsTable("bucketed_table") - val tableDir = new File(hiveContext - .sparkSession.getWarehousePath, "bucketed_table") + val warehouseFilePath = new URI(hiveContext.sparkSession.getWarehousePath).getPath + val tableDir = new File(warehouseFilePath, "bucketed_table") Utils.deleteRecursively(tableDir) df1.write.parquet(tableDir.getAbsolutePath) From 81d6933e75579343b1dd14792c18149e97e92cdd Mon Sep 17 00:00:00 2001 From: Eren Avsarogullari Date: Mon, 24 Oct 2016 15:33:02 -0700 Subject: [PATCH 103/162] [SPARK-17894][CORE] Ensure uniqueness of TaskSetManager name. `TaskSetManager` should have a unique name to avoid adding duplicate ones to its parent `Pool` via `SchedulableBuilder`. This problem surfaced in the following discussion: [[PR: Avoid adding duplicate schedulables]](https://github.com/apache/spark/pull/15326) **Proposal** : There is a 1:1 relationship between `stageAttemptId` and `TaskSetManager`, so `taskSet.id`, which covers both `stageId` and `stageAttemptId`, should be used for the uniqueness of the `TaskSetManager` name instead of just `stageId`. **Current TaskSetManager Name** : `var name = "TaskSet_" + taskSet.stageId.toString` **Sample**: TaskSet_0 **Proposed TaskSetManager Name** : `val name = "TaskSet_" + taskSet.id` `// taskSet.id = (stageId + "." + stageAttemptId)` **Sample** : TaskSet_0.0 Added a new unit test. Author: erenavsarogullari Closes #15463 from erenavsarogullari/SPARK-17894.
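A simplified sketch of the collision this fixes (an editorial illustration; the `TaskSet` below is a stand-in case class, not Spark's full class): naming a `TaskSetManager` after the stage id alone produces the same name for every attempt of a stage, while naming it after `taskSet.id` (stageId "." stageAttemptId) keeps the names unique.

```scala
// Stand-in for org.apache.spark.scheduler.TaskSet, reduced to the fields that matter here.
case class TaskSet(stageId: Int, stageAttemptId: Int) {
  val id: String = stageId + "." + stageAttemptId
}

def oldName(ts: TaskSet): String = "TaskSet_" + ts.stageId   // the naming before this patch
def newName(ts: TaskSet): String = "TaskSet_" + ts.id        // the naming this patch switches to

val attempt0 = TaskSet(stageId = 0, stageAttemptId = 0)
val attempt1 = TaskSet(stageId = 0, stageAttemptId = 1)      // e.g. a stage retried after a fetch failure

assert(oldName(attempt0) == oldName(attempt1))  // "TaskSet_0" twice -> duplicate Pool entries
assert(newName(attempt0) != newName(attempt1))  // "TaskSet_0.0" vs "TaskSet_0.1"
```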
--- .../spark/scheduler/TaskSetManager.scala | 2 +- .../org/apache/spark/scheduler/FakeTask.scala | 13 ++++++++---- .../spark/scheduler/TaskSetManagerSuite.scala | 20 ++++++++++++++++++- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 9491bc7a0497e..b766e4148e496 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -79,7 +79,7 @@ private[spark] class TaskSetManager( var minShare = 0 var priority = taskSet.priority var stageId = taskSet.stageId - var name = "TaskSet_" + taskSet.stageId.toString + val name = "TaskSet_" + taskSet.id var parent: Pool = null var totalResultSize = 0L var calculatedTasks = 0 diff --git a/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala b/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala index 87600fe504b98..f395fe9804c91 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala @@ -22,7 +22,7 @@ import org.apache.spark.TaskContext class FakeTask( stageId: Int, partitionId: Int, - prefLocs: Seq[TaskLocation] = Nil) extends Task[Int](stageId, 0, partitionId) { + prefLocs: Seq[TaskLocation] = Nil) extends Task[Int](stageId, stageAttemptId = 0, partitionId) { override def runTask(context: TaskContext): Int = 0 override def preferredLocations: Seq[TaskLocation] = prefLocs } @@ -33,16 +33,21 @@ object FakeTask { * locations for each task (given as varargs) if this sequence is not empty. */ def createTaskSet(numTasks: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { - createTaskSet(numTasks, 0, prefLocs: _*) + createTaskSet(numTasks, stageAttemptId = 0, prefLocs: _*) } def createTaskSet(numTasks: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { + createTaskSet(numTasks, stageId = 0, stageAttemptId, prefLocs: _*) + } + + def createTaskSet(numTasks: Int, stageId: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): + TaskSet = { if (prefLocs.size != 0 && prefLocs.size != numTasks) { throw new IllegalArgumentException("Wrong number of task locations") } val tasks = Array.tabulate[Task[_]](numTasks) { i => - new FakeTask(0, i, if (prefLocs.size != 0) prefLocs(i) else Nil) + new FakeTask(stageId, i, if (prefLocs.size != 0) prefLocs(i) else Nil) } - new TaskSet(tasks, 0, stageAttemptId, 0, null) + new TaskSet(tasks, stageId, stageAttemptId, priority = 0, null) } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index 69edcf3347243..b49ba085ca5d2 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -904,7 +904,7 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg task.index == index && !sched.endedTasks.contains(task.taskId) }.getOrElse { throw new RuntimeException(s"couldn't find index $index in " + - s"tasks: ${tasks.map{t => t.index -> t.taskId}} with endedTasks:" + + s"tasks: ${tasks.map { t => t.index -> t.taskId }} with endedTasks:" + s" ${sched.endedTasks.keys}") } } @@ -974,6 +974,24 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg assert(manager.isZombie) } + test("SPARK-17894: Verify 
TaskSetManagers for different stage attempts have unique names") { + sc = new SparkContext("local", "test") + sched = new FakeTaskScheduler(sc, ("exec1", "host1")) + val taskSet = FakeTask.createTaskSet(numTasks = 1, stageId = 0, stageAttemptId = 0) + val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, new ManualClock) + assert(manager.name === "TaskSet_0.0") + + // Make sure a task set with the same stage ID but different attempt ID has a unique name + val taskSet2 = FakeTask.createTaskSet(numTasks = 1, stageId = 0, stageAttemptId = 1) + val manager2 = new TaskSetManager(sched, taskSet2, MAX_TASK_FAILURES, new ManualClock) + assert(manager2.name === "TaskSet_0.1") + + // Make sure a task set with the same attempt ID but different stage ID also has a unique name + val taskSet3 = FakeTask.createTaskSet(numTasks = 1, stageId = 1, stageAttemptId = 1) + val manager3 = new TaskSetManager(sched, taskSet3, MAX_TASK_FAILURES, new ManualClock) + assert(manager3.name === "TaskSet_1.1") + } + private def createTaskResult( id: Int, accumUpdates: Seq[AccumulatorV2[_, _]] = Seq.empty): DirectTaskResult[Int] = { From 407c3cedf29a4413339dcde758295dc3225a0054 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Mon, 24 Oct 2016 17:21:16 -0700 Subject: [PATCH 104/162] [SPARK-17624][SQL][STREAMING][TEST] Fixed flaky StateStoreSuite.maintenance ## What changes were proposed in this pull request? The reason for the flakiness was as follows. The test starts the maintenance background thread, and then writes 20 versions of the state store. The maintenance thread is expected to create snapshots in the middle, and clean up old files that are not needed any more. The earliest delta file (1.delta) is expected to be deleted as snapshots will ensure that the earliest delta would not be needed. However, the default configuration for the maintenance thread is to retain files such that the last 2 versions can be recovered, and delete the rest. Now while generating the versions, the maintenance thread can kick in and create snapshots anywhere between version 10 and 20 (at least 10 deltas needed for snapshot). Then later it will choose to retain only versions 20 and 19 (last 2). There are two cases. - Common case: One of the versions between 10 and 19 gets snapshotted. Then recovering versions 19 and 20 just needs 19.snapshot and 20.delta, so 1.delta gets deleted. - Uncommon case (reason for flakiness): Only version 20 gets snapshotted. Then recovering version 20 requires 20.snapshot, and recovering version 19 requires all the previous 19...1.delta files. So 1.delta does not get deleted. This PR rearranges the checks such that it creates 20 versions, then waits until there is at least one snapshot, and then creates another 20. This ensures that the latest 2 versions cannot require anything older than the first snapshot generated, and therefore 1.delta will be deleted. In addition, I have added more logs, and comments that I felt would help future debugging and understanding what is going on. ## How was this patch tested? Ran the StateStoreSuite > 6K times on a heavily loaded machine (10 instances of tests running in parallel). No failures. Author: Tathagata Das Closes #15592 from tdas/SPARK-17624.
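The reasoning above hinges on which files are needed to recover a given version: the latest snapshot at or below that version plus the delta files written after it, or every delta from 1 upwards if no such snapshot exists. The sketch below is an editorial illustration of that rule (`filesNeeded` is a made-up helper, not Spark code), showing why the uncommon case keeps 1.delta alive.

```scala
// Which files are needed to recover `version`, given the set of snapshotted versions.
// Assumes the store loads the newest snapshot <= version and replays the deltas after it.
def filesNeeded(version: Int, snapshots: Set[Int]): Seq[String] =
  snapshots.filter(_ <= version).reduceOption(_ max _) match {
    case Some(s) => s"$s.snapshot" +: ((s + 1) to version).map(v => s"$v.delta")
    case None    => (1 to version).map(v => s"$v.delta")
  }

// Common case: some version in [10, 19] (say 15) was snapshotted, so the two retained
// versions never reach back to 1.delta and the maintenance thread may delete it.
filesNeeded(19, snapshots = Set(15))  // 15.snapshot, 16.delta, ..., 19.delta

// Uncommon (flaky) case: only version 20 was snapshotted; recovering version 19 still
// needs every delta back to 1.delta, so 1.delta cannot be deleted yet.
filesNeeded(19, snapshots = Set(20))  // 1.delta, 2.delta, ..., 19.delta
```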
--- .../state/HDFSBackedStateStoreProvider.scala | 18 ++++--- .../state/StateStoreCoordinator.scala | 18 +++++-- .../streaming/state/StateStoreSuite.scala | 49 ++++++++++++------- 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala index 7d71f5242c27d..f1e7f1d113ce7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala @@ -159,7 +159,7 @@ private[state] class HDFSBackedStateStoreProvider( } catch { case NonFatal(e) => throw new IllegalStateException( - s"Error committing version $newVersion into ${HDFSBackedStateStoreProvider.this}", e) + s"Error committing version $newVersion into $this", e) } } @@ -205,6 +205,10 @@ private[state] class HDFSBackedStateStoreProvider( override private[state] def hasCommitted: Boolean = { state == COMMITTED } + + override def toString(): String = { + s"HDFSStateStore[id = (op=${id.operatorId}, part=${id.partitionId}), dir = $baseDir]" + } } /** Get the state store for making updates to create a new `version` of the store. */ @@ -215,7 +219,7 @@ private[state] class HDFSBackedStateStoreProvider( newMap.putAll(loadMap(version)) } val store = new HDFSBackedStateStore(version, newMap) - logInfo(s"Retrieved version $version of $this for update") + logInfo(s"Retrieved version $version of ${HDFSBackedStateStoreProvider.this} for update") store } @@ -231,7 +235,7 @@ private[state] class HDFSBackedStateStoreProvider( } override def toString(): String = { - s"StateStore[id = (op=${id.operatorId}, part=${id.partitionId}), dir = $baseDir]" + s"HDFSStateStoreProvider[id = (op=${id.operatorId}, part=${id.partitionId}), dir = $baseDir]" } /* Internal classes and methods */ @@ -493,10 +497,12 @@ private[state] class HDFSBackedStateStoreProvider( val mapsToRemove = loadedMaps.keys.filter(_ < earliestVersionToRetain).toSeq mapsToRemove.foreach(loadedMaps.remove) } - files.filter(_.version < earliestFileToRetain.version).foreach { f => + val filesToDelete = files.filter(_.version < earliestFileToRetain.version) + filesToDelete.foreach { f => fs.delete(f.path, true) } - logInfo(s"Deleted files older than ${earliestFileToRetain.version} for $this") + logInfo(s"Deleted files older than ${earliestFileToRetain.version} for $this: " + + filesToDelete.mkString(", ")) } } } catch { @@ -560,7 +566,7 @@ private[state] class HDFSBackedStateStoreProvider( } } val storeFiles = versionToFiles.values.toSeq.sortBy(_.version) - logDebug(s"Current set of files for $this: $storeFiles") + logDebug(s"Current set of files for $this: ${storeFiles.mkString(", ")}") storeFiles } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreCoordinator.scala index d945d7aff2da4..267d17623d5e5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreCoordinator.scala @@ -38,7 +38,7 @@ private case class VerifyIfInstanceActive(storeId: StateStoreId, executorId: Str private case class GetLocation(storeId: StateStoreId) extends 
StateStoreCoordinatorMessage -private case class DeactivateInstances(storeRootLocation: String) +private case class DeactivateInstances(checkpointLocation: String) extends StateStoreCoordinatorMessage private object StopCoordinator @@ -111,11 +111,13 @@ class StateStoreCoordinatorRef private(rpcEndpointRef: RpcEndpointRef) { * Class for coordinating instances of [[StateStore]]s loaded in executors across the cluster, * and get their locations for job scheduling. */ -private class StateStoreCoordinator(override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint { +private class StateStoreCoordinator(override val rpcEnv: RpcEnv) + extends ThreadSafeRpcEndpoint with Logging { private val instances = new mutable.HashMap[StateStoreId, ExecutorCacheTaskLocation] override def receive: PartialFunction[Any, Unit] = { case ReportActiveInstance(id, host, executorId) => + logDebug(s"Reported state store $id is active at $executorId") instances.put(id, ExecutorCacheTaskLocation(host, executorId)) } @@ -125,19 +127,25 @@ private class StateStoreCoordinator(override val rpcEnv: RpcEnv) extends ThreadS case Some(location) => location.executorId == execId case None => false } + logDebug(s"Verified that state store $id is active: $response") context.reply(response) case GetLocation(id) => - context.reply(instances.get(id).map(_.toString)) + val executorId = instances.get(id).map(_.toString) + logDebug(s"Got location of the state store $id: $executorId") + context.reply(executorId) - case DeactivateInstances(loc) => + case DeactivateInstances(checkpointLocation) => val storeIdsToRemove = - instances.keys.filter(_.checkpointLocation == loc).toSeq + instances.keys.filter(_.checkpointLocation == checkpointLocation).toSeq instances --= storeIdsToRemove + logDebug(s"Deactivating instances related to checkpoint location $checkpointLocation: " + + storeIdsToRemove.mkString(", ")) context.reply(true) case StopCoordinator => stop() // Stop before replying to ensure that endpoint name has been deregistered + logInfo("StateStoreCoordinator stopped") context.reply(true) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala index 06f1bd6c3bcc7..fcf300b3c81bb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala @@ -367,7 +367,10 @@ class StateStoreSuite extends SparkFunSuite with BeforeAndAfter with PrivateMeth val conf = new SparkConf() .setMaster("local") .setAppName("test") + // Make maintenance thread do snapshots and cleanups very fast .set(StateStore.MAINTENANCE_INTERVAL_CONFIG, "10ms") + // Make sure that when SparkContext stops, the StateStore maintenance thread 'quickly' + // fails to talk to the StateStoreCoordinator and unloads all the StateStores .set("spark.rpc.numRetries", "1") val opId = 0 val dir = Utils.createDirectory(tempDir, Random.nextString(5)).toString @@ -377,37 +380,49 @@ class StateStoreSuite extends SparkFunSuite with BeforeAndAfter with PrivateMeth val provider = new HDFSBackedStateStoreProvider( storeId, keySchema, valueSchema, storeConf, hadoopConf) + var latestStoreVersion = 0 + + def generateStoreVersions() { + for (i <- 1 to 20) { + val store = StateStore.get( + storeId, keySchema, valueSchema, latestStoreVersion, storeConf, hadoopConf) + put(store, "a", i) + store.commit() + 
latestStoreVersion += 1 + } + } quietly { withSpark(new SparkContext(conf)) { sc => withCoordinatorRef(sc) { coordinatorRef => require(!StateStore.isMaintenanceRunning, "StateStore is unexpectedly running") - for (i <- 1 to 20) { - val store = StateStore.get( - storeId, keySchema, valueSchema, i - 1, storeConf, hadoopConf) - put(store, "a", i) - store.commit() - } + // Generate sufficient versions of store for snapshots + generateStoreVersions() eventually(timeout(10 seconds)) { + // Store should have been reported to the coordinator assert(coordinatorRef.getLocation(storeId).nonEmpty, "active instance was not reported") - } - // Background maintenance should clean up and generate snapshots - assert(StateStore.isMaintenanceRunning, "Maintenance task is not running") - - eventually(timeout(10 seconds)) { - // Earliest delta file should get cleaned up - assert(!fileExists(provider, 1, isSnapshot = false), "earliest file not deleted") + // Background maintenance should clean up and generate snapshots + assert(StateStore.isMaintenanceRunning, "Maintenance task is not running") // Some snapshots should have been generated - val snapshotVersions = (0 to 20).filter { version => + val snapshotVersions = (1 to latestStoreVersion).filter { version => fileExists(provider, version, isSnapshot = true) } assert(snapshotVersions.nonEmpty, "no snapshot file found") } + // Generate more versions such that there is another snapshot and + // the earliest delta file will be cleaned up + generateStoreVersions() + + // Earliest delta file should get cleaned up + eventually(timeout(10 seconds)) { + assert(!fileExists(provider, 1, isSnapshot = false), "earliest file not deleted") + } + // If driver decides to deactivate all instances of the store, then this instance // should be unloaded coordinatorRef.deactivateInstances(dir) @@ -416,7 +431,7 @@ class StateStoreSuite extends SparkFunSuite with BeforeAndAfter with PrivateMeth } // Reload the store and verify - StateStore.get(storeId, keySchema, valueSchema, 20, storeConf, hadoopConf) + StateStore.get(storeId, keySchema, valueSchema, latestStoreVersion, storeConf, hadoopConf) assert(StateStore.isLoaded(storeId)) // If some other executor loads the store, then this instance should be unloaded @@ -426,14 +441,14 @@ class StateStoreSuite extends SparkFunSuite with BeforeAndAfter with PrivateMeth } // Reload the store and verify - StateStore.get(storeId, keySchema, valueSchema, 20, storeConf, hadoopConf) + StateStore.get(storeId, keySchema, valueSchema, latestStoreVersion, storeConf, hadoopConf) assert(StateStore.isLoaded(storeId)) } } // Verify if instance is unloaded if SparkContext is stopped - require(SparkEnv.get === null) eventually(timeout(10 seconds)) { + require(SparkEnv.get === null) assert(!StateStore.isLoaded(storeId)) assert(!StateStore.isMaintenanceRunning) } From 84a33999082af88ea6365cdb5c7232ed0933b1c6 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 25 Oct 2016 08:42:21 +0800 Subject: [PATCH 105/162] [SPARK-18028][SQL] simplify TableFileCatalog ## What changes were proposed in this pull request? Simplify/cleanup TableFileCatalog: 1. pass a `CatalogTable` instead of `databaseName` and `tableName` into `TableFileCatalog`, so that we don't need to fetch table metadata from metastore again 2. In `TableFileCatalog.filterPartitions0`, DO NOT set `PartitioningAwareFileCatalog.BASE_PATH_PARAM`. 
According to the [classdoc](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala#L189-L209), the default value of `basePath` already satisfies our need. What's more, if we set this parameter, we may break case 2, which is mentioned in the classdoc. 3. add `equals` and `hashCode` to `TableFileCatalog` 4. add `SessionCatalog.listPartitionsByFilter` which handles case sensitivity. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #15568 from cloud-fan/table-file-catalog.
--- .../sql/catalyst/catalog/SessionCatalog.scala | 14 +++++ .../datasources/TableFileCatalog.scala | 54 ++++++++++--------- .../spark/sql/hive/HiveMetastoreCatalog.scala | 4 +- .../spark/sql/hive/CachedTableSuite.scala | 41 +++++++++++++- .../PruneFileSourcePartitionsSuite.scala | 7 +-- 5 files changed, 84 insertions(+), 36 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 9711131d88a05..3d6eec81c03c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -755,6 +755,20 @@ class SessionCatalog( externalCatalog.listPartitions(db, table, partialSpec) } + /** + * List the metadata of partitions that belong to the specified table, assuming it exists, that + * satisfy the given partition-pruning predicate expressions. + */ + def listPartitionsByFilter( + tableName: TableIdentifier, + predicates: Seq[Expression]): Seq[CatalogTablePartition] = { + val db = formatDatabaseName(tableName.database.getOrElse(getCurrentDatabase)) + val table = formatTableName(tableName.table) + requireDbExists(db) + requireTableExists(TableIdentifier(table, Option(db))) + externalCatalog.listPartitionsByFilter(db, table, predicates) + } + /** * Verify if the input partition spec exactly matches the existing defined partition spec * The columns must be the same but the orders could be different. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala index 31a01bc6db082..667379b222c48 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala @@ -20,36 +20,30 @@ package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.StructType /** * A [[FileCatalog]] for a metastore catalog table.
* * @param sparkSession a [[SparkSession]] - * @param db the table's database name - * @param table the table's (unqualified) name - * @param partitionSchema the schema of a partitioned table's partition columns + * @param table the metadata of the table * @param sizeInBytes the table's data size in bytes - * @param fileStatusCache optional cache implementation to use for file listing */ class TableFileCatalog( sparkSession: SparkSession, - db: String, - table: String, - partitionSchema: Option[StructType], + val table: CatalogTable, override val sizeInBytes: Long) extends FileCatalog { protected val hadoopConf = sparkSession.sessionState.newHadoopConf private val fileStatusCache = FileStatusCache.newCache(sparkSession) - private val externalCatalog = sparkSession.sharedState.externalCatalog + assert(table.identifier.database.isDefined, + "The table identifier must be qualified in TableFileCatalog") - private val catalogTable = externalCatalog.getTable(db, table) - - private val baseLocation = catalogTable.storage.locationUri + private val baseLocation = table.storage.locationUri override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq @@ -66,24 +60,32 @@ class TableFileCatalog( * @param filters partition-pruning filters */ def filterPartitions(filters: Seq[Expression]): ListingFileCatalog = { - val parameters = baseLocation - .map(loc => Map(PartitioningAwareFileCatalog.BASE_PATH_PARAM -> loc)) - .getOrElse(Map.empty) - partitionSchema match { - case Some(schema) => - val selectedPartitions = externalCatalog.listPartitionsByFilter(db, table, filters) - val partitions = selectedPartitions.map { p => - PartitionPath(p.toRow(schema), p.storage.locationUri.get) - } - val partitionSpec = PartitionSpec(schema, partitions) - new PrunedTableFileCatalog( - sparkSession, new Path(baseLocation.get), fileStatusCache, partitionSpec) - case None => - new ListingFileCatalog(sparkSession, rootPaths, parameters, None, fileStatusCache) + if (table.partitionColumnNames.nonEmpty) { + val selectedPartitions = sparkSession.sessionState.catalog.listPartitionsByFilter( + table.identifier, filters) + val partitionSchema = table.partitionSchema + val partitions = selectedPartitions.map { p => + PartitionPath(p.toRow(partitionSchema), p.storage.locationUri.get) + } + val partitionSpec = PartitionSpec(partitionSchema, partitions) + new PrunedTableFileCatalog( + sparkSession, new Path(baseLocation.get), fileStatusCache, partitionSpec) + } else { + new ListingFileCatalog(sparkSession, rootPaths, table.storage.properties, None) } } override def inputFiles: Array[String] = filterPartitions(Nil).inputFiles + + // `TableFileCatalog` may be a member of `HadoopFsRelation`, `HadoopFsRelation` may be a member + // of `LogicalRelation`, and `LogicalRelation` may be used as the cache key. So we need to + // implement `equals` and `hashCode` here, to make it work with cache lookup. 
+ override def equals(o: Any): Boolean = o match { + case other: TableFileCatalog => this.table.identifier == other.table.identifier + case _ => false + } + + override def hashCode(): Int = table.identifier.hashCode() } /** diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 44089335e1a1d..6c1585d5f5617 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -226,12 +226,10 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log Some(partitionSchema)) val logicalRelation = cached.getOrElse { - val db = metastoreRelation.databaseName - val table = metastoreRelation.tableName val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong val fileCatalog = { val catalog = new TableFileCatalog( - sparkSession, db, table, Some(partitionSchema), sizeInBytes) + sparkSession, metastoreRelation.catalogTable, sizeInBytes) if (lazyPruningEnabled) { catalog } else { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index 7d4ef6f26a600..ecdf4f14b3985 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -19,12 +19,15 @@ package org.apache.spark.sql.hive import java.io.File -import org.apache.spark.sql.{AnalysisException, QueryTest, SaveMode} +import org.apache.spark.sql.{AnalysisException, Dataset, QueryTest, SaveMode} import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, TableFileCatalog} +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types.StructType import org.apache.spark.storage.RDDBlockId import org.apache.spark.util.Utils @@ -317,4 +320,40 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto sql("DROP TABLE cachedTable") } + + test("cache a table using TableFileCatalog") { + withTable("test") { + sql("CREATE TABLE test(i int) PARTITIONED BY (p int) STORED AS parquet") + val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test") + val tableFileCatalog = new TableFileCatalog(spark, tableMeta, 0) + + val dataSchema = StructType(tableMeta.schema.filterNot { f => + tableMeta.partitionColumnNames.contains(f.name) + }) + val relation = HadoopFsRelation( + location = tableFileCatalog, + partitionSchema = tableMeta.partitionSchema, + dataSchema = dataSchema, + bucketSpec = None, + fileFormat = new ParquetFileFormat(), + options = Map.empty)(sparkSession = spark) + + val plan = LogicalRelation(relation, catalogTable = Some(tableMeta)) + spark.sharedState.cacheManager.cacheQuery(Dataset.ofRows(spark, plan)) + + assert(spark.sharedState.cacheManager.lookupCachedData(plan).isDefined) + + val sameCatalog = new TableFileCatalog(spark, tableMeta, 0) + val sameRelation = HadoopFsRelation( + location = sameCatalog, + partitionSchema = tableMeta.partitionSchema, + dataSchema = dataSchema, + 
bucketSpec = None, + fileFormat = new ParquetFileFormat(), + options = Map.empty)(sparkSession = spark) + val samePlan = LogicalRelation(sameRelation, catalogTable = Some(tableMeta)) + + assert(spark.sharedState.cacheManager.lookupCachedData(samePlan).isDefined) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala index 346ea0ca4367e..59639aacf3a3f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala @@ -45,12 +45,7 @@ class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with Te |LOCATION '${dir.getAbsolutePath}'""".stripMargin) val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test") - val tableFileCatalog = new TableFileCatalog( - spark, - tableMeta.database, - tableMeta.identifier.table, - Some(tableMeta.partitionSchema), - 0) + val tableFileCatalog = new TableFileCatalog(spark, tableMeta, 0) val dataSchema = StructType(tableMeta.schema.filterNot { f => tableMeta.partitionColumnNames.contains(f.name) From d479c5262276b47302659bd877a9e3467400bdb6 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 25 Oct 2016 10:47:11 +0800 Subject: [PATCH 106/162] [SPARK-17409][SQL][FOLLOW-UP] Do Not Optimize Query in CTAS More Than Once ### What changes were proposed in this pull request? This follow-up PR is for addressing the [comment](https://github.com/apache/spark/pull/15048). We added two test cases based on the suggestion from yhuai . One is a new test case using the `saveAsTable` API to create a data source table. Another is for CTAS on Hive serde table. Note: No need to backport this PR to 2.0. Will submit a new PR to backport the whole fix with new test cases to Spark 2.0 ### How was this patch tested? N/A Author: gatorsmile Closes #15459 from gatorsmile/ctasOptimizedTestCases. 
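(For context, and not part of the patch itself: the failure mode these tests guard against looks roughly like the sketch below, distilled from the new test cases. It assumes an active `SparkSession` named `spark`, e.g. in spark-shell.)

```scala
spark.sql("select 0 as id").createOrReplaceTempView("foo")
val df = spark.sql("select * from foo group by id")
// Before the fix, the CTAS/saveAsTable path re-optimized an already-optimized plan, so the
// GROUP BY ordinal rewrite ran twice and failed with:
//   "GROUP BY position 0 is not in select list (valid range is [1, 1])"
df.write.mode("overwrite").saveAsTable("bar")
```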
--- .../org/apache/spark/sql/DataFrameSuite.scala | 18 +++++++++++++++++ .../sources/CreateTableAsSelectSuite.scala | 2 +- .../sql/hive/MetastoreRelationSuite.scala | 20 +++++++++++++++++-- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index e87baa454c8b3..3fb7eeefba67f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1599,6 +1599,24 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { assert(df.persist.take(1).apply(0).toSeq(100).asInstanceOf[Long] == 100) } + test("SPARK-17409: Do Not Optimize Query in CTAS (Data source tables) More Than Once") { + withTable("bar") { + withTempView("foo") { + withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "json") { + sql("select 0 as id").createOrReplaceTempView("foo") + val df = sql("select * from foo group by id") + // If we optimize the query in CTAS more than once, the following saveAsTable will fail + // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])` + df.write.mode("overwrite").saveAsTable("bar") + checkAnswer(spark.table("bar"), Row(0) :: Nil) + val tableMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bar")) + assert(tableMetadata.provider == Some("json"), + "the expected table is a data source table using json") + } + } + } + } + test("copy results for sampling with replacement") { val df = Seq((1, 0), (2, 0), (3, 0)).toDF("a", "b") val sampleDf = df.sample(true, 2.00) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index c39005f6a1063..5cc9467395adc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -238,7 +238,7 @@ class CreateTableAsSelectSuite } } - test("CTAS of decimal calculation") { + test("SPARK-17409: CTAS of decimal calculation") { withTable("tab2") { withTempView("tab1") { spark.range(99, 101).createOrReplaceTempView("tab1") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala index c28e41a85c39d..91ff711445e82 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala @@ -17,12 +17,14 @@ package org.apache.spark.sql.hive -import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.{QueryTest, Row} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.{IntegerType, StructField, StructType} -class MetastoreRelationSuite extends SparkFunSuite { +class MetastoreRelationSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("makeCopy and toJSON should work") { val table = CatalogTable( identifier = TableIdentifier("test", Some("db")), @@ -36,4 +38,18 @@ class MetastoreRelationSuite extends SparkFunSuite { // No exception should be thrown 
relation.toJSON } + + test("SPARK-17409: Do Not Optimize Query in CTAS (Hive Serde Table) More Than Once") { + withTable("bar") { + withTempView("foo") { + sql("select 0 as id").createOrReplaceTempView("foo") + // If we optimize the query in CTAS more than once, the following saveAsTable will fail + // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])` + sql("CREATE TABLE bar AS SELECT * FROM foo group by id") + checkAnswer(spark.table("bar"), Row(0) :: Nil) + val tableMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bar")) + assert(tableMetadata.provider == Some("hive"), "the expected table is a Hive serde table") + } + } + } } From 483c37c581fedc64b218e294ecde1a7bb4b2af9c Mon Sep 17 00:00:00 2001 From: Kay Ousterhout Date: Mon, 24 Oct 2016 20:16:00 -0700 Subject: [PATCH 107/162] [SPARK-17894][HOTFIX] Fix broken build from The named parameter in an overridden class isn't supported in Scala 2.10 so was breaking the build. cc zsxwing Author: Kay Ousterhout Closes #15617 from kayousterhout/hotfix. --- core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala b/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala index f395fe9804c91..a757041299411 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala @@ -22,7 +22,7 @@ import org.apache.spark.TaskContext class FakeTask( stageId: Int, partitionId: Int, - prefLocs: Seq[TaskLocation] = Nil) extends Task[Int](stageId, stageAttemptId = 0, partitionId) { + prefLocs: Seq[TaskLocation] = Nil) extends Task[Int](stageId, 0, partitionId) { override def runTask(context: TaskContext): Int = 0 override def preferredLocations: Seq[TaskLocation] = prefLocs } From 78d740a08a04b74b49b5cba4bb6a821631390ab4 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 24 Oct 2016 23:47:59 -0700 Subject: [PATCH 108/162] [SPARK-17748][ML] One pass solver for Weighted Least Squares with ElasticNet ## What changes were proposed in this pull request? 1. Make a pluggable solver interface for `WeightedLeastSquares` 2. Add a `QuasiNewton` solver to handle elastic net regularization for `WeightedLeastSquares` 3. Add method `BLAS.dspmv` used by QN solver 4. Add mechanism for WLS to handle singular covariance matrices by falling back to QN solver when Cholesky fails. ## How was this patch tested? Unit tests - see below. ## Design choices **Pluggable Normal Solver** Before, the `WeightedLeastSquares` package always used the Cholesky decomposition solver to compute the solution to the normal equations. Now, we specify the solver as a constructor argument to the `WeightedLeastSquares`. We introduce a new trait: ````scala private[ml] sealed trait NormalEquationSolver { def solve( bBar: Double, bbBar: Double, abBar: DenseVector, aaBar: DenseVector, aBar: DenseVector): NormalEquationSolution } ```` We extend this trait for different variants of normal equation solvers. In the future, we can easily add others (like QR) using this interface. **Always train in the standardized space** The normal solver did not previously standardize the data, but this patch introduces a change such that we always solve the normal equations in the standardized space. We convert back to the original space in the same way that is done for distributed L-BFGS/OWL-QN. We add test cases for zero variance features/labels. 
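(As a rough illustration, not the exact code in this patch: solving in the standardized space means the returned solution has to be mapped back to the original scale, essentially by multiplying each coefficient by the ratio of the label and feature standard deviations, with zero-variance features pinned to a zero coefficient. A minimal sketch, where `bStd` and `aStd` are assumed to denote the label and per-feature population standard deviations:)

```scala
// Sketch: map coefficients solved in the standardized space back to the original scale.
def unscale(
    coefStd: Array[Double],
    interceptStd: Double,
    bStd: Double,
    aStd: Array[Double]): (Array[Double], Double) = {
  val coef = coefStd.zip(aStd).map { case (c, s) =>
    if (s != 0.0) c * bStd / s else 0.0  // constant features keep a zero coefficient
  }
  // Features are only scaled (not centered) in this formulation, so the intercept
  // only needs the label scaling undone.
  (coef, interceptStd * bStd)
}
```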
**Use L-BFGS locally to solve normal equations for singular matrix** When linear regression with the normal solver is called for a singular matrix, we initially try to solve with Cholesky. We use the output of `lapack.dppsv` to determine if the matrix is singular. If it is, we fall back to using L-BFGS locally to solve the normal equations. We add test cases for this as well. ## Test cases I found it helpful to enumerate some of the test cases and hopefully it makes review easier. **WeightedLeastSquares** 1. Constant columns - Cholesky solver fails with no regularization, Auto solver falls back to QN, and QN trains successfully. 2. Collinear features - Cholesky solver fails with no regularization, Auto solver falls back to QN, and QN trains successfully. 3. Label is constant zero - no training is performed regardless of intercept. Coefficients are zero and intercept is zero. 4. Label is constant - if fitIntercept, then no training is performed and intercept equals label mean. If not fitIntercept, then we train and return an answer that matches R's lm package. 5. Test with L1 - go through various combinations of L1/L2, standardization, fitIntercept and verify that output matches glmnet. 6. Initial intercept - verify that setting the initial intercept to label mean is correct by training model with strong L1 regularization so that all coefficients are zero and intercept converges to label mean. 7. Test diagInvAtWA - since we are standardizing features now during training, we should test that the inverse is computed to match R. **LinearRegression** 1. For all existing L1 test cases, test the "normal" solver too. 2. Check that using the normal solver now handles singular matrices. 3. Check that using the normal solver with L1 produces an objective history in the model summary, but does not produce the inverse of AtA. **BLAS** 1. Test new method `dspmv`. ## Performance Testing This patch will speed up linear regression with L1/elasticnet penalties when the feature size is < 4096. I have not conducted performance tests at scale, only observed by testing locally that there is a speed improvement. We should decide if this PR needs to be blocked before performance testing is conducted. Author: sethah Closes #15394 from sethah/SPARK-17748. --- .../org/apache/spark/ml/linalg/BLAS.scala | 18 + .../apache/spark/ml/linalg/BLASSuite.scala | 45 ++ .../IterativelyReweightedLeastSquares.scala | 4 +- .../spark/ml/optim/NormalEquationSolver.scala | 163 +++++++ .../spark/ml/optim/WeightedLeastSquares.scala | 270 +++++++++-- .../GeneralizedLinearRegression.scala | 4 +- .../ml/regression/LinearRegression.scala | 20 +- .../mllib/linalg/CholeskyDecomposition.scala | 4 +- ...erativelyReweightedLeastSquaresSuite.scala | 6 +- .../ml/optim/WeightedLeastSquaresSuite.scala | 400 ++++++++++++++-- .../ml/regression/LinearRegressionSuite.scala | 431 +++++++++--------- 11 files changed, 1057 insertions(+), 308 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala index 4ca19f3387f07..ef3890962494d 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala @@ -243,6 +243,24 @@ private[spark] object BLAS extends Serializable { spr(alpha, v, U.values) } + /** + * y := alpha*A*x + beta*y + * + * @param n The order of the n by n matrix A. 
+ * @param A The upper triangular part of A in a [[DenseVector]] (column major). + * @param x The [[DenseVector]] transformed by A. + * @param y The [[DenseVector]] to be modified in place. + */ + def dspmv( + n: Int, + alpha: Double, + A: DenseVector, + x: DenseVector, + beta: Double, + y: DenseVector): Unit = { + f2jBLAS.dspmv("U", n, alpha, A.values, x.values, 1, beta, y.values, 1) + } + /** * Adds alpha * x * x.t to a matrix in-place. This is the same as BLAS's ?SPR. * diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/BLASSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/BLASSuite.scala index 6e72a5fff0a91..877ac68983348 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/BLASSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/BLASSuite.scala @@ -422,4 +422,49 @@ class BLASSuite extends SparkMLFunSuite { assert(dATT.multiply(sx) ~== expected absTol 1e-15) assert(sATT.multiply(sx) ~== expected absTol 1e-15) } + + test("spmv") { + /* + A = [[3.0, -2.0, 2.0, -4.0], + [-2.0, -8.0, 4.0, 7.0], + [2.0, 4.0, -3.0, -3.0], + [-4.0, 7.0, -3.0, 0.0]] + x = [5.0, 2.0, -1.0, -9.0] + Ax = [ 45., -93., 48., -3.] + */ + val A = new DenseVector(Array(3.0, -2.0, -8.0, 2.0, 4.0, -3.0, -4.0, 7.0, -3.0, 0.0)) + val x = new DenseVector(Array(5.0, 2.0, -1.0, -9.0)) + val n = 4 + + val y1 = new DenseVector(Array(-3.0, 6.0, -8.0, -3.0)) + val y2 = y1.copy + val y3 = y1.copy + val y4 = y1.copy + val y5 = y1.copy + val y6 = y1.copy + val y7 = y1.copy + + val expected1 = new DenseVector(Array(42.0, -87.0, 40.0, -6.0)) + val expected2 = new DenseVector(Array(19.5, -40.5, 16.0, -4.5)) + val expected3 = new DenseVector(Array(-25.5, 52.5, -32.0, -1.5)) + val expected4 = new DenseVector(Array(-3.0, 6.0, -8.0, -3.0)) + val expected5 = new DenseVector(Array(43.5, -90.0, 44.0, -4.5)) + val expected6 = new DenseVector(Array(46.5, -96.0, 52.0, -1.5)) + val expected7 = new DenseVector(Array(45.0, -93.0, 48.0, -3.0)) + + dspmv(n, 1.0, A, x, 1.0, y1) + dspmv(n, 0.5, A, x, 1.0, y2) + dspmv(n, -0.5, A, x, 1.0, y3) + dspmv(n, 0.0, A, x, 1.0, y4) + dspmv(n, 1.0, A, x, 0.5, y5) + dspmv(n, 1.0, A, x, -0.5, y6) + dspmv(n, 1.0, A, x, 0.0, y7) + assert(y1 ~== expected1 absTol 1e-8) + assert(y2 ~== expected2 absTol 1e-8) + assert(y3 ~== expected3 absTol 1e-8) + assert(y4 ~== expected4 absTol 1e-8) + assert(y5 ~== expected5 absTol 1e-8) + assert(y6 ~== expected6 absTol 1e-8) + assert(y7 ~== expected7 absTol 1e-8) + } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala index d732f53029e8c..8a6b862cda170 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala @@ -81,8 +81,8 @@ private[ml] class IterativelyReweightedLeastSquares( } // Estimate new model - model = new WeightedLeastSquares(fitIntercept, regParam, standardizeFeatures = false, - standardizeLabel = false).fit(newInstances) + model = new WeightedLeastSquares(fitIntercept, regParam, elasticNetParam = 0.0, + standardizeFeatures = false, standardizeLabel = false).fit(newInstances) // Check convergence val oldCoefficients = oldModel.coefficients diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala new file mode 100644 index 
0000000000000..2f5299b010223 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.optim + +import breeze.linalg.{DenseVector => BDV} +import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN} +import scala.collection.mutable + +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vectors} +import org.apache.spark.mllib.linalg.CholeskyDecomposition + +/** + * A class to hold the solution to the normal equations A^T^ W A x = A^T^ W b. + * + * @param coefficients The least squares coefficients. The last element in the coefficients + * is the intercept when bias is added to A. + * @param aaInv An option containing the upper triangular part of (A^T^ W A)^-1^, in column major + * format. None when an optimization program is used to solve the normal equations. + * @param objectiveHistory Option containing the objective history when an optimization program is + * used to solve the normal equations. None when an analytic solver is used. + */ +private[ml] class NormalEquationSolution( + val coefficients: Array[Double], + val aaInv: Option[Array[Double]], + val objectiveHistory: Option[Array[Double]]) + +/** + * Interface for classes that solve the normal equations locally. + */ +private[ml] sealed trait NormalEquationSolver { + + /** Solve the normal equations from summary statistics. */ + def solve( + bBar: Double, + bbBar: Double, + abBar: DenseVector, + aaBar: DenseVector, + aBar: DenseVector): NormalEquationSolution +} + +/** + * A class that solves the normal equations directly, using Cholesky decomposition. + */ +private[ml] class CholeskySolver extends NormalEquationSolver { + + def solve( + bBar: Double, + bbBar: Double, + abBar: DenseVector, + aaBar: DenseVector, + aBar: DenseVector): NormalEquationSolution = { + val k = abBar.size + val x = CholeskyDecomposition.solve(aaBar.values, abBar.values) + val aaInv = CholeskyDecomposition.inverse(aaBar.values, k) + + new NormalEquationSolution(x, Some(aaInv), None) + } +} + +/** + * A class for solving the normal equations using Quasi-Newton optimization methods. 
+ */ +private[ml] class QuasiNewtonSolver( + fitIntercept: Boolean, + maxIter: Int, + tol: Double, + l1RegFunc: Option[(Int) => Double]) extends NormalEquationSolver { + + def solve( + bBar: Double, + bbBar: Double, + abBar: DenseVector, + aaBar: DenseVector, + aBar: DenseVector): NormalEquationSolution = { + val numFeatures = aBar.size + val numFeaturesPlusIntercept = if (fitIntercept) numFeatures + 1 else numFeatures + val initialCoefficientsWithIntercept = new Array[Double](numFeaturesPlusIntercept) + if (fitIntercept) { + initialCoefficientsWithIntercept(numFeaturesPlusIntercept - 1) = bBar + } + + val costFun = + new NormalEquationCostFun(bBar, bbBar, abBar, aaBar, aBar, fitIntercept, numFeatures) + val optimizer = l1RegFunc.map { func => + new BreezeOWLQN[Int, BDV[Double]](maxIter, 10, func, tol) + }.getOrElse(new BreezeLBFGS[BDV[Double]](maxIter, 10, tol)) + + val states = optimizer.iterations(new CachedDiffFunction(costFun), + new BDV[Double](initialCoefficientsWithIntercept)) + + val arrayBuilder = mutable.ArrayBuilder.make[Double] + var state: optimizer.State = null + while (states.hasNext) { + state = states.next() + arrayBuilder += state.adjustedValue + } + val x = state.x.toArray.clone() + new NormalEquationSolution(x, None, Some(arrayBuilder.result())) + } + + /** + * NormalEquationCostFun implements Breeze's DiffFunction[T] for the normal equation. + * It returns the loss and gradient with L2 regularization at a particular point (coefficients). + * It's used in Breeze's convex optimization routines. + */ + private class NormalEquationCostFun( + bBar: Double, + bbBar: Double, + ab: DenseVector, + aa: DenseVector, + aBar: DenseVector, + fitIntercept: Boolean, + numFeatures: Int) extends DiffFunction[BDV[Double]] { + + private val numFeaturesPlusIntercept = if (fitIntercept) numFeatures + 1 else numFeatures + + override def calculate(coefficients: BDV[Double]): (Double, BDV[Double]) = { + val coef = Vectors.fromBreeze(coefficients).toDense + if (fitIntercept) { + var j = 0 + var dotProd = 0.0 + val coefValues = coef.values + val aBarValues = aBar.values + while (j < numFeatures) { + dotProd += coefValues(j) * aBarValues(j) + j += 1 + } + coefValues(numFeatures) = bBar - dotProd + } + val aax = new DenseVector(new Array[Double](numFeaturesPlusIntercept)) + BLAS.dspmv(numFeaturesPlusIntercept, 1.0, aa, coef, 1.0, aax) + // loss = 1/2 (b^T W b - 2 x^T A^T W b + x^T A^T W A x) + val loss = 0.5 * bbBar - BLAS.dot(ab, coef) + 0.5 * BLAS.dot(coef, aax) + // gradient = A^T W A x - A^T W b + BLAS.axpy(-1.0, ab, aax) + (loss, aax.asBreeze.toDenseVector) + } + } +} + +/** + * Exception thrown when solving a linear system Ax = b for which the matrix A is non-invertible + * (singular). 
+ */ +class SingularMatrixException(message: String, cause: Throwable) + extends IllegalArgumentException(message, cause) { + + def this(message: String) = this(message, null) +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala index 8f5f4427e1f4b..2223f126f1b69 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala @@ -20,19 +20,21 @@ package org.apache.spark.ml.optim import org.apache.spark.internal.Logging import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg._ -import org.apache.spark.mllib.linalg.CholeskyDecomposition import org.apache.spark.rdd.RDD /** * Model fitted by [[WeightedLeastSquares]]. + * * @param coefficients model coefficients * @param intercept model intercept * @param diagInvAtWA diagonal of matrix (A^T * W * A)^-1 + * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. */ private[ml] class WeightedLeastSquaresModel( val coefficients: DenseVector, val intercept: Double, - val diagInvAtWA: DenseVector) extends Serializable { + val diagInvAtWA: DenseVector, + val objectiveHistory: Array[Double]) extends Serializable { def predict(features: Vector): Double = { BLAS.dot(coefficients, features) + intercept @@ -44,35 +46,52 @@ private[ml] class WeightedLeastSquaresModel( * Given weighted observations (w,,i,,, a,,i,,, b,,i,,), we use the following weighted least squares * formulation: * - * min,,x,z,, 1/2 sum,,i,, w,,i,, (a,,i,,^T^ x + z - b,,i,,)^2^ / sum,,i,, w_i - * + 1/2 lambda / delta sum,,j,, (sigma,,j,, x,,j,,)^2^, + * min,,x,z,, 1/2 sum,,i,, w,,i,, (a,,i,,^T^ x + z - b,,i,,)^2^ / sum,,i,, w,,i,, + * + lambda / delta (1/2 (1 - alpha) sumj,, (sigma,,j,, x,,j,,)^2^ + * + alpha sum,,j,, abs(sigma,,j,, x,,j,,)), * - * where lambda is the regularization parameter, and delta and sigma,,j,, are controlled by - * [[standardizeLabel]] and [[standardizeFeatures]], respectively. + * where lambda is the regularization parameter, alpha is the ElasticNet mixing parameter, + * and delta and sigma,,j,, are controlled by [[standardizeLabel]] and [[standardizeFeatures]], + * respectively. * * Set [[regParam]] to 0.0 and turn off both [[standardizeFeatures]] and [[standardizeLabel]] to * match R's `lm`. * Turn on [[standardizeLabel]] to match R's `glmnet`. * + * @note The coefficients and intercept are always trained in the scaled space, but are returned + * on the original scale. [[standardizeFeatures]] and [[standardizeLabel]] can be used to + * control whether regularization is applied in the original space or the scaled space. * @param fitIntercept whether to fit intercept. If false, z is 0.0. - * @param regParam L2 regularization parameter (lambda) - * @param standardizeFeatures whether to standardize features. If true, sigma_,,j,, is the + * @param regParam Regularization parameter (lambda). + * @param elasticNetParam the ElasticNet mixing parameter (alpha). + * @param standardizeFeatures whether to standardize features. If true, sigma,,j,, is the * population standard deviation of the j-th column of A. Otherwise, * sigma,,j,, is 1.0. * @param standardizeLabel whether to standardize label. If true, delta is the population standard * deviation of the label column b. Otherwise, delta is 1.0. + * @param solverType the type of solver to use for optimization. 
+ * @param maxIter maximum number of iterations. Only for QuasiNewton solverType. + * @param tol the convergence tolerance of the iterations. Only for QuasiNewton solverType. */ private[ml] class WeightedLeastSquares( val fitIntercept: Boolean, val regParam: Double, + val elasticNetParam: Double, val standardizeFeatures: Boolean, - val standardizeLabel: Boolean) extends Logging with Serializable { + val standardizeLabel: Boolean, + val solverType: WeightedLeastSquares.Solver = WeightedLeastSquares.Auto, + val maxIter: Int = 100, + val tol: Double = 1e-6) extends Logging with Serializable { import WeightedLeastSquares._ require(regParam >= 0.0, s"regParam cannot be negative: $regParam") if (regParam == 0.0) { logWarning("regParam is zero, which might cause numerical instability and overfitting.") } + require(elasticNetParam >= 0.0 && elasticNetParam <= 1.0, + s"elasticNetParam must be in [0, 1]: $elasticNetParam") + require(maxIter >= 0, s"maxIter must be a positive integer: $maxIter") + require(tol > 0, s"tol must be greater than zero: $tol") /** * Creates a [[WeightedLeastSquaresModel]] from an RDD of [[Instance]]s. @@ -85,73 +104,198 @@ private[ml] class WeightedLeastSquares( val triK = summary.triK val wSum = summary.wSum val bBar = summary.bBar - val bStd = summary.bStd + val bbBar = summary.bbBar val aBar = summary.aBar - val aVar = summary.aVar + val aStd = summary.aStd val abBar = summary.abBar val aaBar = summary.aaBar - val aaValues = aaBar.values - - if (bStd == 0) { - if (fitIntercept) { - logWarning(s"The standard deviation of the label is zero, so the coefficients will be " + - s"zeros and the intercept will be the mean of the label; as a result, " + - s"training is not needed.") - val coefficients = new DenseVector(Array.ofDim(k-1)) + val numFeatures = abBar.size + val rawBStd = summary.bStd + // if b is constant (rawBStd is zero), then b cannot be scaled. In this case + // setting bStd=abs(bBar) ensures that b is not scaled anymore in l-bfgs algorithm. + val bStd = if (rawBStd == 0.0) math.abs(bBar) else rawBStd + + if (rawBStd == 0) { + if (fitIntercept || bBar == 0.0) { + if (bBar == 0.0) { + logWarning(s"Mean and standard deviation of the label are zero, so the coefficients " + + s"and the intercept will all be zero; as a result, training is not needed.") + } else { + logWarning(s"The standard deviation of the label is zero, so the coefficients will be " + + s"zeros and the intercept will be the mean of the label; as a result, " + + s"training is not needed.") + } + val coefficients = new DenseVector(Array.ofDim(numFeatures)) val intercept = bBar val diagInvAtWA = new DenseVector(Array(0D)) - return new WeightedLeastSquaresModel(coefficients, intercept, diagInvAtWA) + return new WeightedLeastSquaresModel(coefficients, intercept, diagInvAtWA, Array(0D)) + } else { + require(!(regParam > 0.0 && standardizeLabel), "The standard deviation of the label is " + + "zero. Model cannot be regularized with standardization=true") + logWarning(s"The standard deviation of the label is zero. Consider setting " + + s"fitIntercept=true.") + } + } + + // scale aBar to standardized space in-place + val aBarValues = aBar.values + var j = 0 + while (j < numFeatures) { + if (aStd(j) == 0.0) { + aBarValues(j) = 0.0 } else { - require(!(regParam > 0.0 && standardizeLabel), - "The standard deviation of the label is zero. " + - "Model cannot be regularized with standardization=true") - logWarning(s"The standard deviation of the label is zero. 
" + - "Consider setting fitIntercept=true.") + aBarValues(j) /= aStd(j) + } + j += 1 + } + + // scale abBar to standardized space in-place + val abBarValues = abBar.values + val aStdValues = aStd.values + j = 0 + while (j < numFeatures) { + if (aStdValues(j) == 0.0) { + abBarValues(j) = 0.0 + } else { + abBarValues(j) /= (aStdValues(j) * bStd) + } + j += 1 + } + + // scale aaBar to standardized space in-place + val aaBarValues = aaBar.values + j = 0 + var p = 0 + while (j < numFeatures) { + val aStdJ = aStdValues(j) + var i = 0 + while (i <= j) { + val aStdI = aStdValues(i) + if (aStdJ == 0.0 || aStdI == 0.0) { + aaBarValues(p) = 0.0 + } else { + aaBarValues(p) /= (aStdI * aStdJ) + } + p += 1 + i += 1 } + j += 1 } - // add regularization to diagonals + val bBarStd = bBar / bStd + val bbBarStd = bbBar / (bStd * bStd) + + val effectiveRegParam = regParam / bStd + val effectiveL1RegParam = elasticNetParam * effectiveRegParam + val effectiveL2RegParam = (1.0 - elasticNetParam) * effectiveRegParam + + // add L2 regularization to diagonals var i = 0 - var j = 2 + j = 2 while (i < triK) { - var lambda = regParam - if (standardizeFeatures) { - lambda *= aVar(j - 2) + var lambda = effectiveL2RegParam + if (!standardizeFeatures) { + val std = aStd(j - 2) + if (std != 0.0) { + lambda /= (std * std) + } else { + lambda = 0.0 + } } - if (standardizeLabel && bStd != 0) { - lambda /= bStd + if (!standardizeLabel) { + lambda *= bStd } - aaValues(i) += lambda + aaBarValues(i) += lambda i += j j += 1 } + val aa = getAtA(aaBar.values, aBar.values) + val ab = getAtB(abBar.values, bBarStd) - val aa = if (fitIntercept) { - Array.concat(aaBar.values, aBar.values, Array(1.0)) + val solver = if ((solverType == WeightedLeastSquares.Auto && elasticNetParam != 0.0 && + regParam != 0.0) || (solverType == WeightedLeastSquares.QuasiNewton)) { + val effectiveL1RegFun: Option[(Int) => Double] = if (effectiveL1RegParam != 0.0) { + Some((index: Int) => { + if (fitIntercept && index == numFeatures) { + 0.0 + } else { + if (standardizeFeatures) { + effectiveL1RegParam + } else { + if (aStdValues(index) != 0.0) effectiveL1RegParam / aStdValues(index) else 0.0 + } + } + }) + } else { + None + } + new QuasiNewtonSolver(fitIntercept, maxIter, tol, effectiveL1RegFun) } else { - aaBar.values + new CholeskySolver + } + + val solution = solver match { + case cholesky: CholeskySolver => + try { + cholesky.solve(bBarStd, bbBarStd, ab, aa, aBar) + } catch { + // if Auto solver is used and Cholesky fails due to singular AtA, then fall back to + // quasi-newton solver + case _: SingularMatrixException if solverType == WeightedLeastSquares.Auto => + logWarning("Cholesky solver failed due to singular covariance matrix. 
" + + "Retrying with Quasi-Newton solver.") + // ab and aa were modified in place, so reconstruct them + val _aa = getAtA(aaBar.values, aBar.values) + val _ab = getAtB(abBar.values, bBarStd) + val newSolver = new QuasiNewtonSolver(fitIntercept, maxIter, tol, None) + newSolver.solve(bBarStd, bbBarStd, _ab, _aa, aBar) + } + case qn: QuasiNewtonSolver => + qn.solve(bBarStd, bbBarStd, ab, aa, aBar) } - val ab = if (fitIntercept) { - Array.concat(abBar.values, Array(bBar)) + val (coefficientArray, intercept) = if (fitIntercept) { + (solution.coefficients.slice(0, solution.coefficients.length - 1), + solution.coefficients.last * bStd) } else { - abBar.values + (solution.coefficients, 0.0) } - val x = CholeskyDecomposition.solve(aa, ab) - - val aaInv = CholeskyDecomposition.inverse(aa, k) + // convert the coefficients from the scaled space to the original space + var q = 0 + val len = coefficientArray.length + while (q < len) { + coefficientArray(q) *= { if (aStdValues(q) != 0.0) bStd / aStdValues(q) else 0.0 } + q += 1 + } // aaInv is a packed upper triangular matrix, here we get all elements on diagonal - val diagInvAtWA = new DenseVector((1 to k).map { i => - aaInv(i + (i - 1) * i / 2 - 1) / wSum }.toArray) + val diagInvAtWA = solution.aaInv.map { inv => + new DenseVector((1 to k).map { i => + val multiplier = if (i == k && fitIntercept) 1.0 else aStdValues(i - 1) * aStdValues(i - 1) + inv(i + (i - 1) * i / 2 - 1) / (wSum * multiplier) + }.toArray) + }.getOrElse(new DenseVector(Array(0D))) - val (coefficients, intercept) = if (fitIntercept) { - (new DenseVector(x.slice(0, x.length - 1)), x.last) + new WeightedLeastSquaresModel(new DenseVector(coefficientArray), intercept, diagInvAtWA, + solution.objectiveHistory.getOrElse(Array(0D))) + } + + /** Construct A^T^ A from summary statistics. */ + private def getAtA(aaBar: Array[Double], aBar: Array[Double]): DenseVector = { + if (fitIntercept) { + new DenseVector(Array.concat(aaBar, aBar, Array(1.0))) } else { - (new DenseVector(x), 0.0) + new DenseVector(aaBar.clone()) } + } - new WeightedLeastSquaresModel(coefficients, intercept, diagInvAtWA) + /** Construct A^T^ b from summary statistics. */ + private def getAtB(abBar: Array[Double], bBar: Double): DenseVector = { + if (fitIntercept) { + new DenseVector(Array.concat(abBar, Array(bBar))) + } else { + new DenseVector(abBar.clone()) + } } } @@ -163,6 +307,13 @@ private[ml] object WeightedLeastSquares { */ val MAX_NUM_FEATURES: Int = 4096 + sealed trait Solver + case object Auto extends Solver + case object Cholesky extends Solver + case object QuasiNewton extends Solver + + val supportedSolvers = Array(Auto, Cholesky, QuasiNewton) + /** * Aggregator to provide necessary summary statistics for solving [[WeightedLeastSquares]]. */ @@ -262,6 +413,11 @@ private[ml] object WeightedLeastSquares { */ def bBar: Double = bSum / wSum + /** + * Weighted mean of squared labels. + */ + def bbBar: Double = bbSum / wSum + /** * Weighted population standard deviation of labels. */ @@ -285,6 +441,24 @@ private[ml] object WeightedLeastSquares { output } + /** + * Weighted population standard deviation of features. + */ + def aStd: DenseVector = { + val std = Array.ofDim[Double](k) + var i = 0 + var j = 2 + val aaValues = aaSum.values + while (i < triK) { + val l = j - 2 + val aw = aSum(l) / wSum + std(l) = math.sqrt(aaValues(i) / wSum - aw * aw) + i += j + j += 1 + } + new DenseVector(std) + } + /** * Weighted population variance of features. 
*/ diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index bb9e150c49772..33cb25c8c7f66 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -262,7 +262,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val if (familyObj == Gaussian && linkObj == Identity) { // TODO: Make standardizeFeatures and standardizeLabel configurable. - val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam), + val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam), elasticNetParam = 0.0, standardizeFeatures = true, standardizeLabel = true) val wlsModel = optimizer.fit(instances) val model = copyValues( @@ -337,7 +337,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine Instance(eta, instance.weight, instance.features) } // TODO: Make standardizeFeatures and standardizeLabel configurable. - val initialModel = new WeightedLeastSquares(fitIntercept, regParam, + val initialModel = new WeightedLeastSquares(fitIntercept, regParam, elasticNetParam = 0.0, standardizeFeatures = true, standardizeLabel = true) .fit(newInstances) initialModel diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 025ed20c75a04..519f3bdec82df 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -31,7 +31,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.linalg.BLAS._ -import org.apache.spark.ml.optim.WeightedLeastSquares +import org.apache.spark.ml.optim.{NormalEquationSolver, WeightedLeastSquares} import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ @@ -177,6 +177,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String * If the dimensions of features or the number of partitions are large, * this param could be adjusted to a larger size. * Default is 2. + * * @group expertSetParam */ @Since("2.1.0") @@ -194,21 +195,18 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String Instance(label, weight, features) } - if (($(solver) == "auto" && $(elasticNetParam) == 0.0 && + if (($(solver) == "auto" && numFeatures <= WeightedLeastSquares.MAX_NUM_FEATURES) || $(solver) == "normal") { - require($(elasticNetParam) == 0.0, "Only L2 regularization can be used when normal " + - "solver is used.'") - // For low dimensional data, WeightedLeastSquares is more efficiently since the + // For low dimensional data, WeightedLeastSquares is more efficient since the // training algorithm only requires one pass through the data. (SPARK-10668) val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam), - $(standardization), true) + elasticNetParam = $(elasticNetParam), $(standardization), true, + solverType = WeightedLeastSquares.Auto, maxIter = $(maxIter), tol = $(tol)) val model = optimizer.fit(instances) // When it is trained by WeightedLeastSquares, training summary does not - // attached returned model. 
+ // attach returned model. val lrModel = copyValues(new LinearRegressionModel(uid, model.coefficients, model.intercept)) - // WeightedLeastSquares does not run through iterations. So it does not generate - // an objective history. val (summaryModel, predictionColName) = lrModel.findSummaryModelAndPredictionCol() val trainingSummary = new LinearRegressionTrainingSummary( summaryModel.transform(dataset), @@ -217,7 +215,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String $(featuresCol), summaryModel, model.diagInvAtWA.toArray, - Array(0D)) + model.objectiveHistory) return lrModel.setSummary(trainingSummary) } @@ -243,7 +241,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String val yMean = ySummarizer.mean(0) val rawYStd = math.sqrt(ySummarizer.variance(0)) if (rawYStd == 0.0) { - if ($(fitIntercept) || yMean==0.0) { + if ($(fitIntercept) || yMean == 0.0) { // If the rawYStd is zero and fitIntercept=true, then the intercept is yMean with // zero coefficient; as a result, training is not needed. // Also, if yMean==0 and rawYStd==0, all the coefficients are zero regardless of diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/CholeskyDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/CholeskyDecomposition.scala index 08f8f19c1e77d..68771f1afbe8c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/CholeskyDecomposition.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/CholeskyDecomposition.scala @@ -20,6 +20,8 @@ package org.apache.spark.mllib.linalg import com.github.fommil.netlib.LAPACK.{getInstance => lapack} import org.netlib.util.intW +import org.apache.spark.ml.optim.SingularMatrixException + /** * Compute Cholesky decomposition. */ @@ -60,7 +62,7 @@ private[spark] object CholeskyDecomposition { case code if code < 0 => throw new IllegalStateException(s"LAPACK.$method returned $code; arg ${-code} is illegal") case code if code > 0 => - throw new IllegalArgumentException( + throw new SingularMatrixException ( s"LAPACK.$method returned $code because A is not positive definite. Is A derived from " + "a singular matrix (e.g. 
collinear column values)?") case _ => // do nothing diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquaresSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquaresSuite.scala index b30d995794d4c..50260952ecb66 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquaresSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquaresSuite.scala @@ -85,7 +85,7 @@ class IterativelyReweightedLeastSquaresSuite extends SparkFunSuite with MLlibTes val eta = math.log(mu / (1.0 - mu)) Instance(eta, instance.weight, instance.features) } - val initial = new WeightedLeastSquares(fitIntercept, regParam = 0.0, + val initial = new WeightedLeastSquares(fitIntercept, regParam = 0.0, elasticNetParam = 0.0, standardizeFeatures = false, standardizeLabel = false).fit(newInstances) val irls = new IterativelyReweightedLeastSquares(initial, BinomialReweightFunc, fitIntercept, regParam = 0.0, maxIter = 25, tol = 1e-8).fit(instances1) @@ -122,7 +122,7 @@ class IterativelyReweightedLeastSquaresSuite extends SparkFunSuite with MLlibTes val eta = math.log(mu) Instance(eta, instance.weight, instance.features) } - val initial = new WeightedLeastSquares(fitIntercept, regParam = 0.0, + val initial = new WeightedLeastSquares(fitIntercept, regParam = 0.0, elasticNetParam = 0.0, standardizeFeatures = false, standardizeLabel = false).fit(newInstances) val irls = new IterativelyReweightedLeastSquares(initial, PoissonReweightFunc, fitIntercept, regParam = 0.0, maxIter = 25, tol = 1e-8).fit(instances2) @@ -155,7 +155,7 @@ class IterativelyReweightedLeastSquaresSuite extends SparkFunSuite with MLlibTes var idx = 0 for (fitIntercept <- Seq(false, true)) { - val initial = new WeightedLeastSquares(fitIntercept, regParam = 0.0, + val initial = new WeightedLeastSquares(fitIntercept, regParam = 0.0, elasticNetParam = 0.0, standardizeFeatures = false, standardizeLabel = false).fit(instances2) val irls = new IterativelyReweightedLeastSquares(initial, L1RegressionReweightFunc, fitIntercept, regParam = 0.0, maxIter = 200, tol = 1e-7).fit(instances2) diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala index 2cb1af0dee0bc..5f638b4880058 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.optim import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.Instance -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.linalg.{BLAS, Vectors} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD @@ -28,6 +28,9 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext private var instances: RDD[Instance] = _ private var instancesConstLabel: RDD[Instance] = _ + private var instancesConstZeroLabel: RDD[Instance] = _ + private var collinearInstances: RDD[Instance] = _ + private var constantFeaturesInstances: RDD[Instance] = _ override def beforeAll(): Unit = { super.beforeAll() @@ -58,26 +61,121 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext Instance(17.0, 3.0, Vectors.dense(2.0, 11.0)), Instance(17.0, 4.0, Vectors.dense(3.0, 13.0)) ), 2) - } - 
test("two collinear features result in error with no regularization") { - val singularInstances = sc.parallelize(Seq( + /* + A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2) + b <- c(1, 2, 3, 4) + w <- c(1, 1, 1, 1) + */ + collinearInstances = sc.parallelize(Seq( Instance(1.0, 1.0, Vectors.dense(1.0, 2.0)), Instance(2.0, 1.0, Vectors.dense(2.0, 4.0)), Instance(3.0, 1.0, Vectors.dense(3.0, 6.0)), Instance(4.0, 1.0, Vectors.dense(4.0, 8.0)) ), 2) - intercept[IllegalArgumentException] { - new WeightedLeastSquares( - false, regParam = 0.0, standardizeFeatures = false, - standardizeLabel = false).fit(singularInstances) + /* + R code: + + A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2) + b.const <- c(0, 0, 0, 0) + w <- c(1, 2, 3, 4) + */ + instancesConstZeroLabel = sc.parallelize(Seq( + Instance(0.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), + Instance(0.0, 2.0, Vectors.dense(1.0, 7.0)), + Instance(0.0, 3.0, Vectors.dense(2.0, 11.0)), + Instance(0.0, 4.0, Vectors.dense(3.0, 13.0)) + ), 2) + + /* + R code: + + A <- matrix(c(1, 1, 1, 1, 5, 7, 11, 13), 4, 2) + b <- c(17, 19, 23, 29) + w <- c(1, 2, 3, 4) + */ + constantFeaturesInstances = sc.parallelize(Seq( + Instance(17.0, 1.0, Vectors.dense(1.0, 5.0)), + Instance(19.0, 2.0, Vectors.dense(1.0, 7.0)), + Instance(23.0, 3.0, Vectors.dense(1.0, 11.0)), + Instance(29.0, 4.0, Vectors.dense(1.0, 13.0)) + ), 2) + } + + test("WLS with strong L1 regularization") { + /* + We initialize the coefficients for WLS QN solver to be weighted average of the label. Check + here that with only an intercept the model converges to bBar. + */ + val bAgg = instances.collect().foldLeft((0.0, 0.0)) { + case ((sum, weightSum), Instance(l, w, f)) => (sum + w * l, weightSum + w) } + val bBar = bAgg._1 / bAgg._2 + val wls = new WeightedLeastSquares(true, 10, 1.0, true, true) + val model = wls.fit(instances) + assert(model.intercept ~== bBar relTol 1e-6) + } - // Should not throw an exception - new WeightedLeastSquares( - false, regParam = 1.0, standardizeFeatures = false, - standardizeLabel = false).fit(singularInstances) + test("diagonal inverse of AtWA") { + /* + library(Matrix) + A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2) + w <- c(1, 2, 3, 4) + W <- Diagonal(length(w), w) + A.intercept <- cbind(A, rep.int(1, length(w))) + AtA.intercept <- t(A.intercept) %*% W %*% A.intercept + inv.intercept <- solve(AtA.intercept) + print(diag(inv.intercept)) + [1] 4.02 0.50 12.02 + + AtA <- t(A) %*% W %*% A + inv <- solve(AtA) + print(diag(inv)) + [1] 0.48336106 0.02079867 + + */ + val expectedWithIntercept = Vectors.dense(4.02, 0.50, 12.02) + val expected = Vectors.dense(0.48336106, 0.02079867) + val wlsWithIntercept = new WeightedLeastSquares(fitIntercept = true, regParam = 0.0, + elasticNetParam = 0.0, standardizeFeatures = true, standardizeLabel = true, + solverType = WeightedLeastSquares.Cholesky) + val wlsModelWithIntercept = wlsWithIntercept.fit(instances) + val wls = new WeightedLeastSquares(false, 0.0, 0.0, true, true, + solverType = WeightedLeastSquares.Cholesky) + val wlsModel = wls.fit(instances) + + assert(expectedWithIntercept ~== wlsModelWithIntercept.diagInvAtWA relTol 1e-4) + assert(expected ~== wlsModel.diagInvAtWA relTol 1e-4) + } + + test("two collinear features") { + // Cholesky solver does not handle singular input + intercept[SingularMatrixException] { + new WeightedLeastSquares(fitIntercept = false, regParam = 0.0, elasticNetParam = 0.0, + standardizeFeatures = false, standardizeLabel = false, + solverType = WeightedLeastSquares.Cholesky).fit(collinearInstances) + } + + 
// Cholesky should not throw an exception since regularization is applied + new WeightedLeastSquares(fitIntercept = false, regParam = 1.0, elasticNetParam = 0.0, + standardizeFeatures = false, standardizeLabel = false, + solverType = WeightedLeastSquares.Cholesky).fit(collinearInstances) + + // quasi-newton solvers should handle singular input and make correct predictions + // auto solver should try Cholesky first, then fall back to QN + for (fitIntercept <- Seq(false, true); + standardization <- Seq(false, true); + solver <- Seq(WeightedLeastSquares.Auto, WeightedLeastSquares.QuasiNewton)) { + val singularModel = new WeightedLeastSquares(fitIntercept, regParam = 0.0, + elasticNetParam = 0.0, standardizeFeatures = standardization, + standardizeLabel = standardization, solverType = solver).fit(collinearInstances) + + collinearInstances.collect().foreach { case Instance(l, w, f) => + val pred = BLAS.dot(singularModel.coefficients, f) + singularModel.intercept + assert(pred ~== l absTol 1e-6) + } + } } test("WLS against lm") { @@ -100,13 +198,15 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext var idx = 0 for (fitIntercept <- Seq(false, true)) { - for (standardization <- Seq(false, true)) { - val wls = new WeightedLeastSquares( - fitIntercept, regParam = 0.0, standardizeFeatures = standardization, - standardizeLabel = standardization).fit(instances) - val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) - assert(actual ~== expected(idx) absTol 1e-4) - } + for (standardization <- Seq(false, true)) { + for (solver <- WeightedLeastSquares.supportedSolvers) { + val wls = new WeightedLeastSquares(fitIntercept, regParam = 0.0, elasticNetParam = 0.0, + standardizeFeatures = standardization, standardizeLabel = standardization, + solverType = solver).fit(instances) + val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) + assert(actual ~== expected(idx) absTol 1e-4) + } + } idx += 1 } } @@ -132,28 +232,256 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext var idx = 0 for (fitIntercept <- Seq(false, true)) { for (standardization <- Seq(false, true)) { - val wls = new WeightedLeastSquares( - fitIntercept, regParam = 0.0, standardizeFeatures = standardization, - standardizeLabel = standardization).fit(instancesConstLabel) - val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) - assert(actual ~== expected(idx) absTol 1e-4) + for (solver <- WeightedLeastSquares.supportedSolvers) { + val wls = new WeightedLeastSquares(fitIntercept, regParam = 0.0, elasticNetParam = 0.0, + standardizeFeatures = standardization, standardizeLabel = standardization, + solverType = solver).fit(instancesConstLabel) + val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) + assert(actual ~== expected(idx) absTol 1e-4) + } } idx += 1 } + + // when label is constant zero, and fitIntercept is false, we should not train and get all zeros + for (solver <- WeightedLeastSquares.supportedSolvers) { + val wls = new WeightedLeastSquares(fitIntercept = false, regParam = 0.0, + elasticNetParam = 0.0, standardizeFeatures = true, standardizeLabel = true, + solverType = solver).fit(instancesConstZeroLabel) + val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) + assert(actual === Vectors.dense(0.0, 0.0, 0.0)) + assert(wls.objectiveHistory === Array(0.0)) + } } test("WLS with regularization when label is constant") { // if 
regParam is non-zero and standardization is true, the problem is ill-defined and // an exception is thrown. - val wls = new WeightedLeastSquares( - fitIntercept = false, regParam = 0.1, standardizeFeatures = true, - standardizeLabel = true) - intercept[IllegalArgumentException]{ - wls.fit(instancesConstLabel) + for (solver <- WeightedLeastSquares.supportedSolvers) { + val wls = new WeightedLeastSquares(fitIntercept = false, regParam = 0.1, + elasticNetParam = 0.0, standardizeFeatures = true, standardizeLabel = true, + solverType = solver) + intercept[IllegalArgumentException]{ + wls.fit(instancesConstLabel) + } } } - test("WLS against glmnet") { + test("WLS against glmnet with constant features") { + // Cholesky solver does not handle singular input with no regularization + for (fitIntercept <- Seq(false, true); + standardization <- Seq(false, true)) { + val wls = new WeightedLeastSquares(fitIntercept, regParam = 0.0, elasticNetParam = 0.0, + standardizeFeatures = standardization, standardizeLabel = standardization, + solverType = WeightedLeastSquares.Cholesky) + intercept[SingularMatrixException] { + wls.fit(constantFeaturesInstances) + } + } + + // Cholesky also fails when regularization is added but we don't wish to standardize + val wls = new WeightedLeastSquares(true, regParam = 0.5, elasticNetParam = 0.0, + standardizeFeatures = false, standardizeLabel = false, + solverType = WeightedLeastSquares.Cholesky) + intercept[SingularMatrixException] { + wls.fit(constantFeaturesInstances) + } + + /* + for (intercept in c(FALSE, TRUE)) { + model <- glmnet(A, b, weights=w, intercept=intercept, lambda=0.5, + standardize=T, alpha=0.0, thresh=1E-14) + print(as.vector(coef(model))) + } + [1] 0.000000 0.000000 2.235802 + [1] 9.798771 0.000000 1.365503 + */ + // should not fail when regularization and standardization are added + val expectedCholesky = Seq( + Vectors.dense(0.0, 0.0, 2.235802), + Vectors.dense(9.798771, 0.0, 1.365503) + ) + var idx = 0 + for (fitIntercept <- Seq(false, true)) { + val wls = new WeightedLeastSquares(fitIntercept = fitIntercept, regParam = 0.5, + elasticNetParam = 0.0, standardizeFeatures = true, + standardizeLabel = true, solverType = WeightedLeastSquares.Cholesky) + .fit(constantFeaturesInstances) + val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) + assert(actual ~== expectedCholesky(idx) absTol 1e-6) + idx += 1 + } + + /* + for (intercept in c(FALSE, TRUE)) { + for (standardize in c(FALSE, TRUE)) { + for (regParams in list(c(0.0, 0.0), c(0.5, 0.0), c(0.5, 0.5), c(0.5, 1.0))) { + model <- glmnet(A, b, weights=w, intercept=intercept, lambda=regParams[1], + standardize=standardize, alpha=regParams[2], thresh=1E-14) + print(as.vector(coef(model))) + } + } + } + [1] 0.000000 0.000000 2.253012 + [1] 0.000000 0.000000 2.250857 + [1] 0.000000 0.000000 2.249784 + [1] 0.000000 0.000000 2.248709 + [1] 0.000000 0.000000 2.253012 + [1] 0.000000 0.000000 2.235802 + [1] 0.000000 0.000000 2.238297 + [1] 0.000000 0.000000 2.240811 + [1] 8.218905 0.000000 1.517413 + [1] 8.434286 0.000000 1.496703 + [1] 8.648497 0.000000 1.476106 + [1] 8.865672 0.000000 1.455224 + [1] 8.218905 0.000000 1.517413 + [1] 9.798771 0.000000 1.365503 + [1] 9.919095 0.000000 1.353933 + [1] 10.052804 0.000000 1.341077 + */ + val expectedQuasiNewton = Seq( + Vectors.dense(0.000000, 0.000000, 2.253012), + Vectors.dense(0.000000, 0.000000, 2.250857), + Vectors.dense(0.000000, 0.000000, 2.249784), + Vectors.dense(0.000000, 0.000000, 2.248709), + Vectors.dense(0.000000, 0.000000, 
2.253012), + Vectors.dense(0.000000, 0.000000, 2.235802), + Vectors.dense(0.000000, 0.000000, 2.238297), + Vectors.dense(0.000000, 0.000000, 2.240811), + Vectors.dense(8.218905, 0.000000, 1.517413), + Vectors.dense(8.434286, 0.000000, 1.496703), + Vectors.dense(8.648497, 0.000000, 1.476106), + Vectors.dense(8.865672, 0.000000, 1.455224), + Vectors.dense(8.218905, 0.000000, 1.517413), + Vectors.dense(9.798771, 0.000000, 1.365503), + Vectors.dense(9.919095, 0.000000, 1.353933), + Vectors.dense(10.052804, 0.000000, 1.341077)) + + idx = 0 + for (fitIntercept <- Seq(false, true); + standardization <- Seq(false, true); + (lambda, alpha) <- Seq((0.0, 0.0), (0.5, 0.0), (0.5, 0.5), (0.5, 1.0))) { + for (solver <- Seq(WeightedLeastSquares.Auto, WeightedLeastSquares.Cholesky)) { + val wls = new WeightedLeastSquares(fitIntercept, regParam = lambda, elasticNetParam = alpha, + standardizeFeatures = standardization, standardizeLabel = true, + solverType = WeightedLeastSquares.QuasiNewton) + val model = wls.fit(constantFeaturesInstances) + val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) + assert(actual ~== expectedQuasiNewton(idx) absTol 1e-6) + } + idx += 1 + } + } + + test("WLS against glmnet with L1/ElasticNet regularization") { + /* + R code: + + library(glmnet) + + for (intercept in c(FALSE, TRUE)) { + for (lambda in c(0.1, 0.5, 1.0)) { + for (standardize in c(FALSE, TRUE)) { + for (alpha in c(0.1, 0.5, 1.0)) { + model <- glmnet(A, b, weights=w, intercept=intercept, lambda=lambda, + standardize=standardize, alpha=alpha, thresh=1E-14) + print(as.vector(coef(model))) + } + } + } + } + [1] 0.000000 -3.292821 2.921188 + [1] 0.000000 -3.230854 2.908484 + [1] 0.000000 -3.145586 2.891014 + [1] 0.000000 -2.919246 2.841724 + [1] 0.000000 -2.938323 2.846369 + [1] 0.000000 -2.965397 2.852838 + [1] 0.000000 -2.137858 2.684464 + [1] 0.000000 -1.680094 2.590844 + [1] 0.0000000 -0.8194631 2.4151405 + [1] 0.0000000 -0.9608375 2.4301013 + [1] 0.0000000 -0.6187922 2.3634907 + [1] 0.000000 0.000000 2.240811 + [1] 0.000000 -1.346573 2.521293 + [1] 0.0000000 -0.3680456 2.3212362 + [1] 0.000000 0.000000 2.244406 + [1] 0.000000 0.000000 2.219816 + [1] 0.000000 0.000000 2.223694 + [1] 0.00000 0.00000 2.22861 + [1] 13.5631592 3.2811513 0.3725517 + [1] 13.6953934 3.3336271 0.3497454 + [1] 13.9600276 3.4600170 0.2999941 + [1] 14.2389889 3.6589920 0.2349065 + [1] 15.2374080 4.2119643 0.0325638 + [1] 15.4 4.3 0.0 + [1] 10.442365 1.246065 1.063991 + [1] 8.9580718 0.1938471 1.4090610 + [1] 8.865672 0.000000 1.455224 + [1] 13.0430927 2.4927151 0.5741805 + [1] 13.814429 2.722027 0.455915 + [1] 16.2 3.9 0.0 + [1] 9.8904768 0.7574694 1.2110177 + [1] 9.072226 0.000000 1.435363 + [1] 9.512438 0.000000 1.393035 + [1] 13.3677796 2.1721216 0.6046132 + [1] 14.2554457 2.2285185 0.5084151 + [1] 17.2 3.4 0.0 + */ + + val expected = Seq( + Vectors.dense(0, -3.2928206726474, 2.92118822588649), + Vectors.dense(0, -3.23085414359003, 2.90848366035008), + Vectors.dense(0, -3.14558628299477, 2.89101408157209), + Vectors.dense(0, -2.91924558816421, 2.84172398097327), + Vectors.dense(0, -2.93832343383477, 2.84636891947663), + Vectors.dense(0, -2.96539689593024, 2.85283836322185), + Vectors.dense(0, -2.13785756976542, 2.68446351346705), + Vectors.dense(0, -1.68009377560774, 2.59084422793154), + Vectors.dense(0, -0.819463123385533, 2.41514053108346), + Vectors.dense(0, -0.960837488151064, 2.43010130999756), + Vectors.dense(0, -0.618792151647599, 2.36349074148962), + Vectors.dense(0, 0, 2.24081114726441), + 
Vectors.dense(0, -1.34657309253953, 2.52129296638512), + Vectors.dense(0, -0.368045602821844, 2.32123616258871), + Vectors.dense(0, 0, 2.24440619621343), + Vectors.dense(0, 0, 2.21981559944924), + Vectors.dense(0, 0, 2.22369447413621), + Vectors.dense(0, 0, 2.22861024633605), + Vectors.dense(13.5631591827557, 3.28115132060568, 0.372551747695477), + Vectors.dense(13.6953934007661, 3.3336271417751, 0.349745414969587), + Vectors.dense(13.960027608754, 3.46001702257532, 0.29999407173994), + Vectors.dense(14.2389889013085, 3.65899196445023, 0.234906458633754), + Vectors.dense(15.2374079667397, 4.21196428071551, 0.0325637953681963), + Vectors.dense(15.4, 4.3, 0), + Vectors.dense(10.4423647474653, 1.24606545153166, 1.06399080283378), + Vectors.dense(8.95807177856822, 0.193847088148233, 1.4090609658784), + Vectors.dense(8.86567164179104, 0, 1.45522388059702), + Vectors.dense(13.0430927453034, 2.49271514356687, 0.574180477650271), + Vectors.dense(13.8144287399675, 2.72202744354555, 0.455915035859752), + Vectors.dense(16.2, 3.9, 0), + Vectors.dense(9.89047681835741, 0.757469417613661, 1.21101772561685), + Vectors.dense(9.07222551185964, 0, 1.43536293155196), + Vectors.dense(9.51243781094527, 0, 1.39303482587065), + Vectors.dense(13.3677796362763, 2.17212164262107, 0.604613180623227), + Vectors.dense(14.2554457236073, 2.22851848830683, 0.508415124978748), + Vectors.dense(17.2, 3.4, 0) + ) + + var idx = 0 + for (fitIntercept <- Seq(false, true); + regParam <- Seq(0.1, 0.5, 1.0); + standardizeFeatures <- Seq(false, true); + elasticNetParam <- Seq(0.1, 0.5, 1.0)) { + val wls = new WeightedLeastSquares(fitIntercept, regParam, elasticNetParam = elasticNetParam, + standardizeFeatures, standardizeLabel = true, solverType = WeightedLeastSquares.Auto) + .fit(instances) + val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) + assert(actual ~== expected(idx) absTol 1e-4) + idx += 1 + } + } + + test("WLS against glmnet with L2 regularization") { /* R code: @@ -201,11 +529,13 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext for (fitIntercept <- Seq(false, true); regParam <- Seq(0.0, 0.1, 1.0); standardizeFeatures <- Seq(false, true)) { - val wls = new WeightedLeastSquares( - fitIntercept, regParam, standardizeFeatures, standardizeLabel = true) - .fit(instances) - val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) - assert(actual ~== expected(idx) absTol 1e-4) + for (solver <- WeightedLeastSquares.supportedSolvers) { + val wls = new WeightedLeastSquares(fitIntercept, regParam, elasticNetParam = 0.0, + standardizeFeatures, standardizeLabel = true, solverType = solver) + .fit(instances) + val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) + assert(actual ~== expected(idx) absTol 1e-4) + } idx += 1 } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index 1c94ec67d79d1..c0e8afbf5e346 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -57,7 +57,7 @@ class LinearRegressionSuite xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2).map(_.asML).toDF() val r = new Random(seed) - // When feature size is larger than 4096, normal optimizer is choosed + // When feature size is larger than 4096, normal optimizer is chosen // as the 
solver of linear regression in the case of "auto" mode. val featureSize = 4100 datasetWithSparseFeature = sc.parallelize(LinearDataGenerator.generateLinearInput( @@ -155,6 +155,42 @@ class LinearRegressionSuite assert(model.numFeatures === numFeatures) } + test("linear regression handles singular matrices") { + // check for both constant columns with intercept (zero std) and collinear + val singularDataConstantColumn = sc.parallelize(Seq( + Instance(17.0, 1.0, Vectors.dense(1.0, 5.0).toSparse), + Instance(19.0, 2.0, Vectors.dense(1.0, 7.0)), + Instance(23.0, 3.0, Vectors.dense(1.0, 11.0)), + Instance(29.0, 4.0, Vectors.dense(1.0, 13.0)) + ), 2).toDF() + + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer = new LinearRegression().setSolver(solver).setFitIntercept(true) + val model = trainer.fit(singularDataConstantColumn) + // to make it clear that WLS did not solve analytically + intercept[UnsupportedOperationException] { + model.summary.coefficientStandardErrors + } + assert(model.summary.objectiveHistory !== Array(0.0)) + } + + val singularDataCollinearFeatures = sc.parallelize(Seq( + Instance(17.0, 1.0, Vectors.dense(10.0, 5.0).toSparse), + Instance(19.0, 2.0, Vectors.dense(14.0, 7.0)), + Instance(23.0, 3.0, Vectors.dense(22.0, 11.0)), + Instance(29.0, 4.0, Vectors.dense(26.0, 13.0)) + ), 2).toDF() + + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer = new LinearRegression().setSolver(solver).setFitIntercept(true) + val model = trainer.fit(singularDataCollinearFeatures) + intercept[UnsupportedOperationException] { + model.summary.coefficientStandardErrors + } + assert(model.summary.objectiveHistory !== Array(0.0)) + } + } + test("linear regression with intercept without regularization") { Seq("auto", "l-bfgs", "normal").foreach { solver => val trainer1 = new LinearRegression().setSolver(solver) @@ -233,12 +269,12 @@ class LinearRegressionSuite as.numeric.data3.V2. 4.70011 as.numeric.data3.V3. 7.19943 */ - val coefficientsWithourInterceptR = Vectors.dense(4.70011, 7.19943) + val coefficientsWithoutInterceptR = Vectors.dense(4.70011, 7.19943) assert(modelWithoutIntercept1.intercept ~== 0 absTol 1E-3) - assert(modelWithoutIntercept1.coefficients ~= coefficientsWithourInterceptR relTol 1E-3) + assert(modelWithoutIntercept1.coefficients ~= coefficientsWithoutInterceptR relTol 1E-3) assert(modelWithoutIntercept2.intercept ~== 0 absTol 1E-3) - assert(modelWithoutIntercept2.coefficients ~= coefficientsWithourInterceptR relTol 1E-3) + assert(modelWithoutIntercept2.coefficients ~= coefficientsWithoutInterceptR relTol 1E-3) } } @@ -249,55 +285,47 @@ class LinearRegressionSuite val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) .setSolver(solver).setStandardization(false) - // Normal optimizer is not supported with only L1 regularization case. - if (solver == "normal") { - intercept[IllegalArgumentException] { - trainer1.fit(datasetWithDenseFeature) - trainer2.fit(datasetWithDenseFeature) - } - } else { - val model1 = trainer1.fit(datasetWithDenseFeature) - val model2 = trainer2.fit(datasetWithDenseFeature) - - /* - coefficients <- coef(glmnet(features, label, family="gaussian", - alpha = 1.0, lambda = 0.57 )) - > coefficients - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 6.242284 - as.numeric.d1.V2. 4.019605 - as.numeric.d1.V3. 
6.679538 - */ - val interceptR1 = 6.242284 - val coefficientsR1 = Vectors.dense(4.019605, 6.679538) - assert(model1.intercept ~== interceptR1 relTol 1E-2) - assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) - - /* - coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, - lambda = 0.57, standardize=FALSE )) - > coefficients - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 6.416948 - as.numeric.data.V2. 3.893869 - as.numeric.data.V3. 6.724286 - */ - val interceptR2 = 6.416948 - val coefficientsR2 = Vectors.dense(3.893869, 6.724286) - - assert(model2.intercept ~== interceptR2 relTol 1E-3) - assert(model2.coefficients ~= coefficientsR2 relTol 1E-3) - - model1.transform(datasetWithDenseFeature).select("features", "prediction") - .collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + - model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) - } + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", + alpha = 1.0, lambda = 0.57 )) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 6.242284 + as.numeric.d1.V2. 4.019605 + as.numeric.d1.V3. 6.679538 + */ + val interceptR1 = 6.242284 + val coefficientsR1 = Vectors.dense(4.019605, 6.679538) + assert(model1.intercept ~== interceptR1 relTol 1E-2) + assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, + lambda = 0.57, standardize=FALSE )) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 6.416948 + as.numeric.data.V2. 3.893869 + as.numeric.data.V3. 6.724286 + */ + val interceptR2 = 6.416948 + val coefficientsR2 = Vectors.dense(3.893869, 6.724286) + + assert(model2.intercept ~== interceptR2 relTol 1E-3) + assert(model2.coefficients ~= coefficientsR2 relTol 1E-3) + + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) } } } @@ -309,56 +337,48 @@ class LinearRegressionSuite val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) .setFitIntercept(false).setStandardization(false).setSolver(solver) - // Normal optimizer is not supported with only L1 regularization case. - if (solver == "normal") { - intercept[IllegalArgumentException] { - trainer1.fit(datasetWithDenseFeature) - trainer2.fit(datasetWithDenseFeature) - } - } else { - val model1 = trainer1.fit(datasetWithDenseFeature) - val model2 = trainer2.fit(datasetWithDenseFeature) - - /* - coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, - lambda = 0.57, intercept=FALSE )) - > coefficients - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - as.numeric.data.V2. 6.272927 - as.numeric.data.V3. 
4.782604 - */ - val interceptR1 = 0.0 - val coefficientsR1 = Vectors.dense(6.272927, 4.782604) - - assert(model1.intercept ~== interceptR1 absTol 1E-2) - assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) - - /* - coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, - lambda = 0.57, intercept=FALSE, standardize=FALSE )) - > coefficients - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - as.numeric.data.V2. 6.207817 - as.numeric.data.V3. 4.775780 - */ - val interceptR2 = 0.0 - val coefficientsR2 = Vectors.dense(6.207817, 4.775780) - - assert(model2.intercept ~== interceptR2 absTol 1E-2) - assert(model2.coefficients ~= coefficientsR2 relTol 1E-2) - - model1.transform(datasetWithDenseFeature).select("features", "prediction") - .collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + - model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) - } + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, + lambda = 0.57, intercept=FALSE )) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data.V2. 6.272927 + as.numeric.data.V3. 4.782604 + */ + val interceptR1 = 0.0 + val coefficientsR1 = Vectors.dense(6.272927, 4.782604) + + assert(model1.intercept ~== interceptR1 absTol 1E-2) + assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, + lambda = 0.57, intercept=FALSE, standardize=FALSE )) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data.V2. 6.207817 + as.numeric.data.V3. 4.775780 + */ + val interceptR2 = 0.0 + val coefficientsR2 = Vectors.dense(6.207817, 4.775780) + + assert(model2.intercept ~== interceptR2 absTol 1E-2) + assert(model2.coefficients ~= coefficientsR2 relTol 1E-2) + + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) } } } @@ -471,56 +491,48 @@ class LinearRegressionSuite val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) .setStandardization(false).setSolver(solver) - // Normal optimizer is not supported with non-zero elasticnet parameter. - if (solver == "normal") { - intercept[IllegalArgumentException] { - trainer1.fit(datasetWithDenseFeature) - trainer2.fit(datasetWithDenseFeature) - } - } else { - val model1 = trainer1.fit(datasetWithDenseFeature) - val model2 = trainer2.fit(datasetWithDenseFeature) - - /* - coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, - lambda = 1.6 )) - > coefficients - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 5.689855 - as.numeric.d1.V2. 3.661181 - as.numeric.d1.V3. 
6.000274 - */ - val interceptR1 = 5.689855 - val coefficientsR1 = Vectors.dense(3.661181, 6.000274) - - assert(model1.intercept ~== interceptR1 relTol 1E-2) - assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) - - /* - coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6 - standardize=FALSE)) - > coefficients - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 6.113890 - as.numeric.d1.V2. 3.407021 - as.numeric.d1.V3. 6.152512 - */ - val interceptR2 = 6.113890 - val coefficientsR2 = Vectors.dense(3.407021, 6.152512) - - assert(model2.intercept ~== interceptR2 relTol 1E-2) - assert(model2.coefficients ~= coefficientsR2 relTol 1E-2) - - model1.transform(datasetWithDenseFeature).select("features", "prediction") - .collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + - model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) - } + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, + lambda = 1.6 )) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 5.689855 + as.numeric.d1.V2. 3.661181 + as.numeric.d1.V3. 6.000274 + */ + val interceptR1 = 5.689855 + val coefficientsR1 = Vectors.dense(3.661181, 6.000274) + + assert(model1.intercept ~== interceptR1 relTol 1E-2) + assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6 + standardize=FALSE)) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 6.113890 + as.numeric.d1.V2. 3.407021 + as.numeric.d1.V3. 6.152512 + */ + val interceptR2 = 6.113890 + val coefficientsR2 = Vectors.dense(3.407021, 6.152512) + + assert(model2.intercept ~== interceptR2 relTol 1E-2) + assert(model2.coefficients ~= coefficientsR2 relTol 1E-2) + + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) } } } @@ -532,57 +544,49 @@ class LinearRegressionSuite val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) .setFitIntercept(false).setStandardization(false).setSolver(solver) - // Normal optimizer is not supported with non-zero elasticnet parameter. - if (solver == "normal") { - intercept[IllegalArgumentException] { - trainer1.fit(datasetWithDenseFeature) - trainer2.fit(datasetWithDenseFeature) - } - } else { - val model1 = trainer1.fit(datasetWithDenseFeature) - val model2 = trainer2.fit(datasetWithDenseFeature) - - /* - coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, - lambda = 1.6, intercept=FALSE )) - > coefficients - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - as.numeric.d1.V2. 5.643748 - as.numeric.d1.V3. 
4.331519 - */ - val interceptR1 = 0.0 - val coefficientsR1 = Vectors.dense(5.643748, 4.331519) - - assert(model1.intercept ~== interceptR1 absTol 1E-2) - assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) - - /* - coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, - lambda = 1.6, intercept=FALSE, standardize=FALSE )) - > coefficients - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - as.numeric.d1.V2. 5.455902 - as.numeric.d1.V3. 4.312266 - - */ - val interceptR2 = 0.0 - val coefficientsR2 = Vectors.dense(5.455902, 4.312266) - - assert(model2.intercept ~== interceptR2 absTol 1E-2) - assert(model2.coefficients ~= coefficientsR2 relTol 1E-2) - - model1.transform(datasetWithDenseFeature).select("features", "prediction") - .collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + - model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) - } + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, + lambda = 1.6, intercept=FALSE )) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.d1.V2. 5.643748 + as.numeric.d1.V3. 4.331519 + */ + val interceptR1 = 0.0 + val coefficientsR1 = Vectors.dense(5.643748, 4.331519) + + assert(model1.intercept ~== interceptR1 absTol 1E-2) + assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, + lambda = 1.6, intercept=FALSE, standardize=FALSE )) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.d1.V2. 5.455902 + as.numeric.d1.V3. 4.312266 + + */ + val interceptR2 = 0.0 + val coefficientsR2 = Vectors.dense(5.455902, 4.312266) + + assert(model2.intercept ~== interceptR2 absTol 1E-2) + assert(model2.coefficients ~= coefficientsR2 relTol 1E-2) + + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) } } } @@ -757,7 +761,8 @@ class LinearRegressionSuite assert(model.summary.meanAbsoluteError ~== 0.07961668 relTol 1E-4) assert(model.summary.r2 ~== 0.9998737 relTol 1E-4) - // Normal solver uses "WeightedLeastSquares". This algorithm does not generate + // Normal solver uses "WeightedLeastSquares". If no regularization is applied or only L2 + // regularization is applied, this algorithm uses a direct solver and does not generate an // objective history because it does not run through iterations. 
if (solver == "l-bfgs") { // Objective function should be monotonically decreasing for linear regression @@ -776,7 +781,7 @@ class LinearRegressionSuite val pValsR = Array(0, 0, 0) model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x => assert(x._1 ~== x._2 absTol 1E-4) } - model.summary.coefficientStandardErrors.zip(seCoefR).foreach{ x => + model.summary.coefficientStandardErrors.zip(seCoefR).foreach { x => assert(x._1 ~== x._2 absTol 1E-4) } model.summary.tValues.map(_.round).zip(tValsR).foreach{ x => assert(x._1 === x._2) } model.summary.pValues.map(_.round).zip(pValsR).foreach{ x => assert(x._1 === x._2) } @@ -950,6 +955,20 @@ class LinearRegressionSuite assert(x._1 ~== x._2 absTol 1E-3) } model.summary.tValues.zip(tValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } model.summary.pValues.zip(pValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } + + val modelWithL1 = new LinearRegression() + .setWeightCol("weight") + .setSolver("normal") + .setRegParam(0.5) + .setElasticNetParam(1.0) + .fit(datasetWithWeight) + + assert(modelWithL1.summary.objectiveHistory !== Array(0.0)) + assert( + modelWithL1.summary + .objectiveHistory + .sliding(2) + .forall(x => x(0) >= x(1))) } test("linear regression summary with weighted samples and w/o intercept by normal solver") { From 6f31833dbe0b766dfe4540a240fe92ebb7e14737 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 25 Oct 2016 15:00:33 +0800 Subject: [PATCH 109/162] [SPARK-18026][SQL] should not always lowercase partition columns of partition spec in parser ## What changes were proposed in this pull request? Currently we always lowercase the partition columns of the partition spec in the parser, with the assumption that table partition columns are always lowercased. However, this is not true for data source tables, which are case preserving. It's safe for now because data source tables don't store partition spec in metastore and don't support `ADD PARTITION`, `DROP PARTITION`, `RENAME PARTITION`, but we should make our code future-proof. This PR makes the partition spec case-preserving in the parser, and improves the `PreprocessTableInsertion` analyzer rule to normalize the partition columns in partition spec, w.r.t. the table partition columns. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #15566 from cloud-fan/partition-spec.
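For example, with the default case-insensitive resolver, the statements below now resolve differently-cased keys in a partition spec to the table's actual partition columns, and reject keys that match no partition column. This is an illustrative sketch mirroring the DDLSuite cases added in this patch; it assumes a SparkSession `spark` built with Hive support and a hypothetical Hive-serde table partitioned by lower-case columns `a` and `b`.

```scala
// Assumed setup (hypothetical table; names follow the DDLSuite tests below).
spark.sql("CREATE TABLE tab1 (col1 INT) PARTITIONED BY (a STRING, b STRING)")

// Upper-cased keys in the spec are kept by the parser and then normalized to
// the table's partition columns (a, b) before the catalog is touched.
spark.sql("ALTER TABLE tab1 ADD PARTITION (A='9', B='9')")
spark.sql("ALTER TABLE tab1 PARTITION (A='9', B='9') RENAME TO PARTITION (A='1', B='p')")
spark.sql("ALTER TABLE tab1 DROP PARTITION (A='1', B='p')")

// A key that is not a partition column now fails analysis with a message like:
//   unknownCol is not a valid partition column in table `default`.`tab1`.
spark.sql("ALTER TABLE tab1 DROP PARTITION (a='1', unknownCol='12')")
```

The same normalization is applied by `PreprocessTableInsertion` to the static keys of `INSERT INTO ... PARTITION (...)`.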
--- .../sql/catalyst/parser/AstBuilder.scala | 6 ++- .../plans/logical/basicLogicalOperators.scala | 20 +-------- .../spark/sql/execution/command/ddl.scala | 34 +++++++++++++-- .../datasources/PartitioningUtils.scala | 30 +++++++++++++ .../sql/execution/datasources/rules.scala | 41 +++++++++--------- .../sql/execution/command/DDLSuite.scala | 42 +++++++++++++++++++ .../sql/hive/client/HiveClientImpl.scala | 3 ++ .../sql/hive/InsertIntoHiveTableSuite.scala | 15 +------ .../sql/hive/execution/HiveDDLSuite.scala | 5 +-- 9 files changed, 136 insertions(+), 60 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 929c1c4f2d9e4..38e9bb6c162ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -192,11 +192,13 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { override def visitPartitionSpec( ctx: PartitionSpecContext): Map[String, Option[String]] = withOrigin(ctx) { val parts = ctx.partitionVal.asScala.map { pVal => - val name = pVal.identifier.getText.toLowerCase + val name = pVal.identifier.getText val value = Option(pVal.constant).map(visitStringConstant) name -> value } - // Check for duplicate partition columns in one spec. + // Before calling `toMap`, we check duplicated keys to avoid silently ignore partition values + // in partition spec like PARTITION(a='1', b='2', a='3'). The real semantical check for + // partition columns will be done in analyzer. checkDuplicateKeys(parts, ctx) parts.toMap } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 64a787a7ae351..a48974c6322ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -356,26 +356,10 @@ case class InsertIntoTable( override def children: Seq[LogicalPlan] = child :: Nil override def output: Seq[Attribute] = Seq.empty - lazy val expectedColumns = { - if (table.output.isEmpty) { - None - } else { - // Note: The parser (visitPartitionSpec in AstBuilder) already turns - // keys in partition to their lowercase forms. 
- val staticPartCols = partition.filter(_._2.isDefined).keySet - Some(table.output.filterNot(a => staticPartCols.contains(a.name))) - } - } - assert(overwrite || !ifNotExists) assert(partition.values.forall(_.nonEmpty) || !ifNotExists) - override lazy val resolved: Boolean = - childrenResolved && table.resolved && expectedColumns.forall { expected => - child.output.size == expected.size && child.output.zip(expected).forall { - case (childAttr, tableAttr) => - DataType.equalsIgnoreCompatibleNullability(childAttr.dataType, tableAttr.dataType) - } - } + + override lazy val resolved: Boolean = childrenResolved && table.resolved } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 45fa293e58951..15656faa08e4f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -351,8 +351,13 @@ case class AlterTableAddPartitionCommand( "ALTER TABLE ADD PARTITION is not allowed for tables defined using the datasource API") } val parts = partitionSpecsAndLocs.map { case (spec, location) => + val normalizedSpec = PartitioningUtils.normalizePartitionSpec( + spec, + table.partitionColumnNames, + table.identifier.quotedString, + sparkSession.sessionState.conf.resolver) // inherit table storage format (possibly except for location) - CatalogTablePartition(spec, table.storage.copy(locationUri = location)) + CatalogTablePartition(normalizedSpec, table.storage.copy(locationUri = location)) } catalog.createPartitions(table.identifier, parts, ignoreIfExists = ifNotExists) Seq.empty[Row] @@ -382,8 +387,21 @@ case class AlterTableRenamePartitionCommand( "ALTER TABLE RENAME PARTITION is not allowed for tables defined using the datasource API") } DDLUtils.verifyAlterTableType(catalog, table, isView = false) + + val normalizedOldPartition = PartitioningUtils.normalizePartitionSpec( + oldPartition, + table.partitionColumnNames, + table.identifier.quotedString, + sparkSession.sessionState.conf.resolver) + + val normalizedNewPartition = PartitioningUtils.normalizePartitionSpec( + newPartition, + table.partitionColumnNames, + table.identifier.quotedString, + sparkSession.sessionState.conf.resolver) + catalog.renamePartitions( - tableName, Seq(oldPartition), Seq(newPartition)) + tableName, Seq(normalizedOldPartition), Seq(normalizedNewPartition)) Seq.empty[Row] } @@ -418,7 +436,17 @@ case class AlterTableDropPartitionCommand( throw new AnalysisException( "ALTER TABLE DROP PARTITIONS is not allowed for tables defined using the datasource API") } - catalog.dropPartitions(table.identifier, specs, ignoreIfNotExists = ifExists, purge = purge) + + val normalizedSpecs = specs.map { spec => + PartitioningUtils.normalizePartitionSpec( + spec, + table.partitionColumnNames, + table.identifier.quotedString, + sparkSession.sessionState.conf.resolver) + } + + catalog.dropPartitions( + table.identifier, normalizedSpecs, ignoreIfNotExists = ifExists, purge = purge) Seq.empty[Row] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index 81bdabb7afdab..f66e8b4e2b551 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -29,6 
+29,7 @@ import org.apache.hadoop.util.Shell import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.expressions.{Cast, Literal} import org.apache.spark.sql.types._ @@ -243,6 +244,35 @@ object PartitioningUtils { } } + /** + * Normalize the column names in partition specification, w.r.t. the real partition column names + * and case sensitivity. e.g., if the partition spec has a column named `monTh`, and there is a + * partition column named `month`, and it's case insensitive, we will normalize `monTh` to + * `month`. + */ + def normalizePartitionSpec[T]( + partitionSpec: Map[String, T], + partColNames: Seq[String], + tblName: String, + resolver: Resolver): Map[String, T] = { + val normalizedPartSpec = partitionSpec.toSeq.map { case (key, value) => + val normalizedKey = partColNames.find(resolver(_, key)).getOrElse { + throw new AnalysisException(s"$key is not a valid partition column in table $tblName.") + } + normalizedKey -> value + } + + if (normalizedPartSpec.map(_._1).distinct.length != normalizedPartSpec.length) { + val duplicateColumns = normalizedPartSpec.map(_._1).groupBy(identity).collect { + case (x, ys) if ys.length > 1 => x + } + throw new AnalysisException(s"Found duplicated columns in partition specification: " + + duplicateColumns.mkString(", ")) + } + + normalizedPartSpec.toMap + } + /** * Resolves possible type conflicts between partitions by up-casting "lower" types. The up- * casting order is: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index bd6eb6e0535ab..cf501cdc919ee 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -187,8 +187,8 @@ case class AnalyzeCreateTable(sparkSession: SparkSession) extends Rule[LogicalPl colName: String, colType: String): String = { val tableCols = schema.map(_.name) - val conf = sparkSession.sessionState.conf - tableCols.find(conf.resolver(_, colName)).getOrElse { + val resolver = sparkSession.sessionState.conf.resolver + tableCols.find(resolver(_, colName)).getOrElse { failAnalysis(s"$colType column $colName is not defined in table $tableIdent, " + s"defined table columns are: ${tableCols.mkString(", ")}") } @@ -209,42 +209,41 @@ case class PreprocessTableInsertion(conf: SQLConf) extends Rule[LogicalPlan] { tblName: String, partColNames: Seq[String]): InsertIntoTable = { - val expectedColumns = insert.expectedColumns - if (expectedColumns.isDefined && expectedColumns.get.length != insert.child.schema.length) { + val normalizedPartSpec = PartitioningUtils.normalizePartitionSpec( + insert.partition, partColNames, tblName, conf.resolver) + + val expectedColumns = { + val staticPartCols = normalizedPartSpec.filter(_._2.isDefined).keySet + insert.table.output.filterNot(a => staticPartCols.contains(a.name)) + } + + if (expectedColumns.length != insert.child.schema.length) { throw new AnalysisException( s"Cannot insert into table $tblName because the number of columns are different: " + - s"need ${expectedColumns.get.length} columns, " + + s"need ${expectedColumns.length} columns, " + s"but query has ${insert.child.schema.length} columns.") } - if (insert.partition.nonEmpty) { - // the query's partitioning must match the table's partitioning - // this is set for 
queries like: insert into ... partition (one = "a", two = ) - val samePartitionColumns = - if (conf.caseSensitiveAnalysis) { - insert.partition.keySet == partColNames.toSet - } else { - insert.partition.keySet.map(_.toLowerCase) == partColNames.map(_.toLowerCase).toSet - } - if (!samePartitionColumns) { + if (normalizedPartSpec.nonEmpty) { + if (normalizedPartSpec.size != partColNames.length) { throw new AnalysisException( s""" |Requested partitioning does not match the table $tblName: - |Requested partitions: ${insert.partition.keys.mkString(",")} + |Requested partitions: ${normalizedPartSpec.keys.mkString(",")} |Table partitions: ${partColNames.mkString(",")} """.stripMargin) } - expectedColumns.map(castAndRenameChildOutput(insert, _)).getOrElse(insert) + + castAndRenameChildOutput(insert.copy(partition = normalizedPartSpec), expectedColumns) } else { - // All partition columns are dynamic because because the InsertIntoTable command does + // All partition columns are dynamic because the InsertIntoTable command does // not explicitly specify partitioning columns. - expectedColumns.map(castAndRenameChildOutput(insert, _)).getOrElse(insert) + castAndRenameChildOutput(insert, expectedColumns) .copy(partition = partColNames.map(_ -> None).toMap) } } - // TODO: do we really need to rename? - def castAndRenameChildOutput( + private def castAndRenameChildOutput( insert: InsertIntoTable, expectedOutput: Seq[Attribute]): InsertIntoTable = { val newChildOutput = expectedOutput.zip(insert.child.output).map { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index d593bfb4ce19a..de326f80f6598 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -926,23 +926,33 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { val catalog = spark.sessionState.catalog val tableIdent = TableIdentifier("tab1", Some("dbx")) createPartitionedTable(tableIdent, isDatasourceTable = false) + + // basic rename partition sql("ALTER TABLE dbx.tab1 PARTITION (a='1', b='q') RENAME TO PARTITION (a='100', b='p')") sql("ALTER TABLE dbx.tab1 PARTITION (a='2', b='c') RENAME TO PARTITION (a='20', b='c')") assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(Map("a" -> "100", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p"))) + // rename without explicitly specifying database catalog.setCurrentDatabase("dbx") sql("ALTER TABLE tab1 PARTITION (a='100', b='p') RENAME TO PARTITION (a='10', b='p')") assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(Map("a" -> "10", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p"))) + // table to alter does not exist intercept[NoSuchTableException] { sql("ALTER TABLE does_not_exist PARTITION (c='3') RENAME TO PARTITION (c='333')") } + // partition to rename does not exist intercept[NoSuchPartitionException] { sql("ALTER TABLE tab1 PARTITION (a='not_found', b='1') RENAME TO PARTITION (a='1', b='2')") } + + // partition spec in RENAME PARTITION should be case insensitive by default + sql("ALTER TABLE tab1 PARTITION (A='10', B='p') RENAME TO PARTITION (A='1', B='p')") + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == + Set(Map("a" -> "1", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p"))) } test("alter table: rename partition 
(datasource table)") { @@ -1334,6 +1344,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { val part2 = Map("a" -> "2", "b" -> "6") val part3 = Map("a" -> "3", "b" -> "7") val part4 = Map("a" -> "4", "b" -> "8") + val part5 = Map("a" -> "9", "b" -> "9") createDatabase(catalog, "dbx") createTable(catalog, tableIdent) createTablePartition(catalog, part1, tableIdent) @@ -1341,6 +1352,8 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { convertToDatasourceTable(catalog, tableIdent) } assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1)) + + // basic add partition maybeWrapException(isDatasourceTable) { sql("ALTER TABLE dbx.tab1 ADD IF NOT EXISTS " + "PARTITION (a='2', b='6') LOCATION 'paris' PARTITION (a='3', b='7')") @@ -1351,6 +1364,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { assert(catalog.getPartition(tableIdent, part2).storage.locationUri == Option("paris")) assert(catalog.getPartition(tableIdent, part3).storage.locationUri.isEmpty) } + // add partitions without explicitly specifying database catalog.setCurrentDatabase("dbx") maybeWrapException(isDatasourceTable) { @@ -1360,14 +1374,18 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2, part3, part4)) } + // table to alter does not exist intercept[AnalysisException] { sql("ALTER TABLE does_not_exist ADD IF NOT EXISTS PARTITION (a='4', b='9')") } + // partition to add already exists intercept[AnalysisException] { sql("ALTER TABLE tab1 ADD PARTITION (a='4', b='8')") } + + // partition to add already exists when using IF NOT EXISTS maybeWrapException(isDatasourceTable) { sql("ALTER TABLE tab1 ADD IF NOT EXISTS PARTITION (a='4', b='8')") } @@ -1375,6 +1393,15 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2, part3, part4)) } + + // partition spec in ADD PARTITION should be case insensitive by default + maybeWrapException(isDatasourceTable) { + sql("ALTER TABLE tab1 ADD PARTITION (A='9', B='9')") + } + if (!isDatasourceTable) { + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == + Set(part1, part2, part3, part4, part5)) + } } private def testDropPartitions(isDatasourceTable: Boolean): Unit = { @@ -1395,12 +1422,15 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { if (isDatasourceTable) { convertToDatasourceTable(catalog, tableIdent) } + + // basic drop partition maybeWrapException(isDatasourceTable) { sql("ALTER TABLE dbx.tab1 DROP IF EXISTS PARTITION (a='4', b='8'), PARTITION (a='3', b='7')") } if (!isDatasourceTable) { assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2)) } + // drop partitions without explicitly specifying database catalog.setCurrentDatabase("dbx") maybeWrapException(isDatasourceTable) { @@ -1409,20 +1439,32 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { if (!isDatasourceTable) { assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1)) } + // table to alter does not exist intercept[AnalysisException] { sql("ALTER TABLE does_not_exist DROP IF EXISTS PARTITION (a='2')") } + // partition to drop does not exist intercept[AnalysisException] { sql("ALTER TABLE tab1 DROP PARTITION (a='300')") } + + // partition to drop does not exist when using IF 
EXISTS maybeWrapException(isDatasourceTable) { sql("ALTER TABLE tab1 DROP IF EXISTS PARTITION (a='300')") } if (!isDatasourceTable) { assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1)) } + + // partition spec in DROP PARTITION should be case insensitive by default + maybeWrapException(isDatasourceTable) { + sql("ALTER TABLE tab1 DROP PARTITION (A='1', B='5')") + } + if (!isDatasourceTable) { + assert(catalog.listPartitions(tableIdent).isEmpty) + } } test("drop build-in function") { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index e745a8c5b3589..8835b266b22a4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -831,6 +831,9 @@ private[hive] class HiveClientImpl( new HivePartition(ht, tpart) } + // TODO (cloud-fan): the column names in partition specification are always lower cased because + // Hive metastore is not case preserving. We should normalize them to the actual column names of + // the table, once we store partition spec of data source tables. private def fromHivePartition(hp: HivePartition): CatalogTablePartition = { val apiPartition = hp.getTPartition CatalogTablePartition( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index d9ce1c3dc18ff..e3ddaf725424d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -370,17 +370,6 @@ class InsertIntoHiveTableSuite extends QueryTest with TestHiveSingleton with Bef assert(cause.getMessage.contains("insertInto() can't be used together with partitionBy().")) } - test("InsertIntoTable#resolved should include dynamic partitions") { - withSQLConf(("hive.exec.dynamic.partition.mode", "nonstrict")) { - sql("CREATE TABLE partitioned (id bigint, data string) PARTITIONED BY (part string)") - val data = (1 to 10).map(i => (i.toLong, s"data-$i")).toDF("id", "data") - - val logical = InsertIntoTable(spark.table("partitioned").logicalPlan, - Map("part" -> None), data.logicalPlan, overwrite = false, ifNotExists = false) - assert(!logical.resolved, "Should not resolve: missing partition data") - } - } - testPartitionedTable( "SPARK-16036: better error message when insert into a table with mismatch schema") { tableName => @@ -409,8 +398,8 @@ class InsertIntoHiveTableSuite extends QueryTest with TestHiveSingleton with Bef sql(s"INSERT INTO TABLE $tableName PARTITION (c=11, b=10) SELECT 9, 12") - // c is defined twice. Parser will complain. - intercept[ParseException] { + // c is defined twice. Analyzer will complain. 
+ intercept[AnalysisException] { sql(s"INSERT INTO TABLE $tableName PARTITION (b=14, c=15, c=16) SELECT 13") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 3d1712e4354c0..e9268a922cf54 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -200,9 +200,8 @@ class HiveDDLSuite val message = intercept[AnalysisException] { sql(s"ALTER TABLE $externalTab DROP PARTITION (ds='2008-04-09', unknownCol='12')") } - assert(message.getMessage.contains( - "Partition spec is invalid. The spec (ds, unknowncol) must be contained within the " + - "partition spec (ds, hr) defined in table '`default`.`exttable_with_partitions`'")) + assert(message.getMessage.contains("unknownCol is not a valid partition column in table " + + "`default`.`exttable_with_partitions`")) sql( s""" From 38cdd6ccdaba7f8da985c4f4efe5bd93a46a2b53 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 25 Oct 2016 03:19:50 -0700 Subject: [PATCH 110/162] [SPARK-14634][ML][FOLLOWUP] Delete superfluous line in BisectingKMeans ## What changes were proposed in this pull request? As commented by jkbradley in https://github.com/apache/spark/pull/12394, `model.setSummary(summary)` is superfluous ## How was this patch tested? existing tests Author: Zheng RuiFeng Closes #15619 from zhengruifeng/del_superfluous. --- .../org/apache/spark/ml/clustering/BisectingKMeans.scala | 5 ++--- .../main/scala/org/apache/spark/ml/clustering/KMeans.scala | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index add8ee2a4ff8e..ef2d918ea3542 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -265,9 +265,8 @@ class BisectingKMeans @Since("2.0.0") ( val summary = new BisectingKMeansSummary( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) model.setSummary(summary) - val m = model.setSummary(summary) - instr.logSuccess(m) - m + instr.logSuccess(model) + model } @Since("2.0.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index b04e82838e714..0d2405b50068e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -324,9 +324,9 @@ class KMeans @Since("1.5.0") ( val model = copyValues(new KMeansModel(uid, parentModel).setParent(this)) val summary = new KMeansSummary( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) - val m = model.setSummary(summary) - instr.logSuccess(m) - m + model.setSummary(summary) + instr.logSuccess(model) + model } @Since("1.5.0") From ac8ff920faec6ee06e17212e2b5d2ee117495e87 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 25 Oct 2016 10:22:02 -0700 Subject: [PATCH 111/162] [SPARK-17748][FOLLOW-UP][ML] Fix build error for Scala 2.10. ## What changes were proposed in this pull request? #15394 introduced build error for Scala 2.10, this PR fix it. ## How was this patch tested? Existing test. Author: Yanbo Liang Closes #15625 from yanboliang/spark-17748-scala. 
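
For readers hitting the same build break, a minimal sketch of the call style the patch settles on: keep loop-variable names distinct from the constructor parameters they feed and pass the remaining arguments by name. The `Solver` class below is a stand-in with an assumed parameter list, not Spark's `WeightedLeastSquares`.

```scala
// Toy constructor whose parameter names mirror the test; the class itself is illustrative only.
class Solver(
    val fitIntercept: Boolean,
    val regParam: Double,
    val elasticNetParam: Double,
    val standardizeFeatures: Boolean,
    val standardizeLabel: Boolean)

object NamedArgumentStyle {
  def main(args: Array[String]): Unit = {
    // Loop variable named `standardization`, distinct from the parameter it feeds,
    // with the trailing arguments passed by name, matching the style the patch adopts.
    for (standardization <- Seq(false, true)) {
      val wls = new Solver(
        fitIntercept = true,
        regParam = 0.5,
        elasticNetParam = 0.0,
        standardizeFeatures = standardization,
        standardizeLabel = true)
      println(wls.standardizeFeatures)
    }
  }
}
```
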
--- .../spark/ml/optim/WeightedLeastSquaresSuite.scala | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala index 5f638b4880058..3cdab0327991e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala @@ -280,7 +280,7 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext } // Cholesky also fails when regularization is added but we don't wish to standardize - val wls = new WeightedLeastSquares(true, regParam = 0.5, elasticNetParam = 0.0, + val wls = new WeightedLeastSquares(fitIntercept = true, regParam = 0.5, elasticNetParam = 0.0, standardizeFeatures = false, standardizeLabel = false, solverType = WeightedLeastSquares.Cholesky) intercept[SingularMatrixException] { @@ -470,10 +470,11 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext var idx = 0 for (fitIntercept <- Seq(false, true); regParam <- Seq(0.1, 0.5, 1.0); - standardizeFeatures <- Seq(false, true); + standardization <- Seq(false, true); elasticNetParam <- Seq(0.1, 0.5, 1.0)) { - val wls = new WeightedLeastSquares(fitIntercept, regParam, elasticNetParam = elasticNetParam, - standardizeFeatures, standardizeLabel = true, solverType = WeightedLeastSquares.Auto) + val wls = new WeightedLeastSquares(fitIntercept, regParam, elasticNetParam, + standardizeFeatures = standardization, standardizeLabel = true, + solverType = WeightedLeastSquares.Auto) .fit(instances) val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) assert(actual ~== expected(idx) absTol 1e-4) @@ -528,10 +529,10 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext var idx = 0 for (fitIntercept <- Seq(false, true); regParam <- Seq(0.0, 0.1, 1.0); - standardizeFeatures <- Seq(false, true)) { + standardization <- Seq(false, true)) { for (solver <- WeightedLeastSquares.supportedSolvers) { val wls = new WeightedLeastSquares(fitIntercept, regParam, elasticNetParam = 0.0, - standardizeFeatures, standardizeLabel = true, solverType = solver) + standardizeFeatures = standardization, standardizeLabel = true, solverType = solver) .fit(instances) val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) assert(actual ~== expected(idx) absTol 1e-4) From c5fe3dd4f59c464c830b414acccd3cca0fdd877c Mon Sep 17 00:00:00 2001 From: Vinayak Date: Tue, 25 Oct 2016 10:36:03 -0700 Subject: [PATCH 112/162] [SPARK-18010][CORE] Reduce work performed for building up the application list for the History Server app list UI page ## What changes were proposed in this pull request? allow ReplayListenerBus to skip deserialising and replaying certain events using an inexpensive check of the event log entry. Use this to ensure that when event log replay is triggered for building the application list, we get the ReplayListenerBus to skip over all but the few events needed for our immediate purpose. Refer [SPARK-18010] for the motivation behind this change. ## How was this patch tested? Tested with existing HistoryServer and ReplayListener unit test suites. All tests pass. Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. Author: Vinayak Closes #15556 from vijoshi/SAAS-467_master. 
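
A rough sketch of the filtering idea: decide from the raw JSON line whether an event is worth deserializing at all, using the same two prefixes the patch installs for the listing path. Only the two prefix strings come from the patch; the rest is a toy driver, not Spark's `ReplayListenerBus`.

```scala
object ReplayFilterSketch {
  // Same shape as the patch's ReplayEventsFilter type alias: a predicate on the raw line.
  type EventsFilter = String => Boolean

  // Prefixes taken from the patch; only application start/end events survive.
  val appStartOrEndOnly: EventsFilter = { line =>
    line.startsWith("{\"Event\":\"SparkListenerApplicationStart\"") ||
      line.startsWith("{\"Event\":\"SparkListenerApplicationEnd\"")
  }

  def main(args: Array[String]): Unit = {
    val eventLog = Seq(
      """{"Event":"SparkListenerApplicationStart","App Name":"demo"}""",
      """{"Event":"SparkListenerTaskEnd","Task Info":{}}""",
      """{"Event":"SparkListenerApplicationEnd","Timestamp":1}""")
    // The expensive step (JSON parsing plus posting to listeners) never runs for the
    // filtered-out task event.
    eventLog.filter(appStartOrEndOnly).foreach(println)
  }
}
```
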
--- .../deploy/history/FsHistoryProvider.scala | 120 ++++++++++-------- .../spark/scheduler/ReplayListenerBus.scala | 39 +++++- 2 files changed, 101 insertions(+), 58 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 530cc5252214b..dfc1aad64c818 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -36,6 +36,7 @@ import org.apache.spark.{SecurityManager, SparkConf, SparkException} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.scheduler._ +import org.apache.spark.scheduler.ReplayListenerBus._ import org.apache.spark.ui.SparkUI import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} @@ -78,10 +79,6 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) import FsHistoryProvider._ - private val NOT_STARTED = "" - - private val SPARK_HISTORY_FS_NUM_REPLAY_THREADS = "spark.history.fs.numReplayThreads" - // Interval between safemode checks. private val SAFEMODE_CHECK_INTERVAL_S = conf.getTimeAsSeconds( "spark.history.fs.safemodeCheck.interval", "5s") @@ -241,11 +238,12 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) HistoryServer.getAttemptURI(appId, attempt.attemptId), attempt.startTime) // Do not call ui.bind() to avoid creating a new server for each application } - val appListener = new ApplicationEventListener() - replayBus.addListener(appListener) - val appAttemptInfo = replay(fs.getFileStatus(new Path(logDir, attempt.logPath)), - replayBus) - appAttemptInfo.map { info => + + val fileStatus = fs.getFileStatus(new Path(logDir, attempt.logPath)) + + val appListener = replay(fileStatus, isApplicationCompleted(fileStatus), replayBus) + + if (appListener.appId.isDefined) { val uiAclsEnabled = conf.getBoolean("spark.history.ui.acls.enable", false) ui.getSecurityManager.setAcls(uiAclsEnabled) // make sure to set admin acls before view acls so they are properly picked up @@ -254,8 +252,11 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) appListener.viewAcls.getOrElse("")) ui.getSecurityManager.setAdminAclsGroups(appListener.adminAclsGroups.getOrElse("")) ui.getSecurityManager.setViewAclsGroups(appListener.viewAclsGroups.getOrElse("")) - LoadedAppUI(ui, updateProbe(appId, attemptId, attempt.fileSize)) + Some(LoadedAppUI(ui, updateProbe(appId, attemptId, attempt.fileSize))) + } else { + None } + } } } catch { @@ -411,28 +412,54 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } } - /** * Replay the log files in the list and merge the list of old applications with new ones */ private def mergeApplicationListing(fileStatus: FileStatus): Unit = { val newAttempts = try { - val bus = new ReplayListenerBus() - val res = replay(fileStatus, bus) - res match { - case Some(r) => logDebug(s"Application log ${r.logPath} loaded successfully: $r") - case None => logWarning(s"Failed to load application log ${fileStatus.getPath}. 
" + - "The application may have not started.") - } - res - } catch { - case e: Exception => - logError( - s"Exception encountered when attempting to load application log ${fileStatus.getPath}", - e) - None + val eventsFilter: ReplayEventsFilter = { eventString => + eventString.startsWith(APPL_START_EVENT_PREFIX) || + eventString.startsWith(APPL_END_EVENT_PREFIX) + } + + val logPath = fileStatus.getPath() + + val appCompleted = isApplicationCompleted(fileStatus) + + val appListener = replay(fileStatus, appCompleted, new ReplayListenerBus(), eventsFilter) + + // Without an app ID, new logs will render incorrectly in the listing page, so do not list or + // try to show their UI. + if (appListener.appId.isDefined) { + val attemptInfo = new FsApplicationAttemptInfo( + logPath.getName(), + appListener.appName.getOrElse(NOT_STARTED), + appListener.appId.getOrElse(logPath.getName()), + appListener.appAttemptId, + appListener.startTime.getOrElse(-1L), + appListener.endTime.getOrElse(-1L), + fileStatus.getModificationTime(), + appListener.sparkUser.getOrElse(NOT_STARTED), + appCompleted, + fileStatus.getLen() + ) + fileToAppInfo(logPath) = attemptInfo + logDebug(s"Application log ${attemptInfo.logPath} loaded successfully: $attemptInfo") + Some(attemptInfo) + } else { + logWarning(s"Failed to load application log ${fileStatus.getPath}. " + + "The application may have not started.") + None } + } catch { + case e: Exception => + logError( + s"Exception encountered when attempting to load application log ${fileStatus.getPath}", + e) + None + } + if (newAttempts.isEmpty) { return } @@ -564,12 +591,16 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } /** - * Replays the events in the specified log file and returns information about the associated - * application. Return `None` if the application ID cannot be located. + * Replays the events in the specified log file on the supplied `ReplayListenerBus`. Returns + * an `ApplicationEventListener` instance with event data captured from the replay. + * `ReplayEventsFilter` determines what events are replayed and can therefore limit the + * data captured in the returned `ApplicationEventListener` instance. */ private def replay( eventLog: FileStatus, - bus: ReplayListenerBus): Option[FsApplicationAttemptInfo] = { + appCompleted: Boolean, + bus: ReplayListenerBus, + eventsFilter: ReplayEventsFilter = SELECT_ALL_FILTER): ApplicationEventListener = { val logPath = eventLog.getPath() logInfo(s"Replaying log path: $logPath") // Note that the eventLog may have *increased* in size since when we grabbed the filestatus, @@ -581,30 +612,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val logInput = EventLoggingListener.openEventLog(logPath, fs) try { val appListener = new ApplicationEventListener - val appCompleted = isApplicationCompleted(eventLog) bus.addListener(appListener) - bus.replay(logInput, logPath.toString, !appCompleted) - - // Without an app ID, new logs will render incorrectly in the listing page, so do not list or - // try to show their UI. 
- if (appListener.appId.isDefined) { - val attemptInfo = new FsApplicationAttemptInfo( - logPath.getName(), - appListener.appName.getOrElse(NOT_STARTED), - appListener.appId.getOrElse(logPath.getName()), - appListener.appAttemptId, - appListener.startTime.getOrElse(-1L), - appListener.endTime.getOrElse(-1L), - eventLog.getModificationTime(), - appListener.sparkUser.getOrElse(NOT_STARTED), - appCompleted, - eventLog.getLen() - ) - fileToAppInfo(logPath) = attemptInfo - Some(attemptInfo) - } else { - None - } + bus.replay(logInput, logPath.toString, !appCompleted, eventsFilter) + appListener } finally { logInput.close() } @@ -689,6 +699,14 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) private[history] object FsHistoryProvider { val DEFAULT_LOG_DIR = "file:/tmp/spark-events" + + private val NOT_STARTED = "" + + private val SPARK_HISTORY_FS_NUM_REPLAY_THREADS = "spark.history.fs.numReplayThreads" + + private val APPL_START_EVENT_PREFIX = "{\"Event\":\"SparkListenerApplicationStart\"" + + private val APPL_END_EVENT_PREFIX = "{\"Event\":\"SparkListenerApplicationEnd\"" } /** diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala index d32f5eb7bfe92..3eff8d952bfd6 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala @@ -25,6 +25,7 @@ import com.fasterxml.jackson.core.JsonParseException import org.json4s.jackson.JsonMethods._ import org.apache.spark.internal.Logging +import org.apache.spark.scheduler.ReplayListenerBus._ import org.apache.spark.util.JsonProtocol /** @@ -43,30 +44,45 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { * @param sourceName Filename (or other source identifier) from whence @logData is being read * @param maybeTruncated Indicate whether log file might be truncated (some abnormal situations * encountered, log file might not finished writing) or not + * @param eventsFilter Filter function to select JSON event strings in the log data stream that + * should be parsed and replayed. When not specified, all event strings in the log data + * are parsed and replayed. 
*/ def replay( logData: InputStream, sourceName: String, - maybeTruncated: Boolean = false): Unit = { + maybeTruncated: Boolean = false, + eventsFilter: ReplayEventsFilter = SELECT_ALL_FILTER): Unit = { + var currentLine: String = null - var lineNumber: Int = 1 + var lineNumber: Int = 0 + try { - val lines = Source.fromInputStream(logData).getLines() - while (lines.hasNext) { - currentLine = lines.next() + val lineEntries = Source.fromInputStream(logData) + .getLines() + .zipWithIndex + .filter { case (line, _) => eventsFilter(line) } + + while (lineEntries.hasNext) { try { + val entry = lineEntries.next() + + currentLine = entry._1 + lineNumber = entry._2 + 1 + postToAll(JsonProtocol.sparkEventFromJson(parse(currentLine))) } catch { case jpe: JsonParseException => // We can only ignore exception from last line of the file that might be truncated - if (!maybeTruncated || lines.hasNext) { + // the last entry may not be the very last line in the event log, but we treat it + // as such in a best effort to replay the given input + if (!maybeTruncated || lineEntries.hasNext) { throw jpe } else { logWarning(s"Got JsonParseException from log file $sourceName" + s" at line $lineNumber, the file might not have finished writing cleanly.") } } - lineNumber += 1 } } catch { case ioe: IOException => @@ -78,3 +94,12 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { } } + + +private[spark] object ReplayListenerBus { + + type ReplayEventsFilter = (String) => Boolean + + // utility filter that selects all event logs during replay + val SELECT_ALL_FILTER: ReplayEventsFilter = { (eventString: String) => true } +} From a21791e3164f4e6546fbe0a90017a4394a05deb1 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 25 Oct 2016 12:08:17 -0700 Subject: [PATCH 113/162] [SPARK-18070][SQL] binary operator should not consider nullability when comparing input types ## What changes were proposed in this pull request? Binary operator requires its inputs to be of same type, but it should not consider nullability, e.g. `EqualTo` should be able to compare an element-nullable array and an element-non-nullable array. ## How was this patch tested? a regression test in `DataFrameSuite` Author: Wenchen Fan Closes #15606 from cloud-fan/type-bug. --- .../spark/sql/catalyst/expressions/Expression.scala | 2 +- .../test/scala/org/apache/spark/sql/DataFrameSuite.scala | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index fa1a2ad56ccb3..9edc1ceff26a7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -511,7 +511,7 @@ abstract class BinaryOperator extends BinaryExpression with ExpectsInputTypes { override def checkInputDataTypes(): TypeCheckResult = { // First check whether left and right have the same type, then check if the type is acceptable. 
- if (left.dataType != right.dataType) { + if (!left.dataType.sameType(right.dataType)) { TypeCheckResult.TypeCheckFailure(s"differing types in '$sql' " + s"(${left.dataType.simpleString} and ${right.dataType.simpleString}).") } else if (!inputType.acceptsType(left.dataType)) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 3fb7eeefba67f..33b3b78c9f04f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1649,4 +1649,13 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { dates.except(widenTypedRows).collect() dates.intersect(widenTypedRows).collect() } + + test("SPARK-18070 binary operator should not consider nullability when comparing input types") { + val rows = Seq(Row(Seq(1), Seq(1))) + val schema = new StructType() + .add("array1", ArrayType(IntegerType)) + .add("array2", ArrayType(IntegerType, containsNull = false)) + val df = spark.createDataFrame(spark.sparkContext.makeRDD(rows), schema) + assert(df.filter($"array1" === $"array2").count() == 1) + } } From 2c7394ad096201cd721be7f532da9d97028cc577 Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 25 Oct 2016 13:11:21 -0700 Subject: [PATCH 114/162] [SPARK-18019][ML] Add instrumentation to GBTs ## What changes were proposed in this pull request? Add instrumentation for logging in ML GBT, part of umbrella ticket [SPARK-14567](https://issues.apache.org/jira/browse/SPARK-14567) ## How was this patch tested? Tested locally: ```` 16/10/20 10:24:51 INFO Instrumentation: GBTRegressor-gbtr_2b460d3e2e93-1207021668-45: training: numPartitions=1 storageLevel=StorageLevel(1 replicas) 16/10/20 10:24:51 INFO Instrumentation: GBTRegressor-gbtr_2b460d3e2e93-1207021668-45: {"maxIter":1} 16/10/20 10:24:51 INFO Instrumentation: GBTRegressor-gbtr_2b460d3e2e93-1207021668-45: {"numFeatures":2} 16/10/20 10:24:51 INFO Instrumentation: GBTRegressor-gbtr_2b460d3e2e93-1207021668-45: {"numClasses":0} ... 16/10/20 15:54:21 INFO Instrumentation: GBTRegressor-gbtr_065fad465377-1922077832-22: training finished ```` Author: sethah Closes #15574 from sethah/gbt_instr. 
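
Reduced to a toy, the change in both trainers follows one pattern: create the instrumentation up front, log the parameters and data dimensions, then log success on the fitted model before returning it. `ToyInstrumentation` below is an invented stand-in; Spark's `Instrumentation` is an internal ML helper with a different constructor.

```scala
// Stand-in for Spark's private ml.util.Instrumentation; only the call pattern matters here.
final class ToyInstrumentation(prefix: String) {
  def logNumFeatures(n: Int): Unit = println(s"$prefix: numFeatures=$n")
  def logNumClasses(n: Int): Unit  = println(s"$prefix: numClasses=$n")
  def logSuccess(model: Any): Unit = println(s"$prefix: training finished -> $model")
}

object GbtInstrumentationSketch {
  final case class Model(numTrees: Int)

  def train(numFeatures: Int): Model = {
    val instr = new ToyInstrumentation("GBTClassifier-demo")
    instr.logNumFeatures(numFeatures)
    instr.logNumClasses(2)             // GBT classification is binary only
    val model = Model(numTrees = 20)   // stand-in for GradientBoostedTrees.run(...)
    instr.logSuccess(model)            // log success on the model before returning it
    model
  }

  def main(args: Array[String]): Unit = println(train(numFeatures = 2))
}
```
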
--- .../apache/spark/ml/classification/GBTClassifier.scala | 10 +++++++++- .../org/apache/spark/ml/regression/GBTRegressor.scala | 9 ++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index ba70293273f94..8bffe0cda0327 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -137,9 +137,17 @@ class GBTClassifier @Since("1.4.0") ( } val numFeatures = oldDataset.first().features.size val boostingStrategy = super.getOldBoostingStrategy(categoricalFeatures, OldAlgo.Classification) + + val instr = Instrumentation.create(this, oldDataset) + instr.logParams(params: _*) + instr.logNumFeatures(numFeatures) + instr.logNumClasses(2) + val (baseLearners, learnerWeights) = GradientBoostedTrees.run(oldDataset, boostingStrategy, $(seed)) - new GBTClassificationModel(uid, baseLearners, learnerWeights, numFeatures) + val m = new GBTClassificationModel(uid, baseLearners, learnerWeights, numFeatures) + instr.logSuccess(m) + m } @Since("1.4.1") diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index bb01f9d5a364c..fa69d60836e68 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -123,9 +123,16 @@ class GBTRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: String) val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset) val numFeatures = oldDataset.first().features.size val boostingStrategy = super.getOldBoostingStrategy(categoricalFeatures, OldAlgo.Regression) + + val instr = Instrumentation.create(this, oldDataset) + instr.logParams(params: _*) + instr.logNumFeatures(numFeatures) + val (baseLearners, learnerWeights) = GradientBoostedTrees.run(oldDataset, boostingStrategy, $(seed)) - new GBTRegressionModel(uid, baseLearners, learnerWeights, numFeatures) + val m = new GBTRegressionModel(uid, baseLearners, learnerWeights, numFeatures) + instr.logSuccess(m) + m } @Since("1.4.0") From c329a568b58d65c492a43926bf0f588f2ae6a66e Mon Sep 17 00:00:00 2001 From: hayashidac Date: Wed, 26 Oct 2016 07:13:48 +0900 Subject: [PATCH 115/162] [SPARK-16988][SPARK SHELL] spark history server log needs to be fixed to show https url when ssl is enabled spark history server log needs to be fixed to show https url when ssl is enabled Author: chie8842 Closes #15611 from hayashidac/SPARK-16988. --- core/src/main/scala/org/apache/spark/ui/WebUI.scala | 5 ++++- .../test/scala/org/apache/spark/SSLOptionsSuite.scala | 10 +++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/WebUI.scala b/core/src/main/scala/org/apache/spark/ui/WebUI.scala index 4118fcf46b428..a05e0efb7a3e3 100644 --- a/core/src/main/scala/org/apache/spark/ui/WebUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/WebUI.scala @@ -147,7 +147,10 @@ private[spark] abstract class WebUI( } /** Return the url of web interface. Only valid after bind(). 
*/ - def webUrl: String = s"http://$publicHostName:$boundPort" + def webUrl: String = { + val protocol = if (sslOptions.enabled) "https" else "http" + s"$protocol://$publicHostName:$boundPort" + } /** Return the actual port to which this server is bound. Only valid after bind(). */ def boundPort: Int = serverInfo.map(_.boundPort).getOrElse(-1) diff --git a/core/src/test/scala/org/apache/spark/SSLOptionsSuite.scala b/core/src/test/scala/org/apache/spark/SSLOptionsSuite.scala index 159b448e05b02..2b8b1805bc83f 100644 --- a/core/src/test/scala/org/apache/spark/SSLOptionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/SSLOptionsSuite.scala @@ -79,7 +79,7 @@ class SSLOptionsSuite extends SparkFunSuite with BeforeAndAfterAll { conf.set("spark.ssl.protocol", "SSLv3") val defaultOpts = SSLOptions.parse(conf, "spark.ssl", defaults = None) - val opts = SSLOptions.parse(conf, "spark.ui.ssl", defaults = Some(defaultOpts)) + val opts = SSLOptions.parse(conf, "spark.ssl.ui", defaults = Some(defaultOpts)) assert(opts.enabled === true) assert(opts.trustStore.isDefined === true) @@ -102,20 +102,20 @@ class SSLOptionsSuite extends SparkFunSuite with BeforeAndAfterAll { val conf = new SparkConf conf.set("spark.ssl.enabled", "true") - conf.set("spark.ui.ssl.enabled", "false") + conf.set("spark.ssl.ui.enabled", "false") conf.set("spark.ssl.keyStore", keyStorePath) conf.set("spark.ssl.keyStorePassword", "password") - conf.set("spark.ui.ssl.keyStorePassword", "12345") + conf.set("spark.ssl.ui.keyStorePassword", "12345") conf.set("spark.ssl.keyPassword", "password") conf.set("spark.ssl.trustStore", trustStorePath) conf.set("spark.ssl.trustStorePassword", "password") conf.set("spark.ssl.enabledAlgorithms", "TLS_RSA_WITH_AES_128_CBC_SHA, TLS_RSA_WITH_AES_256_CBC_SHA") - conf.set("spark.ui.ssl.enabledAlgorithms", "ABC, DEF") + conf.set("spark.ssl.ui.enabledAlgorithms", "ABC, DEF") conf.set("spark.ssl.protocol", "SSLv3") val defaultOpts = SSLOptions.parse(conf, "spark.ssl", defaults = None) - val opts = SSLOptions.parse(conf, "spark.ui.ssl", defaults = Some(defaultOpts)) + val opts = SSLOptions.parse(conf, "spark.ssl.ui", defaults = Some(defaultOpts)) assert(opts.enabled === false) assert(opts.trustStore.isDefined === true) From 12b3e8d2e02788c3bebfecdd69755e94d80011c9 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Tue, 25 Oct 2016 21:42:59 -0700 Subject: [PATCH 116/162] [SPARK-18007][SPARKR][ML] update SparkR MLP - add initalWeights parameter ## What changes were proposed in this pull request? update SparkR MLP, add initalWeights parameter. ## How was this patch tested? test added. Author: WeichenXu Closes #15552 from WeichenXu123/mlp_r_add_initialWeight_param. --- R/pkg/R/mllib.R | 14 ++++++++++---- R/pkg/inst/tests/testthat/test_mllib.R | 15 +++++++++++++++ .../r/MultilayerPerceptronClassifierWrapper.scala | 9 ++++++++- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b901307f8f409..bf182be8e23d0 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -665,6 +665,8 @@ setMethod("predict", signature(object = "KMeansModel"), #' @param tol convergence tolerance of iterations. #' @param stepSize stepSize parameter. #' @param seed seed parameter for weights initialization. +#' @param initialWeights initialWeights parameter for weights initialization, it should be a +#' numeric vector. #' @param ... additional arguments passed to the method. #' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model. 
#' @rdname spark.mlp @@ -677,8 +679,9 @@ setMethod("predict", signature(object = "KMeansModel"), #' df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm") #' #' # fit a Multilayer Perceptron Classification Model -#' model <- spark.mlp(df, blockSize = 128, layers = c(4, 5, 4, 3), solver = "l-bfgs", -#' maxIter = 100, tol = 0.5, stepSize = 1, seed = 1) +#' model <- spark.mlp(df, blockSize = 128, layers = c(4, 3), solver = "l-bfgs", +#' maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, +#' initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9)) #' #' # get the summary of the model #' summary(model) @@ -695,7 +698,7 @@ setMethod("predict", signature(object = "KMeansModel"), #' @note spark.mlp since 2.1.0 setMethod("spark.mlp", signature(data = "SparkDataFrame"), function(data, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100, - tol = 1E-6, stepSize = 0.03, seed = NULL) { + tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL) { if (is.null(layers)) { stop ("layers must be a integer vector with length > 1.") } @@ -706,10 +709,13 @@ setMethod("spark.mlp", signature(data = "SparkDataFrame"), if (!is.null(seed)) { seed <- as.character(as.integer(seed)) } + if (!is.null(initialWeights)) { + initialWeights <- as.array(as.numeric(na.omit(initialWeights))) + } jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", "fit", data@sdf, as.integer(blockSize), as.array(layers), as.character(solver), as.integer(maxIter), as.numeric(tol), - as.numeric(stepSize), seed) + as.numeric(stepSize), seed, initialWeights) new("MultilayerPerceptronClassificationModel", jobj = jobj) }) diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index c99315726a22c..33cc069f14456 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -410,6 +410,21 @@ test_that("spark.mlp", { model <- spark.mlp(df, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10) mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 2, 1, 2, 2, 1, 0, 0, 1)) + + # test initialWeights + model <- spark.mlp(df, layers = c(4, 3), maxIter = 2, initialWeights = + c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9)) + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) + expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1)) + + model <- spark.mlp(df, layers = c(4, 3), maxIter = 2, initialWeights = + c(0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 5.0, 5.0, 5.0, 9.0, 9.0, 9.0, 9.0, 9.0)) + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) + expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1)) + + model <- spark.mlp(df, layers = c(4, 3), maxIter = 2) + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) + expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 0, 2, 1, 0, 0, 1)) }) test_that("spark.naiveBayes", { diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/MultilayerPerceptronClassifierWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/MultilayerPerceptronClassifierWrapper.scala index 10673003534e6..2193eb80e9fdd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/MultilayerPerceptronClassifierWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/MultilayerPerceptronClassifierWrapper.scala @@ -24,6 +24,7 @@ import org.json4s.jackson.JsonMethods._ 
import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier} +import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} @@ -58,7 +59,8 @@ private[r] object MultilayerPerceptronClassifierWrapper maxIter: Int, tol: Double, stepSize: Double, - seed: String + seed: String, + initialWeights: Array[Double] ): MultilayerPerceptronClassifierWrapper = { // get labels and feature names from output schema val schema = data.schema @@ -73,6 +75,11 @@ private[r] object MultilayerPerceptronClassifierWrapper .setStepSize(stepSize) .setPredictionCol(PREDICTED_LABEL_COL) if (seed != null && seed.length > 0) mlp.setSeed(seed.toInt) + if (initialWeights != null) { + require(initialWeights.length > 0) + mlp.setInitialWeights(Vectors.dense(initialWeights)) + } + val pipeline = new Pipeline() .setStages(Array(mlp)) .fit(data) From 93b8ad184aa3634f340d43a8bdf99836ef3d4f6c Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 26 Oct 2016 00:38:34 -0700 Subject: [PATCH 117/162] [SPARK-17693][SQL] Fixed Insert Failure To Data Source Tables when the Schema has the Comment Field ### What changes were proposed in this pull request? ```SQL CREATE TABLE tab1(col1 int COMMENT 'a', col2 int) USING parquet INSERT INTO TABLE tab1 SELECT 1, 2 ``` The insert attempt will fail if the target table has a column with comments. The error is strange to the external users: ``` assertion failed: No plan for InsertIntoTable Relation[col1#15,col2#16] parquet, false, false +- Project [1 AS col1#19, 2 AS col2#20] +- OneRowRelation$ ``` This PR is to fix the above bug by checking the metadata when comparing the schema between the table and the query. If not matched, we also copy the metadata. This is an alternative to https://github.com/apache/spark/pull/15266 ### How was this patch tested? Added a test case Author: gatorsmile Closes #15615 from gatorsmile/insertDataSourceTableWithCommentSolution2. 
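
The repro from the description, wrapped into a self-contained driver so the before/after behaviour can be checked end to end (master and app name are arbitrary):

```scala
import org.apache.spark.sql.SparkSession

object CommentedColumnInsert {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("spark-17693-repro")
      .getOrCreate()
    // A data source table whose first column carries a comment.
    spark.sql("CREATE TABLE tab1(col1 int COMMENT 'a', col2 int) USING parquet")
    // Failed before the fix with "assertion failed: No plan for InsertIntoTable ...";
    // with the metadata copied onto the output attributes it plans and runs normally.
    spark.sql("INSERT INTO TABLE tab1 SELECT 1, 2")
    spark.sql("SELECT col1, col2 FROM tab1").show()   // 1, 2
    spark.stop()
  }
}
```
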
--- .../sql/execution/datasources/rules.scala | 10 ++++- .../spark/sql/sources/InsertSuite.scala | 42 +++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index cf501cdc919ee..4647b11af4dfb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -248,10 +248,16 @@ case class PreprocessTableInsertion(conf: SQLConf) extends Rule[LogicalPlan] { expectedOutput: Seq[Attribute]): InsertIntoTable = { val newChildOutput = expectedOutput.zip(insert.child.output).map { case (expected, actual) => - if (expected.dataType.sameType(actual.dataType) && expected.name == actual.name) { + if (expected.dataType.sameType(actual.dataType) && + expected.name == actual.name && + expected.metadata == actual.metadata) { actual } else { - Alias(Cast(actual, expected.dataType), expected.name)() + // Renaming is needed for handling the following cases like + // 1) Column names/types do not match, e.g., INSERT INTO TABLE tab1 SELECT 1, 2 + // 2) Target tables have column metadata + Alias(Cast(actual, expected.dataType), expected.name)( + explicitMetadata = Option(expected.metadata)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 5eb54643f204f..4a85b5975ea53 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -185,6 +185,48 @@ class InsertSuite extends DataSourceTest with SharedSQLContext { ) } + test("INSERT INTO TABLE with Comment in columns") { + val tabName = "tab1" + withTable(tabName) { + sql( + s""" + |CREATE TABLE $tabName(col1 int COMMENT 'a', col2 int) + |USING parquet + """.stripMargin) + sql(s"INSERT INTO TABLE $tabName SELECT 1, 2") + + checkAnswer( + sql(s"SELECT col1, col2 FROM $tabName"), + Row(1, 2) :: Nil + ) + } + } + + test("INSERT INTO TABLE - complex type but different names") { + val tab1 = "tab1" + val tab2 = "tab2" + withTable(tab1, tab2) { + sql( + s""" + |CREATE TABLE $tab1 (s struct) + |USING parquet + """.stripMargin) + sql(s"INSERT INTO TABLE $tab1 SELECT named_struct('col1','1','col2','2')") + + sql( + s""" + |CREATE TABLE $tab2 (p struct) + |USING parquet + """.stripMargin) + sql(s"INSERT INTO TABLE $tab2 SELECT * FROM $tab1") + + checkAnswer( + spark.table(tab1), + spark.table(tab2) + ) + } + } + test("it is not allowed to write to a table while querying it.") { val message = intercept[AnalysisException] { sql( From 6c7d094ec4d45a05c1ec8a418e507e45f5a88b7d Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 26 Oct 2016 14:19:40 +0200 Subject: [PATCH 118/162] [SPARK-18022][SQL] java.lang.NullPointerException instead of real exception when saving DF to MySQL ## What changes were proposed in this pull request? On null next exception in JDBC, don't init it as cause or suppressed ## How was this patch tested? Existing tests Author: Sean Owen Closes #15599 from srowen/SPARK-18022. 
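
The guarded chaining extracted into a standalone helper; only the null checks mirror the `JdbcUtils` change, and the helper name and demo exceptions are made up:

```scala
import java.sql.SQLException

object ChainNextException {
  // Attach getNextException only when it is actually present, so a driver that sets
  // no "next" exception no longer triggers an NPE that hides the real error.
  def enrich(e: SQLException): SQLException = {
    val cause = e.getNextException
    if (cause != null && e.getCause != cause) {
      // If a cause is already set, this sketch records the next exception as suppressed
      // so it still surfaces in the stack trace.
      if (e.getCause == null) e.initCause(cause) else e.addSuppressed(cause)
    }
    e
  }

  def main(args: Array[String]): Unit = {
    val plain = new SQLException("syntax error")            // no next exception set
    val chained = new SQLException("batch insert failed")
    chained.setNextException(new SQLException("row 3 violates NOT NULL"))
    println(enrich(plain).getCause)    // null, and no NPE while enriching
    println(enrich(chained).getCause)  // java.sql.SQLException: row 3 violates NOT NULL
  }
}
```
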
--- .../apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index e32db73bd6c6a..41edb6511c2ce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -607,7 +607,7 @@ object JdbcUtils extends Logging { } catch { case e: SQLException => val cause = e.getNextException - if (e.getCause != cause) { + if (cause != null && e.getCause != cause) { if (e.getCause == null) { e.initCause(cause) } else { From 297813647508480d7b4b5bccd02b93b8b914301f Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 26 Oct 2016 14:23:11 +0200 Subject: [PATCH 119/162] [SPARK-18027][YARN] .sparkStaging not clean on RM ApplicationNotFoundException ## What changes were proposed in this pull request? Cleanup YARN staging dir on all `KILLED`/`FAILED` paths in `monitorApplication` ## How was this patch tested? Existing tests Author: Sean Owen Closes #15598 from srowen/SPARK-18027. --- yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 6e4f68c74c365..55e4a833b6707 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -1059,9 +1059,11 @@ private[spark] class Client( } catch { case e: ApplicationNotFoundException => logError(s"Application $appId not found.") + cleanupStagingDir(appId) return (YarnApplicationState.KILLED, FinalApplicationStatus.KILLED) case NonFatal(e) => logError(s"Failed to contact YARN for application $appId.", e) + // Don't necessarily clean up staging dir because status is unknown return (YarnApplicationState.FAILED, FinalApplicationStatus.FAILED) } val state = report.getYarnApplicationState From 5d0f81da49e86ee93ecf679a20d024ea2cb8b3d3 Mon Sep 17 00:00:00 2001 From: Alex Bozarth Date: Wed, 26 Oct 2016 14:26:54 +0200 Subject: [PATCH 120/162] [SPARK-4411][WEB UI] Add "kill" link for jobs in the UI ## What changes were proposed in this pull request? Currently users can kill stages via the web ui but not jobs directly (jobs are killed if one of their stages is). I've added the ability to kill jobs via the web ui. This code change is based on #4823 by lianhuiwang and updated to work with the latest code matching how stages are currently killed. In general I've copied the kill stage code warning and note comments and all. I also updated applicable tests and documentation. ## How was this patch tested? Manually tested and dev/run-tests ![screen shot 2016-10-11 at 4 49 43 pm](https://cloud.githubusercontent.com/assets/13952758/19292857/12f1b7c0-8fd4-11e6-8982-210249f7b697.png) Author: Alex Bozarth Author: Lianhui Wang Closes #15441 from ajbozarth/spark4411. 
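
Stripped of the servlet and SparkUI plumbing, the kill endpoint reduces to: honour the kill switch, parse the `id` parameter, and hand the job id to a cancel callback (in Spark that callback is roughly `SparkContext.cancelJob`). The sketch below is illustrative only; names are invented.

```scala
object KillJobSketch {
  def handleKillRequest(idParam: Option[String], killEnabled: Boolean)(cancel: Int => Unit): Unit = {
    if (killEnabled) {
      // Ignore missing or malformed ids rather than failing the request.
      idParam.flatMap(s => scala.util.Try(s.toInt).toOption).foreach(cancel)
    }
  }

  def main(args: Array[String]): Unit = {
    handleKillRequest(Some("42"), killEnabled = true)(id => println(s"cancelling job $id"))
    handleKillRequest(Some("not-a-number"), killEnabled = true)(_ => sys.error("never reached"))
    handleKillRequest(Some("7"), killEnabled = false)(_ => sys.error("kill disabled"))
  }
}
```
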
--- .../scala/org/apache/spark/ui/SparkUI.scala | 11 +++-- .../apache/spark/ui/jobs/AllJobsPage.scala | 34 ++++++++++++-- .../org/apache/spark/ui/jobs/JobsTab.scala | 17 +++++++ .../org/apache/spark/ui/jobs/StageTable.scala | 5 +- .../org/apache/spark/ui/jobs/StagesTab.scala | 17 +++---- .../org/apache/spark/ui/UISeleniumSuite.scala | 47 +++++++++++++++---- docs/configuration.md | 2 +- 7 files changed, 104 insertions(+), 29 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala index ef71db89798f1..f631a047a707d 100644 --- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala @@ -58,14 +58,13 @@ private[spark] class SparkUI private ( val killEnabled = sc.map(_.conf.getBoolean("spark.ui.killEnabled", true)).getOrElse(false) - - val stagesTab = new StagesTab(this) - var appId: String = _ /** Initialize all components of the server. */ def initialize() { - attachTab(new JobsTab(this)) + val jobsTab = new JobsTab(this) + attachTab(jobsTab) + val stagesTab = new StagesTab(this) attachTab(stagesTab) attachTab(new StorageTab(this)) attachTab(new EnvironmentTab(this)) @@ -73,7 +72,9 @@ private[spark] class SparkUI private ( attachHandler(createStaticHandler(SparkUI.STATIC_RESOURCE_DIR, "/static")) attachHandler(createRedirectHandler("/", "/jobs/", basePath = basePath)) attachHandler(ApiRootResource.getServletHandler(this)) - // This should be POST only, but, the YARN AM proxy won't proxy POSTs + // These should be POST only, but, the YARN AM proxy won't proxy POSTs + attachHandler(createRedirectHandler( + "/jobs/job/kill", "/jobs/", jobsTab.handleKillRequest, httpMethods = Set("GET", "POST"))) attachHandler(createRedirectHandler( "/stages/stage/kill", "/stages/", stagesTab.handleKillRequest, httpMethods = Set("GET", "POST"))) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index f6713097b9349..173fc3cf31ce8 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -218,7 +218,8 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") { request: HttpServletRequest, tableHeaderId: String, jobTag: String, - jobs: Seq[JobUIData]): Seq[Node] = { + jobs: Seq[JobUIData], + killEnabled: Boolean): Seq[Node] = { val allParameters = request.getParameterMap.asScala.toMap val parameterOtherTable = allParameters.filterNot(_._1.startsWith(jobTag)) .map(para => para._1 + "=" + para._2(0)) @@ -264,6 +265,7 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") { parameterOtherTable, parent.jobProgresslistener.stageIdToInfo, parent.jobProgresslistener.stageIdToData, + killEnabled, currentTime, jobIdTitle, pageSize = jobPageSize, @@ -290,9 +292,12 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") { val completedJobs = listener.completedJobs.reverse.toSeq val failedJobs = listener.failedJobs.reverse.toSeq - val activeJobsTable = jobsTable(request, "active", "activeJob", activeJobs) - val completedJobsTable = jobsTable(request, "completed", "completedJob", completedJobs) - val failedJobsTable = jobsTable(request, "failed", "failedJob", failedJobs) + val activeJobsTable = + jobsTable(request, "active", "activeJob", activeJobs, killEnabled = parent.killEnabled) + val completedJobsTable = + jobsTable(request, "completed", "completedJob", 
completedJobs, killEnabled = false) + val failedJobsTable = + jobsTable(request, "failed", "failedJob", failedJobs, killEnabled = false) val shouldShowActiveJobs = activeJobs.nonEmpty val shouldShowCompletedJobs = completedJobs.nonEmpty @@ -483,6 +488,7 @@ private[ui] class JobPagedTable( parameterOtherTable: Iterable[String], stageIdToInfo: HashMap[Int, StageInfo], stageIdToData: HashMap[(Int, Int), StageUIData], + killEnabled: Boolean, currentTime: Long, jobIdTitle: String, pageSize: Int, @@ -586,12 +592,30 @@ private[ui] class JobPagedTable( override def row(jobTableRow: JobTableRowData): Seq[Node] = { val job = jobTableRow.jobData + val killLink = if (killEnabled) { + val confirm = + s"if (window.confirm('Are you sure you want to kill job ${job.jobId} ?')) " + + "{ this.parentNode.submit(); return true; } else { return false; }" + // SPARK-6846 this should be POST-only but YARN AM won't proxy POST + /* + val killLinkUri = s"$basePathUri/jobs/job/kill/" +
    + + (kill) + + */ + val killLinkUri = s"$basePath/jobs/job/kill/?id=${job.jobId}" + (kill) + } else { + Seq.empty + } +
    From 402205ddf749e7478683ce1b0443df63b46b03fd Mon Sep 17 00:00:00 2001 From: Shuai Lin Date: Wed, 26 Oct 2016 14:31:47 +0200 Subject: [PATCH 121/162] [SPARK-17802] Improved caller context logging. ## What changes were proposed in this pull request? [SPARK-16757](https://issues.apache.org/jira/browse/SPARK-16757) sets the hadoop `CallerContext` when calling hadoop/hdfs apis to make spark applications more diagnosable in hadoop/hdfs logs. However, the `org.apache.hadoop.ipc.CallerContext` class is only added since [hadoop 2.8](https://issues.apache.org/jira/browse/HDFS-9184), which is not officially releaed yet. So each time `utils.CallerContext.setCurrentContext()` is called (e.g [when a task is created](https://github.com/apache/spark/blob/b678e46/core/src/main/scala/org/apache/spark/scheduler/Task.scala#L95-L96)), a "java.lang.ClassNotFoundException: org.apache.hadoop.ipc.CallerContext" error is logged, which pollutes the spark logs when there are lots of tasks. This patch improves this behaviour by only logging the `ClassNotFoundException` once. ## How was this patch tested? Existing tests. Author: Shuai Lin Closes #15377 from lins05/spark-17802-improve-callercontext-logging. --- .../scala/org/apache/spark/util/Utils.scala | 48 +++++++++++++------ .../org/apache/spark/util/UtilsSuite.scala | 7 +-- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index bfc609419ccdb..e57eb0de2689f 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2508,6 +2508,26 @@ private[spark] object Utils extends Logging { } } +private[util] object CallerContext extends Logging { + val callerContextSupported: Boolean = { + SparkHadoopUtil.get.conf.getBoolean("hadoop.caller.context.enabled", false) && { + try { + // scalastyle:off classforname + Class.forName("org.apache.hadoop.ipc.CallerContext") + Class.forName("org.apache.hadoop.ipc.CallerContext$Builder") + // scalastyle:on classforname + true + } catch { + case _: ClassNotFoundException => + false + case NonFatal(e) => + logWarning("Fail to load the CallerContext class", e) + false + } + } + } +} + /** * An utility class used to set up Spark caller contexts to HDFS and Yarn. The `context` will be * constructed by parameters passed in. @@ -2554,21 +2574,21 @@ private[spark] class CallerContext( * Set up the caller context [[context]] by invoking Hadoop CallerContext API of * [[org.apache.hadoop.ipc.CallerContext]], which was added in hadoop 2.8. 
*/ - def setCurrentContext(): Boolean = { - var succeed = false - try { - // scalastyle:off classforname - val callerContext = Class.forName("org.apache.hadoop.ipc.CallerContext") - val Builder = Class.forName("org.apache.hadoop.ipc.CallerContext$Builder") - // scalastyle:on classforname - val builderInst = Builder.getConstructor(classOf[String]).newInstance(context) - val hdfsContext = Builder.getMethod("build").invoke(builderInst) - callerContext.getMethod("setCurrent", callerContext).invoke(null, hdfsContext) - succeed = true - } catch { - case NonFatal(e) => logInfo("Fail to set Spark caller context", e) + def setCurrentContext(): Unit = { + if (CallerContext.callerContextSupported) { + try { + // scalastyle:off classforname + val callerContext = Class.forName("org.apache.hadoop.ipc.CallerContext") + val builder = Class.forName("org.apache.hadoop.ipc.CallerContext$Builder") + // scalastyle:on classforname + val builderInst = builder.getConstructor(classOf[String]).newInstance(context) + val hdfsContext = builder.getMethod("build").invoke(builderInst) + callerContext.getMethod("setCurrent", callerContext).invoke(null, hdfsContext) + } catch { + case NonFatal(e) => + logWarning("Fail to set Spark caller context", e) + } } - succeed } } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 4dda80f10a08a..aeb2969fd579e 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -843,14 +843,11 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { test("Set Spark CallerContext") { val context = "test" - try { + new CallerContext(context).setCurrentContext() + if (CallerContext.callerContextSupported) { val callerContext = Utils.classForName("org.apache.hadoop.ipc.CallerContext") - assert(new CallerContext(context).setCurrentContext()) assert(s"SPARK_$context" === callerContext.getMethod("getCurrent").invoke(null).toString) - } catch { - case e: ClassNotFoundException => - assert(!new CallerContext(context).setCurrentContext()) } } From 3c023570b28bc1ed24f5b2448311130fd1777fd3 Mon Sep 17 00:00:00 2001 From: jiangxingbo Date: Wed, 26 Oct 2016 17:09:48 +0200 Subject: [PATCH 122/162] [SPARK-17733][SQL] InferFiltersFromConstraints rule never terminates for query ## What changes were proposed in this pull request? The function `QueryPlan.inferAdditionalConstraints` and `UnaryNode.getAliasedConstraints` can produce a non-converging set of constraints for recursive functions. For instance, if we have two constraints of the form(where a is an alias): `a = b, a = f(b, c)` Applying both these rules in the next iteration would infer: `f(b, c) = f(f(b, c), c)` This process repeated, the iteration won't converge and the set of constraints will grow larger and larger until OOM. ~~To fix this problem, we collect alias from expressions and skip infer constraints if we are to transform an `Expression` to another which contains it.~~ To fix this problem, we apply additional check in `inferAdditionalConstraints`, when it's possible to generate recursive constraints, we skip generate that. ## How was this patch tested? Add new testcase in `SQLQuerySuite`/`InferFiltersFromConstraintsSuite`. Author: jiangxingbo Closes #15319 from jiangxb1987/constraints. 
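
To make the non-convergence concrete, here is a toy substitution loop over a two-constructor expression language (purely illustrative, not Catalyst's `Expression` API): from `a = b` and `a = f(b, c)` one can derive `b = f(b, c)`, and substituting that right-hand side back into itself grows the term every round, which is exactly the recursive deduction the new check refuses to generate.

```scala
sealed trait Expr
case class Var(name: String) extends Expr { override def toString: String = name }
case class Fn(x: Expr, y: Expr) extends Expr { override def toString: String = s"f($x, $y)" }

object ConstraintBlowUp {
  // Replace every occurrence of `from` inside `e` with `to`.
  def subst(e: Expr, from: Expr, to: Expr): Expr = e match {
    case _ if e == from => to
    case Fn(x, y)       => Fn(subst(x, from, to), subst(y, from, to))
    case other          => other
  }

  def main(args: Array[String]): Unit = {
    val b = Var("b")
    val c = Var("c")
    var rhs: Expr = Fn(b, c)               // start from the derived constraint b = f(b, c)
    (1 to 3).foreach { round =>
      rhs = subst(rhs, b, Fn(b, c))        // substitute b's own definition back in
      println(s"round $round: b = $rhs")   // f(f(b, c), c), f(f(f(b, c), c), c), ...
    }
  }
}
```
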
--- .../spark/sql/catalyst/plans/QueryPlan.scala | 88 +++++++++++++++++-- .../InferFiltersFromConstraintsSuite.scala | 87 +++++++++++++++++- .../spark/sql/catalyst/plans/PlanTest.scala | 25 +++++- .../org/apache/spark/sql/SQLQuerySuite.scala | 5 +- 4 files changed, 191 insertions(+), 14 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index 0fb6e7d2e795a..45ee2964d4db0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -68,26 +68,104 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT case _ => Seq.empty[Attribute] } + // Collect aliases from expressions, so we may avoid producing recursive constraints. + private lazy val aliasMap = AttributeMap( + (expressions ++ children.flatMap(_.expressions)).collect { + case a: Alias => (a.toAttribute, a.child) + }) + /** * Infers an additional set of constraints from a given set of equality constraints. * For e.g., if an operator has constraints of the form (`a = 5`, `a = b`), this returns an - * additional constraint of the form `b = 5` + * additional constraint of the form `b = 5`. + * + * [SPARK-17733] We explicitly prevent producing recursive constraints of the form `a = f(a, b)` + * as they are often useless and can lead to a non-converging set of constraints. */ private def inferAdditionalConstraints(constraints: Set[Expression]): Set[Expression] = { + val constraintClasses = generateEquivalentConstraintClasses(constraints) + var inferredConstraints = Set.empty[Expression] constraints.foreach { case eq @ EqualTo(l: Attribute, r: Attribute) => - inferredConstraints ++= (constraints - eq).map(_ transform { - case a: Attribute if a.semanticEquals(l) => r + val candidateConstraints = constraints - eq + inferredConstraints ++= candidateConstraints.map(_ transform { + case a: Attribute if a.semanticEquals(l) && + !isRecursiveDeduction(r, constraintClasses) => r }) - inferredConstraints ++= (constraints - eq).map(_ transform { - case a: Attribute if a.semanticEquals(r) => l + inferredConstraints ++= candidateConstraints.map(_ transform { + case a: Attribute if a.semanticEquals(r) && + !isRecursiveDeduction(l, constraintClasses) => l }) case _ => // No inference } inferredConstraints -- constraints } + /* + * Generate a sequence of expression sets from constraints, where each set stores an equivalence + * class of expressions. For example, Set(`a = b`, `b = c`, `e = f`) will generate the following + * expression sets: (Set(a, b, c), Set(e, f)). This will be used to search all expressions equal + * to an selected attribute. + */ + private def generateEquivalentConstraintClasses( + constraints: Set[Expression]): Seq[Set[Expression]] = { + var constraintClasses = Seq.empty[Set[Expression]] + constraints.foreach { + case eq @ EqualTo(l: Attribute, r: Attribute) => + // Transform [[Alias]] to its child. + val left = aliasMap.getOrElse(l, l) + val right = aliasMap.getOrElse(r, r) + // Get the expression set for an equivalence constraint class. + val leftConstraintClass = getConstraintClass(left, constraintClasses) + val rightConstraintClass = getConstraintClass(right, constraintClasses) + if (leftConstraintClass.nonEmpty && rightConstraintClass.nonEmpty) { + // Combine the two sets. 
+ constraintClasses = constraintClasses + .diff(leftConstraintClass :: rightConstraintClass :: Nil) :+ + (leftConstraintClass ++ rightConstraintClass) + } else if (leftConstraintClass.nonEmpty) { // && rightConstraintClass.isEmpty + // Update equivalence class of `left` expression. + constraintClasses = constraintClasses + .diff(leftConstraintClass :: Nil) :+ (leftConstraintClass + right) + } else if (rightConstraintClass.nonEmpty) { // && leftConstraintClass.isEmpty + // Update equivalence class of `right` expression. + constraintClasses = constraintClasses + .diff(rightConstraintClass :: Nil) :+ (rightConstraintClass + left) + } else { // leftConstraintClass.isEmpty && rightConstraintClass.isEmpty + // Create new equivalence constraint class since neither expression presents + // in any classes. + constraintClasses = constraintClasses :+ Set(left, right) + } + case _ => // Skip + } + + constraintClasses + } + + /* + * Get all expressions equivalent to the selected expression. + */ + private def getConstraintClass( + expr: Expression, + constraintClasses: Seq[Set[Expression]]): Set[Expression] = + constraintClasses.find(_.contains(expr)).getOrElse(Set.empty[Expression]) + + /* + * Check whether replace by an [[Attribute]] will cause a recursive deduction. Generally it + * has the form like: `a -> f(a, b)`, where `a` and `b` are expressions and `f` is a function. + * Here we first get all expressions equal to `attr` and then check whether at least one of them + * is a child of the referenced expression. + */ + private def isRecursiveDeduction( + attr: Attribute, + constraintClasses: Seq[Set[Expression]]): Boolean = { + val expr = aliasMap.getOrElse(attr, attr) + getConstraintClass(expr, constraintClasses).exists { e => + expr.children.exists(_.semanticEquals(e)) + } + } + /** * An [[ExpressionSet]] that contains invariants about the rows output by this operator. 
For * example, if this set contains the expression `a = 2` then that expression is guaranteed to diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala index e7fdd5a6202b6..9f57f66a2ea20 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala @@ -27,9 +27,12 @@ import org.apache.spark.sql.catalyst.rules._ class InferFiltersFromConstraintsSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { - val batches = Batch("InferFilters", FixedPoint(5), InferFiltersFromConstraints) :: - Batch("PredicatePushdown", FixedPoint(5), PushPredicateThroughJoin) :: - Batch("CombineFilters", FixedPoint(5), CombineFilters) :: Nil + val batches = + Batch("InferAndPushDownFilters", FixedPoint(100), + PushPredicateThroughJoin, + PushDownPredicate, + InferFiltersFromConstraints, + CombineFilters) :: Nil } val testRelation = LocalRelation('a.int, 'b.int, 'c.int) @@ -120,4 +123,82 @@ class InferFiltersFromConstraintsSuite extends PlanTest { val optimized = Optimize.execute(originalQuery) comparePlans(optimized, correctAnswer) } + + test("inner join with alias: alias contains multiple attributes") { + val t1 = testRelation.subquery('t1) + val t2 = testRelation.subquery('t2) + + val originalQuery = t1.select('a, Coalesce(Seq('a, 'b)).as('int_col)).as("t") + .join(t2, Inner, Some("t.a".attr === "t2.a".attr && "t.int_col".attr === "t2.a".attr)) + .analyze + val correctAnswer = t1 + .where(IsNotNull('a) && IsNotNull(Coalesce(Seq('a, 'b))) && 'a === Coalesce(Seq('a, 'b))) + .select('a, Coalesce(Seq('a, 'b)).as('int_col)).as("t") + .join(t2.where(IsNotNull('a)), Inner, + Some("t.a".attr === "t2.a".attr && "t.int_col".attr === "t2.a".attr)) + .analyze + val optimized = Optimize.execute(originalQuery) + comparePlans(optimized, correctAnswer) + } + + test("inner join with alias: alias contains single attributes") { + val t1 = testRelation.subquery('t1) + val t2 = testRelation.subquery('t2) + + val originalQuery = t1.select('a, 'b.as('d)).as("t") + .join(t2, Inner, Some("t.a".attr === "t2.a".attr && "t.d".attr === "t2.a".attr)) + .analyze + val correctAnswer = t1 + .where(IsNotNull('a) && IsNotNull('b) && 'a <=> 'a && 'b <=> 'b &&'a === 'b) + .select('a, 'b.as('d)).as("t") + .join(t2.where(IsNotNull('a) && 'a <=> 'a), Inner, + Some("t.a".attr === "t2.a".attr && "t.d".attr === "t2.a".attr)) + .analyze + val optimized = Optimize.execute(originalQuery) + comparePlans(optimized, correctAnswer) + } + + test("inner join with alias: don't generate constraints for recursive functions") { + val t1 = testRelation.subquery('t1) + val t2 = testRelation.subquery('t2) + + val originalQuery = t1.select('a, 'b.as('d), Coalesce(Seq('a, 'b)).as('int_col)).as("t") + .join(t2, Inner, + Some("t.a".attr === "t2.a".attr + && "t.d".attr === "t2.a".attr + && "t.int_col".attr === "t2.a".attr)) + .analyze + val correctAnswer = t1 + .where(IsNotNull('a) && IsNotNull(Coalesce(Seq('a, 'a))) + && 'a === Coalesce(Seq('a, 'a)) && 'a <=> Coalesce(Seq('a, 'a)) && 'a <=> 'a + && Coalesce(Seq('a, 'a)) <=> 'b && Coalesce(Seq('a, 'a)) <=> Coalesce(Seq('a, 'a)) + && 'a === 'b && IsNotNull(Coalesce(Seq('a, 'b))) && 'a === Coalesce(Seq('a, 'b)) + && Coalesce(Seq('a, 'b)) <=> Coalesce(Seq('b, 'b)) && 
Coalesce(Seq('a, 'b)) === 'b + && IsNotNull('b) && IsNotNull(Coalesce(Seq('b, 'b))) + && 'b === Coalesce(Seq('b, 'b)) && 'b <=> Coalesce(Seq('b, 'b)) + && Coalesce(Seq('b, 'b)) <=> Coalesce(Seq('b, 'b)) && 'b <=> 'b) + .select('a, 'b.as('d), Coalesce(Seq('a, 'b)).as('int_col)).as("t") + .join(t2 + .where(IsNotNull('a) && IsNotNull(Coalesce(Seq('a, 'a))) + && 'a === Coalesce(Seq('a, 'a)) && 'a <=> Coalesce(Seq('a, 'a)) && 'a <=> 'a + && Coalesce(Seq('a, 'a)) <=> Coalesce(Seq('a, 'a))), Inner, + Some("t.a".attr === "t2.a".attr + && "t.d".attr === "t2.a".attr + && "t.int_col".attr === "t2.a".attr + && Coalesce(Seq("t.d".attr, "t.d".attr)) <=> "t.int_col".attr)) + .analyze + val optimized = Optimize.execute(originalQuery) + comparePlans(optimized, correctAnswer) + } + + test("generate correct filters for alias that don't produce recursive constraints") { + val t1 = testRelation.subquery('t1) + + val originalQuery = t1.select('a.as('x), 'b.as('y)).where('x === 1 && 'x === 'y).analyze + val correctAnswer = + t1.where('a === 1 && 'b === 1 && 'a === 'b && IsNotNull('a) && IsNotNull('b)) + .select('a.as('x), 'b.as('y)).analyze + val optimized = Optimize.execute(originalQuery) + comparePlans(optimized, correctAnswer) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index 6310f0c2bc0ed..64e268703bf5e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression -import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, OneRowRelation, Sample} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util._ /** @@ -56,16 +56,37 @@ abstract class PlanTest extends SparkFunSuite with PredicateHelper { * ((expr 1 && expr 2) && expr 3), (expr 1 && expr 2 && expr 3), (expr 3 && (expr 1 && expr 2) * etc., will all now be equivalent. * - Sample the seed will replaced by 0L. + * - Join conditions will be resorted by hashCode. */ private def normalizePlan(plan: LogicalPlan): LogicalPlan = { plan transform { case filter @ Filter(condition: Expression, child: LogicalPlan) => - Filter(splitConjunctivePredicates(condition).sortBy(_.hashCode()).reduce(And), child) + Filter(splitConjunctivePredicates(condition).map(rewriteEqual(_)).sortBy(_.hashCode()) + .reduce(And), child) case sample: Sample => sample.copy(seed = 0L)(true) + case join @ Join(left, right, joinType, condition) if condition.isDefined => + val newCondition = + splitConjunctivePredicates(condition.get).map(rewriteEqual(_)).sortBy(_.hashCode()) + .reduce(And) + Join(left, right, joinType, Some(newCondition)) } } + /** + * Rewrite [[EqualTo]] and [[EqualNullSafe]] operator to keep order. The following cases will be + * equivalent: + * 1. (a = b), (b = a); + * 2. (a <=> b), (b <=> a). + */ + private def rewriteEqual(condition: Expression): Expression = condition match { + case eq @ EqualTo(l: Expression, r: Expression) => + Seq(l, r).sortBy(_.hashCode()).reduce(EqualTo) + case eq @ EqualNullSafe(l: Expression, r: Expression) => + Seq(l, r).sortBy(_.hashCode()).reduce(EqualNullSafe) + case _ => condition // Don't reorder. 
+ } + /** Fails the test if the two plans do not match */ protected def comparePlans(plan1: LogicalPlan, plan2: LogicalPlan) { val normalized1 = normalizePlan(normalizeExprIds(plan1)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 60978efddd7f8..bd4c25315c311 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -19,12 +19,9 @@ package org.apache.spark.sql import java.io.File import java.math.MathContext -import java.sql.{Date, Timestamp} +import java.sql.Timestamp import org.apache.spark.{AccumulatorSuite, SparkException} -import org.apache.spark.sql.catalyst.analysis.UnresolvedException -import org.apache.spark.sql.catalyst.expressions.SortOrder -import org.apache.spark.sql.catalyst.plans.logical.Aggregate import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, CartesianProductExec, SortMergeJoinExec} From 4bee9540790a40acb74db4b0b44c364c4b3f537d Mon Sep 17 00:00:00 2001 From: Mark Grover Date: Wed, 26 Oct 2016 09:07:30 -0700 Subject: [PATCH 123/162] =?UTF-8?q?[SPARK-18093][SQL]=20Fix=20default=20va?= =?UTF-8?q?lue=20test=20in=20SQLConfSuite=20to=20work=20rega=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …rdless of warehouse dir's existence ## What changes were proposed in this pull request? Appending a trailing slash, if there already isn't one for the sake comparison of the two paths. It doesn't take away from the essence of the check, but removes any potential mismatch due to lack of trailing slash. ## How was this patch tested? Ran unit tests and they passed. Author: Mark Grover Closes #15623 from markgrover/spark-18093. --- .../scala/org/apache/spark/sql/internal/SQLConfSuite.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index a89a43fa1e777..11d4693f1c2a3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -215,12 +215,15 @@ class SQLConfSuite extends QueryTest with SharedSQLContext { } test("default value of WAREHOUSE_PATH") { + val original = spark.conf.get(SQLConf.WAREHOUSE_PATH) try { // to get the default value, always unset it spark.conf.unset(SQLConf.WAREHOUSE_PATH.key) - assert(new Path(Utils.resolveURI("spark-warehouse")).toString === - spark.sessionState.conf.warehousePath + "/") + // JVM adds a trailing slash if the directory exists and leaves it as-is, if it doesn't + // In our comparison, strip trailing slash off of both sides, to account for such cases + assert(new Path(Utils.resolveURI("spark-warehouse")).toString.stripSuffix("/") === spark + .sessionState.conf.warehousePath.stripSuffix("/")) } finally { sql(s"set ${SQLConf.WAREHOUSE_PATH}=$original") } From 312ea3f7f65532818e11016d6d780ad47485175f Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 26 Oct 2016 09:28:28 -0700 Subject: [PATCH 124/162] [SPARK-17748][FOLLOW-UP][ML] Reorg variables of WeightedLeastSquares. ## What changes were proposed in this pull request? This is follow-up work of #15394. 
Reorg some variables of ```WeightedLeastSquares``` and fix one minor issue of ```WeightedLeastSquaresSuite```. ## How was this patch tested? Existing tests. Author: Yanbo Liang Closes #15621 from yanboliang/spark-17748. --- .../spark/ml/optim/WeightedLeastSquares.scala | 139 ++++++++++-------- .../ml/optim/WeightedLeastSquaresSuite.scala | 15 +- 2 files changed, 86 insertions(+), 68 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala index 2223f126f1b69..90c24e1b590ea 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala @@ -101,23 +101,19 @@ private[ml] class WeightedLeastSquares( summary.validate() logInfo(s"Number of instances: ${summary.count}.") val k = if (fitIntercept) summary.k + 1 else summary.k + val numFeatures = summary.k val triK = summary.triK val wSum = summary.wSum - val bBar = summary.bBar - val bbBar = summary.bbBar - val aBar = summary.aBar - val aStd = summary.aStd - val abBar = summary.abBar - val aaBar = summary.aaBar - val numFeatures = abBar.size + val rawBStd = summary.bStd + val rawBBar = summary.bBar // if b is constant (rawBStd is zero), then b cannot be scaled. In this case - // setting bStd=abs(bBar) ensures that b is not scaled anymore in l-bfgs algorithm. - val bStd = if (rawBStd == 0.0) math.abs(bBar) else rawBStd + // setting bStd=abs(rawBBar) ensures that b is not scaled anymore in l-bfgs algorithm. + val bStd = if (rawBStd == 0.0) math.abs(rawBBar) else rawBStd if (rawBStd == 0) { - if (fitIntercept || bBar == 0.0) { - if (bBar == 0.0) { + if (fitIntercept || rawBBar == 0.0) { + if (rawBBar == 0.0) { logWarning(s"Mean and standard deviation of the label are zero, so the coefficients " + s"and the intercept will all be zero; as a result, training is not needed.") } else { @@ -126,7 +122,7 @@ private[ml] class WeightedLeastSquares( s"training is not needed.") } val coefficients = new DenseVector(Array.ofDim(numFeatures)) - val intercept = bBar + val intercept = rawBBar val diagInvAtWA = new DenseVector(Array(0D)) return new WeightedLeastSquaresModel(coefficients, intercept, diagInvAtWA, Array(0D)) } else { @@ -137,53 +133,70 @@ private[ml] class WeightedLeastSquares( } } - // scale aBar to standardized space in-place - val aBarValues = aBar.values - var j = 0 - while (j < numFeatures) { - if (aStd(j) == 0.0) { - aBarValues(j) = 0.0 - } else { - aBarValues(j) /= aStd(j) - } - j += 1 - } + val bBar = summary.bBar / bStd + val bbBar = summary.bbBar / (bStd * bStd) - // scale abBar to standardized space in-place - val abBarValues = abBar.values + val aStd = summary.aStd val aStdValues = aStd.values - j = 0 - while (j < numFeatures) { - if (aStdValues(j) == 0.0) { - abBarValues(j) = 0.0 - } else { - abBarValues(j) /= (aStdValues(j) * bStd) + + val aBar = { + val _aBar = summary.aBar + val _aBarValues = _aBar.values + var i = 0 + // scale aBar to standardized space in-place + while (i < numFeatures) { + if (aStdValues(i) == 0.0) { + _aBarValues(i) = 0.0 + } else { + _aBarValues(i) /= aStdValues(i) + } + i += 1 } - j += 1 + _aBar } + val aBarValues = aBar.values - // scale aaBar to standardized space in-place - val aaBarValues = aaBar.values - j = 0 - var p = 0 - while (j < numFeatures) { - val aStdJ = aStdValues(j) + val abBar = { + val _abBar = summary.abBar + val _abBarValues = _abBar.values var i = 0 - while (i <= j) { - val 
aStdI = aStdValues(i) - if (aStdJ == 0.0 || aStdI == 0.0) { - aaBarValues(p) = 0.0 + // scale abBar to standardized space in-place + while (i < numFeatures) { + if (aStdValues(i) == 0.0) { + _abBarValues(i) = 0.0 } else { - aaBarValues(p) /= (aStdI * aStdJ) + _abBarValues(i) /= (aStdValues(i) * bStd) } - p += 1 i += 1 } - j += 1 + _abBar } + val abBarValues = abBar.values - val bBarStd = bBar / bStd - val bbBarStd = bbBar / (bStd * bStd) + val aaBar = { + val _aaBar = summary.aaBar + val _aaBarValues = _aaBar.values + var j = 0 + var p = 0 + // scale aaBar to standardized space in-place + while (j < numFeatures) { + val aStdJ = aStdValues(j) + var i = 0 + while (i <= j) { + val aStdI = aStdValues(i) + if (aStdJ == 0.0 || aStdI == 0.0) { + _aaBarValues(p) = 0.0 + } else { + _aaBarValues(p) /= (aStdI * aStdJ) + } + p += 1 + i += 1 + } + j += 1 + } + _aaBar + } + val aaBarValues = aaBar.values val effectiveRegParam = regParam / bStd val effectiveL1RegParam = elasticNetParam * effectiveRegParam @@ -191,11 +204,11 @@ private[ml] class WeightedLeastSquares( // add L2 regularization to diagonals var i = 0 - j = 2 + var j = 2 while (i < triK) { var lambda = effectiveL2RegParam if (!standardizeFeatures) { - val std = aStd(j - 2) + val std = aStdValues(j - 2) if (std != 0.0) { lambda /= (std * std) } else { @@ -209,8 +222,9 @@ private[ml] class WeightedLeastSquares( i += j j += 1 } - val aa = getAtA(aaBar.values, aBar.values) - val ab = getAtB(abBar.values, bBarStd) + + val aa = getAtA(aaBarValues, aBarValues) + val ab = getAtB(abBarValues, bBar) val solver = if ((solverType == WeightedLeastSquares.Auto && elasticNetParam != 0.0 && regParam != 0.0) || (solverType == WeightedLeastSquares.QuasiNewton)) { @@ -237,22 +251,23 @@ private[ml] class WeightedLeastSquares( val solution = solver match { case cholesky: CholeskySolver => try { - cholesky.solve(bBarStd, bbBarStd, ab, aa, aBar) + cholesky.solve(bBar, bbBar, ab, aa, aBar) } catch { // if Auto solver is used and Cholesky fails due to singular AtA, then fall back to - // quasi-newton solver + // Quasi-Newton solver. case _: SingularMatrixException if solverType == WeightedLeastSquares.Auto => logWarning("Cholesky solver failed due to singular covariance matrix. 
" + "Retrying with Quasi-Newton solver.") // ab and aa were modified in place, so reconstruct them - val _aa = getAtA(aaBar.values, aBar.values) - val _ab = getAtB(abBar.values, bBarStd) + val _aa = getAtA(aaBarValues, aBarValues) + val _ab = getAtB(abBarValues, bBar) val newSolver = new QuasiNewtonSolver(fitIntercept, maxIter, tol, None) - newSolver.solve(bBarStd, bbBarStd, _ab, _aa, aBar) + newSolver.solve(bBar, bbBar, _ab, _aa, aBar) } case qn: QuasiNewtonSolver => - qn.solve(bBarStd, bbBarStd, ab, aa, aBar) + qn.solve(bBar, bbBar, ab, aa, aBar) } + val (coefficientArray, intercept) = if (fitIntercept) { (solution.coefficients.slice(0, solution.coefficients.length - 1), solution.coefficients.last * bStd) @@ -271,7 +286,11 @@ private[ml] class WeightedLeastSquares( // aaInv is a packed upper triangular matrix, here we get all elements on diagonal val diagInvAtWA = solution.aaInv.map { inv => new DenseVector((1 to k).map { i => - val multiplier = if (i == k && fitIntercept) 1.0 else aStdValues(i - 1) * aStdValues(i - 1) + val multiplier = if (i == k && fitIntercept) { + 1.0 + } else { + aStdValues(i - 1) * aStdValues(i - 1) + } inv(i + (i - 1) * i / 2 - 1) / (wSum * multiplier) }.toArray) }.getOrElse(new DenseVector(Array(0D))) @@ -280,7 +299,7 @@ private[ml] class WeightedLeastSquares( solution.objectiveHistory.getOrElse(Array(0D))) } - /** Construct A^T^ A from summary statistics. */ + /** Construct A^T^ A (append bias if necessary). */ private def getAtA(aaBar: Array[Double], aBar: Array[Double]): DenseVector = { if (fitIntercept) { new DenseVector(Array.concat(aaBar, aBar, Array(1.0))) @@ -289,7 +308,7 @@ private[ml] class WeightedLeastSquares( } } - /** Construct A^T^ b from summary statistics. */ + /** Construct A^T^ b (append bias if necessary). 
*/ private def getAtB(abBar: Array[Double], bBar: Double): DenseVector = { if (fitIntercept) { new DenseVector(Array.concat(abBar, Array(bBar))) diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala index 3cdab0327991e..093d02ea7a14b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala @@ -361,14 +361,13 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext for (fitIntercept <- Seq(false, true); standardization <- Seq(false, true); (lambda, alpha) <- Seq((0.0, 0.0), (0.5, 0.0), (0.5, 0.5), (0.5, 1.0))) { - for (solver <- Seq(WeightedLeastSquares.Auto, WeightedLeastSquares.Cholesky)) { - val wls = new WeightedLeastSquares(fitIntercept, regParam = lambda, elasticNetParam = alpha, - standardizeFeatures = standardization, standardizeLabel = true, - solverType = WeightedLeastSquares.QuasiNewton) - val model = wls.fit(constantFeaturesInstances) - val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) - assert(actual ~== expectedQuasiNewton(idx) absTol 1e-6) - } + val wls = new WeightedLeastSquares(fitIntercept, regParam = lambda, elasticNetParam = alpha, + standardizeFeatures = standardization, standardizeLabel = true, + solverType = WeightedLeastSquares.QuasiNewton) + val model = wls.fit(constantFeaturesInstances) + val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) + assert(actual ~== expectedQuasiNewton(idx) absTol 1e-6) + idx += 1 } } From 7ac70e7ba8d610a45c21a70dc28e4c989c19451b Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Wed, 26 Oct 2016 10:36:36 -0700 Subject: [PATCH 125/162] [SPARK-13747][SQL] Fix concurrent executions in ForkJoinPool for SQL ## What changes were proposed in this pull request? Calling `Await.result` will allow other tasks to be run on the same thread when using ForkJoinPool. However, SQL uses a `ThreadLocal` execution id to trace Spark jobs launched by a query, which doesn't work perfectly in ForkJoinPool. This PR just uses `Awaitable.result` instead to prevent ForkJoinPool from running other tasks in the current waiting thread. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #15520 from zsxwing/SPARK-13747. --- .../org/apache/spark/util/ThreadUtils.scala | 21 +++++++++++++++++++ scalastyle-config.xml | 1 + .../execution/basicPhysicalOperators.scala | 2 +- .../exchange/BroadcastExchangeExec.scala | 3 ++- 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index 5a6dbc830448a..d093e7bfc3dac 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -194,4 +194,25 @@ private[spark] object ThreadUtils { throw new SparkException("Exception thrown in awaitResult: ", t) } } + + /** + * Calls [[Awaitable.result]] directly to avoid using `ForkJoinPool`'s `BlockingContext`, wraps + * and re-throws any exceptions with nice stack track. + * + * Codes running in the user's thread may be in a thread of Scala ForkJoinPool. 
As concurrent + * executions in ForkJoinPool may see some [[ThreadLocal]] value unexpectedly, this method + * basically prevents ForkJoinPool from running other tasks in the current waiting thread. + */ + @throws(classOf[SparkException]) + def awaitResultInForkJoinSafely[T](awaitable: Awaitable[T], atMost: Duration): T = { + try { + // `awaitPermission` is not actually used anywhere so it's safe to pass in null here. + // See SPARK-13747. + val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait] + awaitable.result(Duration.Inf)(awaitPermission) + } catch { + case NonFatal(t) => + throw new SparkException("Exception thrown in awaitResult: ", t) + } + } } diff --git a/scalastyle-config.xml b/scalastyle-config.xml index 7fe0697202cd1..81d57d723a720 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -200,6 +200,7 @@ This file is divided into 3 sections: // scalastyle:off awaitresult Await.result(...) // scalastyle:on awaitresult + If your codes use ThreadLocal and may run in threads created by the user, use ThreadUtils.awaitResultInForkJoinSafely instead. ]]> diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index 37d750e621c25..a5291e0c12f88 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -570,7 +570,7 @@ case class SubqueryExec(name: String, child: SparkPlan) extends UnaryExecNode { } override def executeCollect(): Array[InternalRow] = { - ThreadUtils.awaitResult(relationFuture, Duration.Inf) + ThreadUtils.awaitResultInForkJoinSafely(relationFuture, Duration.Inf) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala index 7be5d31d4a765..ce5013daeb1f9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala @@ -128,7 +128,8 @@ case class BroadcastExchangeExec( } override protected[sql] def doExecuteBroadcast[T](): broadcast.Broadcast[T] = { - ThreadUtils.awaitResult(relationFuture, timeout).asInstanceOf[broadcast.Broadcast[T]] + ThreadUtils.awaitResultInForkJoinSafely(relationFuture, timeout) + .asInstanceOf[broadcast.Broadcast[T]] } } From fa7d9d70825a6816495d239da925d0087f7cb94f Mon Sep 17 00:00:00 2001 From: jiangxingbo Date: Wed, 26 Oct 2016 20:12:20 +0200 Subject: [PATCH 126/162] [SPARK-18063][SQL] Failed to infer constraints over multiple aliases ## What changes were proposed in this pull request? The `UnaryNode.getAliasedConstraints` function fails to replace all expressions by their alias where constraints contains more than one expression to be replaced. For example: ``` val tr = LocalRelation('a.int, 'b.string, 'c.int) val multiAlias = tr.where('a === 'c + 10).select('a.as('x), 'c.as('y)) multiAlias.analyze.constraints ``` currently outputs: ``` ExpressionSet(Seq( IsNotNull(resolveColumn(multiAlias.analyze, "x")), IsNotNull(resolveColumn(multiAlias.analyze, "y")) ) ``` The constraint `resolveColumn(multiAlias.analyze, "x") === resolveColumn(multiAlias.analyze, "y") + 10)` is missing. ## How was this patch tested? Add new test cases in `ConstraintPropagationSuite`. 
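As a concrete illustration of the fix, here is a small sketch of the difference between rewriting the child constraints once per alias and accumulating the rewrites. It uses plain strings instead of Catalyst expressions, and `aliases`, `childConstraints`, `perAlias` and `folded` are names invented for this example only:

```scala
// String-based toy; not the Catalyst implementation. For select('a.as('x), 'c.as('y))
// over a child carrying the constraint a = c + 10:
val aliases = Seq("a" -> "x", "c" -> "y")
val childConstraints = Set("isnotnull(a)", "isnotnull(c)", "a = c + 10")

// Old behaviour: each alias rewrites the child constraints independently, so a
// constraint that refers to two aliased expressions is never fully rewritten
// (it keeps mentioning `a` or `c`, which are not in the project's output).
val perAlias = aliases.flatMap { case (from, to) =>
  childConstraints.map(_.replace(from, to))
}.toSet
// contains "x = c + 10" and "a = y + 10", but never "x = y + 10"

// New behaviour: accumulate, so each alias is applied on top of what the previous
// aliases already produced.
val folded = aliases.foldLeft(childConstraints) { case (acc, (from, to)) =>
  acc ++ acc.map(_.replace(from, to))
}
// contains "x = y + 10", the constraint that was previously lost
```

The actual change to `getAliasedConstraints` below performs the same accumulation over `allConstraints`, additionally records `EqualNullSafe(e, a.toAttribute)` for each alias, and finally subtracts the original child constraints.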
Author: jiangxingbo Closes #15597 from jiangxb1987/alias-constraints. --- .../sql/catalyst/plans/logical/LogicalPlan.scala | 16 ++++++++++------ .../plans/ConstraintPropagationSuite.scala | 8 ++++++++ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 09725473a384d..b0a4145f37767 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -293,15 +293,19 @@ abstract class UnaryNode extends LogicalPlan { * expressions with the corresponding alias */ protected def getAliasedConstraints(projectList: Seq[NamedExpression]): Set[Expression] = { - projectList.flatMap { + var allConstraints = child.constraints.asInstanceOf[Set[Expression]] + projectList.foreach { case a @ Alias(e, _) => - child.constraints.map(_ transform { + // For every alias in `projectList`, replace the reference in constraints by its attribute. + allConstraints ++= allConstraints.map(_ transform { case expr: Expression if expr.semanticEquals(e) => a.toAttribute - }).union(Set(EqualNullSafe(e, a.toAttribute))) - case _ => - Set.empty[Expression] - }.toSet + }) + allConstraints += EqualNullSafe(e, a.toAttribute) + case _ => // Don't change. + } + + allConstraints -- child.constraints } override protected def validConstraints: Set[Expression] = child.constraints diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala index 8d6a49a8a37b4..8068ce922e636 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala @@ -128,8 +128,16 @@ class ConstraintPropagationSuite extends SparkFunSuite { ExpressionSet(Seq(resolveColumn(aliasedRelation.analyze, "x") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "x")), resolveColumn(aliasedRelation.analyze, "b") <=> resolveColumn(aliasedRelation.analyze, "y"), + resolveColumn(aliasedRelation.analyze, "z") <=> resolveColumn(aliasedRelation.analyze, "x"), resolveColumn(aliasedRelation.analyze, "z") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "z"))))) + + val multiAlias = tr.where('a === 'c + 10).select('a.as('x), 'c.as('y)) + verifyConstraints(multiAlias.analyze.constraints, + ExpressionSet(Seq(IsNotNull(resolveColumn(multiAlias.analyze, "x")), + IsNotNull(resolveColumn(multiAlias.analyze, "y")), + resolveColumn(multiAlias.analyze, "x") === resolveColumn(multiAlias.analyze, "y") + 10)) + ) } test("propagating constraints in union") { From 7d10631c16b980adf1f55378c128436310daed65 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Wed, 26 Oct 2016 11:16:20 -0700 Subject: [PATCH 127/162] [SPARK-18104][DOC] Don't build KafkaSource doc ## What changes were proposed in this pull request? Don't need to build doc for KafkaSource because the user should use the data source APIs to use KafkaSource. All KafkaSource APIs are internal. ## How was this patch tested? Verified manually. Author: Shixiong Zhu Closes #15630 from zsxwing/kafka-unidoc. 
--- project/SparkBuild.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 88d5dc9b02dd9..2d3a95b163a76 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -714,9 +714,9 @@ object Unidoc { publish := {}, unidocProjectFilter in(ScalaUnidoc, unidoc) := - inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, streamingFlumeSink, yarn, tags, streamingKafka010), + inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, streamingFlumeSink, yarn, tags, streamingKafka010, sqlKafka010), unidocProjectFilter in(JavaUnidoc, unidoc) := - inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, streamingFlumeSink, yarn, tags, streamingKafka010), + inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, streamingFlumeSink, yarn, tags, streamingKafka010, sqlKafka010), unidocAllClasspaths in (ScalaUnidoc, unidoc) := { ignoreClasspaths((unidocAllClasspaths in (ScalaUnidoc, unidoc)).value) From ea3605e82545031a00235ee0f449e1e2418674e8 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 26 Oct 2016 11:48:54 -0700 Subject: [PATCH 128/162] [MINOR][ML] Refactor clustering summary. ## What changes were proposed in this pull request? Abstract ```ClusteringSummary``` from ```KMeansSummary```, ```GaussianMixtureSummary``` and ```BisectingSummary```, and eliminate duplicated pieces of code. ## How was this patch tested? Existing tests. Author: Yanbo Liang Closes #15555 from yanboliang/clustering-summary. --- .../spark/ml/clustering/BisectingKMeans.scala | 36 +++---------- .../ml/clustering/ClusteringSummary.scala | 54 +++++++++++++++++++ .../spark/ml/clustering/GaussianMixture.scala | 37 ++++--------- .../apache/spark/ml/clustering/KMeans.scala | 36 +++---------- 4 files changed, 80 insertions(+), 83 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index ef2d918ea3542..2718dd93dcb5a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -288,35 +288,15 @@ object BisectingKMeans extends DefaultParamsReadable[BisectingKMeans] { * :: Experimental :: * Summary of BisectingKMeans. * - * @param predictions [[DataFrame]] produced by [[BisectingKMeansModel.transform()]] - * @param predictionCol Name for column of predicted clusters in `predictions` - * @param featuresCol Name for column of features in `predictions` - * @param k Number of clusters + * @param predictions [[DataFrame]] produced by [[BisectingKMeansModel.transform()]]. + * @param predictionCol Name for column of predicted clusters in `predictions`. + * @param featuresCol Name for column of features in `predictions`. + * @param k Number of clusters. */ @Since("2.1.0") @Experimental class BisectingKMeansSummary private[clustering] ( - @Since("2.1.0") @transient val predictions: DataFrame, - @Since("2.1.0") val predictionCol: String, - @Since("2.1.0") val featuresCol: String, - @Since("2.1.0") val k: Int) extends Serializable { - - /** - * Cluster centers of the transformed data. - */ - @Since("2.1.0") - @transient lazy val cluster: DataFrame = predictions.select(predictionCol) - - /** - * Size of (number of data points in) each cluster. 
- */ - @Since("2.1.0") - lazy val clusterSizes: Array[Long] = { - val sizes = Array.fill[Long](k)(0) - cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach { - case Row(cluster: Int, count: Long) => sizes(cluster) = count - } - sizes - } - -} + predictions: DataFrame, + predictionCol: String, + featuresCol: String, + k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala new file mode 100644 index 0000000000000..8b5f525194f28 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.clustering + +import org.apache.spark.annotation.Experimental +import org.apache.spark.sql.{DataFrame, Row} + +/** + * :: Experimental :: + * Summary of clustering algorithms. + * + * @param predictions [[DataFrame]] produced by model.transform(). + * @param predictionCol Name for column of predicted clusters in `predictions`. + * @param featuresCol Name for column of features in `predictions`. + * @param k Number of clusters. + */ +@Experimental +class ClusteringSummary private[clustering] ( + @transient val predictions: DataFrame, + val predictionCol: String, + val featuresCol: String, + val k: Int) extends Serializable { + + /** + * Cluster centers of the transformed data. + */ + @transient lazy val cluster: DataFrame = predictions.select(predictionCol) + + /** + * Size of (number of data points in) each cluster. + */ + lazy val clusterSizes: Array[Long] = { + val sizes = Array.fill[Long](k)(0) + cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach { + case Row(cluster: Int, count: Long) => sizes(cluster) = count + } + sizes + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index 69f060ad7711e..e3cb92f4f144d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -356,42 +356,25 @@ object GaussianMixture extends DefaultParamsReadable[GaussianMixture] { * :: Experimental :: * Summary of GaussianMixture. 
* - * @param predictions [[DataFrame]] produced by [[GaussianMixtureModel.transform()]] - * @param predictionCol Name for column of predicted clusters in `predictions` - * @param probabilityCol Name for column of predicted probability of each cluster in `predictions` - * @param featuresCol Name for column of features in `predictions` - * @param k Number of clusters + * @param predictions [[DataFrame]] produced by [[GaussianMixtureModel.transform()]]. + * @param predictionCol Name for column of predicted clusters in `predictions`. + * @param probabilityCol Name for column of predicted probability of each cluster + * in `predictions`. + * @param featuresCol Name for column of features in `predictions`. + * @param k Number of clusters. */ @Since("2.0.0") @Experimental class GaussianMixtureSummary private[clustering] ( - @Since("2.0.0") @transient val predictions: DataFrame, - @Since("2.0.0") val predictionCol: String, + predictions: DataFrame, + predictionCol: String, @Since("2.0.0") val probabilityCol: String, - @Since("2.0.0") val featuresCol: String, - @Since("2.0.0") val k: Int) extends Serializable { - - /** - * Cluster centers of the transformed data. - */ - @Since("2.0.0") - @transient lazy val cluster: DataFrame = predictions.select(predictionCol) + featuresCol: String, + k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) { /** * Probability of each cluster. */ @Since("2.0.0") @transient lazy val probability: DataFrame = predictions.select(probabilityCol) - - /** - * Size of (number of data points in) each cluster. - */ - @Since("2.0.0") - lazy val clusterSizes: Array[Long] = { - val sizes = Array.fill[Long](k)(0) - cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach { - case Row(cluster: Int, count: Long) => sizes(cluster) = count - } - sizes - } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index 0d2405b50068e..05ed3223ae537 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -346,35 +346,15 @@ object KMeans extends DefaultParamsReadable[KMeans] { * :: Experimental :: * Summary of KMeans. * - * @param predictions [[DataFrame]] produced by [[KMeansModel.transform()]] - * @param predictionCol Name for column of predicted clusters in `predictions` - * @param featuresCol Name for column of features in `predictions` - * @param k Number of clusters + * @param predictions [[DataFrame]] produced by [[KMeansModel.transform()]]. + * @param predictionCol Name for column of predicted clusters in `predictions`. + * @param featuresCol Name for column of features in `predictions`. + * @param k Number of clusters. */ @Since("2.0.0") @Experimental class KMeansSummary private[clustering] ( - @Since("2.0.0") @transient val predictions: DataFrame, - @Since("2.0.0") val predictionCol: String, - @Since("2.0.0") val featuresCol: String, - @Since("2.0.0") val k: Int) extends Serializable { - - /** - * Cluster centers of the transformed data. - */ - @Since("2.0.0") - @transient lazy val cluster: DataFrame = predictions.select(predictionCol) - - /** - * Size of (number of data points in) each cluster. 
- */ - @Since("2.0.0") - lazy val clusterSizes: Array[Long] = { - val sizes = Array.fill[Long](k)(0) - cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach { - case Row(cluster: Int, count: Long) => sizes(cluster) = count - } - sizes - } - -} + predictions: DataFrame, + predictionCol: String, + featuresCol: String, + k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) From fb0a8a8dd7e8985676a846684b956e2d988875c6 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Wed, 26 Oct 2016 13:26:43 -0700 Subject: [PATCH 129/162] [SPARK-17961][SPARKR][SQL] Add storageLevel to DataFrame for SparkR ## What changes were proposed in this pull request? Add storageLevel to DataFrame for SparkR. This is similar to this RP: https://github.com/apache/spark/pull/13780 but in R I do not make a class for `StorageLevel` but add a method `storageToString` ## How was this patch tested? test added. Author: WeichenXu Closes #15516 from WeichenXu123/storageLevel_df_r. --- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 28 +++++++++++++++- R/pkg/R/RDD.R | 2 +- R/pkg/R/generics.R | 6 +++- R/pkg/R/utils.R | 41 +++++++++++++++++++++++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 5 ++- 6 files changed, 79 insertions(+), 4 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 87181851714e0..eb314f471893b 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -124,6 +124,7 @@ exportMethods("arrange", "selectExpr", "show", "showDF", + "storageLevel", "subset", "summarize", "summary", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index b6ce838969a44..be34e4b32f6f9 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -633,7 +633,7 @@ setMethod("persist", #' @param ... further arguments to be passed to or from other methods. #' #' @family SparkDataFrame functions -#' @rdname unpersist-methods +#' @rdname unpersist #' @aliases unpersist,SparkDataFrame-method #' @name unpersist #' @export @@ -654,6 +654,32 @@ setMethod("unpersist", x }) +#' StorageLevel +#' +#' Get storagelevel of this SparkDataFrame. +#' +#' @param x the SparkDataFrame to get the storageLevel. +#' +#' @family SparkDataFrame functions +#' @rdname storageLevel +#' @aliases storageLevel,SparkDataFrame-method +#' @name storageLevel +#' @export +#' @examples +#'\dontrun{ +#' sparkR.session() +#' path <- "path/to/file.json" +#' df <- read.json(path) +#' persist(df, "MEMORY_AND_DISK") +#' storageLevel(df) +#'} +#' @note storageLevel since 2.1.0 +setMethod("storageLevel", + signature(x = "SparkDataFrame"), + function(x) { + storageLevelToString(callJMethod(x@sdf, "storageLevel")) + }) + #' Repartition #' #' The following options for repartition are possible: diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 6cd0704003f1a..0f1162fec1df9 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -261,7 +261,7 @@ setMethod("persistRDD", #' cache(rdd) # rdd@@env$isCached == TRUE #' unpersistRDD(rdd) # rdd@@env$isCached == FALSE #'} -#' @rdname unpersist-methods +#' @rdname unpersist #' @aliases unpersist,RDD-method #' @noRd setMethod("unpersistRDD", diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 5549cd7cac516..4569fe4890468 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -691,6 +691,10 @@ setGeneric("selectExpr", function(x, expr, ...) { standardGeneric("selectExpr") #' @export setGeneric("showDF", function(x, ...) 
{ standardGeneric("showDF") }) +# @rdname storageLevel +# @export +setGeneric("storageLevel", function(x) { standardGeneric("storageLevel") }) + #' @rdname subset #' @export setGeneric("subset", function(x, ...) { standardGeneric("subset") }) @@ -715,7 +719,7 @@ setGeneric("union", function(x, y) { standardGeneric("union") }) #' @export setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") }) -#' @rdname unpersist-methods +#' @rdname unpersist #' @export setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") }) diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index fa8bb0f79ce80..c4e78cbb804d9 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -385,6 +385,47 @@ getStorageLevel <- function(newLevel = c("DISK_ONLY", "OFF_HEAP" = callJStatic(storageLevelClass, "OFF_HEAP")) } +storageLevelToString <- function(levelObj) { + useDisk <- callJMethod(levelObj, "useDisk") + useMemory <- callJMethod(levelObj, "useMemory") + useOffHeap <- callJMethod(levelObj, "useOffHeap") + deserialized <- callJMethod(levelObj, "deserialized") + replication <- callJMethod(levelObj, "replication") + shortName <- if (!useDisk && !useMemory && !useOffHeap && !deserialized && replication == 1) { + "NONE" + } else if (useDisk && !useMemory && !useOffHeap && !deserialized && replication == 1) { + "DISK_ONLY" + } else if (useDisk && !useMemory && !useOffHeap && !deserialized && replication == 2) { + "DISK_ONLY_2" + } else if (!useDisk && useMemory && !useOffHeap && deserialized && replication == 1) { + "MEMORY_ONLY" + } else if (!useDisk && useMemory && !useOffHeap && deserialized && replication == 2) { + "MEMORY_ONLY_2" + } else if (!useDisk && useMemory && !useOffHeap && !deserialized && replication == 1) { + "MEMORY_ONLY_SER" + } else if (!useDisk && useMemory && !useOffHeap && !deserialized && replication == 2) { + "MEMORY_ONLY_SER_2" + } else if (useDisk && useMemory && !useOffHeap && deserialized && replication == 1) { + "MEMORY_AND_DISK" + } else if (useDisk && useMemory && !useOffHeap && deserialized && replication == 2) { + "MEMORY_AND_DISK_2" + } else if (useDisk && useMemory && !useOffHeap && !deserialized && replication == 1) { + "MEMORY_AND_DISK_SER" + } else if (useDisk && useMemory && !useOffHeap && !deserialized && replication == 2) { + "MEMORY_AND_DISK_SER_2" + } else if (useDisk && useMemory && useOffHeap && !deserialized && replication == 1) { + "OFF_HEAP" + } else { + NULL + } + fullInfo <- callJMethod(levelObj, "toString") + if (is.null(shortName)) { + fullInfo + } else { + paste(shortName, "-", fullInfo) + } +} + # Utility function for functions where an argument needs to be integer but we want to allow # the user to type (for example) `5` instead of `5L` to avoid a confusing error message. 
numToInt <- function(num) { diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index e77dbde44ee66..9289db57b6d63 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -796,7 +796,7 @@ test_that("multiple pipeline transformations result in an RDD with the correct v expect_false(collectRDD(second)[[3]]$testCol) }) -test_that("cache(), persist(), and unpersist() on a DataFrame", { +test_that("cache(), storageLevel(), persist(), and unpersist() on a DataFrame", { df <- read.json(jsonPath) expect_false(df@env$isCached) cache(df) @@ -808,6 +808,9 @@ test_that("cache(), persist(), and unpersist() on a DataFrame", { persist(df, "MEMORY_AND_DISK") expect_true(df@env$isCached) + expect_equal(storageLevel(df), + "MEMORY_AND_DISK - StorageLevel(disk, memory, deserialized, 1 replicas)") + unpersist(df) expect_false(df@env$isCached) From dcdda19785a272969fb1e3ec18382403aaad6c91 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Wed, 26 Oct 2016 13:33:23 -0700 Subject: [PATCH 130/162] [SPARK-14300][DOCS][MLLIB] Scala MLlib examples code merge and clean up ## What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-14300 Duplicated code found in scala/examples/mllib, below all deleted in this PR: - DenseGaussianMixture.scala - StreamingLinearRegression.scala ## delete reasons: #### delete: mllib/DenseGaussianMixture.scala - duplicate of mllib/GaussianMixtureExample #### delete: mllib/StreamingLinearRegression.scala - duplicate of mllib/StreamingLinearRegressionExample When merging and cleaning those code, be sure not disturb the previous example on and off blocks. ## How was this patch tested? Test with `SKIP_API=1 jekyll` manually to make sure that works well. Author: Xin Ren Closes #12195 from keypointt/SPARK-14300. --- .../examples/mllib/DenseGaussianMixture.scala | 75 ------------------- .../mllib/StreamingLinearRegression.scala | 73 ------------------ .../StreamingLinearRegressionExample.scala | 19 +++++ 3 files changed, 19 insertions(+), 148 deletions(-) delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala deleted file mode 100644 index 90b817b23e156..0000000000000 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// scalastyle:off println -package org.apache.spark.examples.mllib - -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.mllib.clustering.GaussianMixture -import org.apache.spark.mllib.linalg.Vectors - -/** - * An example Gaussian Mixture Model EM app. Run with - * {{{ - * ./bin/run-example mllib.DenseGaussianMixture - * }}} - * If you use it as a template to create your own app, please use `spark-submit` to submit your app. - */ -object DenseGaussianMixture { - def main(args: Array[String]): Unit = { - if (args.length < 3) { - println("usage: DenseGmmEM [maxIterations]") - } else { - val maxIterations = if (args.length > 3) args(3).toInt else 100 - run(args(0), args(1).toInt, args(2).toDouble, maxIterations) - } - } - - private def run(inputFile: String, k: Int, convergenceTol: Double, maxIterations: Int) { - val conf = new SparkConf().setAppName("Gaussian Mixture Model EM example") - val ctx = new SparkContext(conf) - - val data = ctx.textFile(inputFile).map { line => - Vectors.dense(line.trim.split(' ').map(_.toDouble)) - }.cache() - - val clusters = new GaussianMixture() - .setK(k) - .setConvergenceTol(convergenceTol) - .setMaxIterations(maxIterations) - .run(data) - - for (i <- 0 until clusters.k) { - println("weight=%f\nmu=%s\nsigma=\n%s\n" format - (clusters.weights(i), clusters.gaussians(i).mu, clusters.gaussians(i).sigma)) - } - - println("The membership value of each vector to all mixture components (first <= 100):") - val membership = clusters.predictSoft(data) - membership.take(100).foreach { x => - print(" " + x.mkString(",")) - } - println() - println("Cluster labels (first <= 100):") - val clusterLabels = clusters.predict(data) - clusterLabels.take(100).foreach { x => - print(" " + x) - } - println() - } -} -// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala deleted file mode 100644 index e5592966f13fa..0000000000000 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.mllib - -import org.apache.spark.SparkConf -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD} -import org.apache.spark.streaming.{Seconds, StreamingContext} - -/** - * Train a linear regression model on one stream of data and make predictions - * on another stream, where the data streams arrive as text files - * into two different directories. 
- * - * The rows of the text files must be labeled data points in the form - * `(y,[x1,x2,x3,...,xn])` - * Where n is the number of features. n must be the same for train and test. - * - * Usage: StreamingLinearRegression - * - * To run on your local machine using the two directories `trainingDir` and `testDir`, - * with updates every 5 seconds, and 2 features per data point, call: - * $ bin/run-example mllib.StreamingLinearRegression trainingDir testDir 5 2 - * - * As you add text files to `trainingDir` the model will continuously update. - * Anytime you add text files to `testDir`, you'll see predictions from the current model. - * - */ -object StreamingLinearRegression { - - def main(args: Array[String]) { - - if (args.length != 4) { - System.err.println( - "Usage: StreamingLinearRegression ") - System.exit(1) - } - - val conf = new SparkConf().setMaster("local").setAppName("StreamingLinearRegression") - val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) - - val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse) - val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) - - val model = new StreamingLinearRegressionWithSGD() - .setInitialWeights(Vectors.zeros(args(3).toInt)) - - model.trainOn(trainingData) - model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() - - ssc.start() - ssc.awaitTermination() - - } - -} -// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala index 0a1cd2d62d5b5..2ba1a62e450ee 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala @@ -26,6 +26,25 @@ import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD // $example off$ import org.apache.spark.streaming._ +/** + * Train a linear regression model on one stream of data and make predictions + * on another stream, where the data streams arrive as text files + * into two different directories. + * + * The rows of the text files must be labeled data points in the form + * `(y,[x1,x2,x3,...,xn])` + * Where n is the number of features. n must be the same for train and test. + * + * Usage: StreamingLinearRegressionExample + * + * To run on your local machine using the two directories `trainingDir` and `testDir`, + * with updates every 5 seconds, and 2 features per data point, call: + * $ bin/run-example mllib.StreamingLinearRegressionExample trainingDir testDir + * + * As you add text files to `trainingDir` the model will continuously update. + * Anytime you add text files to `testDir`, you'll see predictions from the current model. + * + */ object StreamingLinearRegressionExample { def main(args: Array[String]): Unit = { From 5b7d403c1819c32a6a5b87d470f8de1a8ad7a987 Mon Sep 17 00:00:00 2001 From: jiangxingbo Date: Wed, 26 Oct 2016 23:51:16 +0200 Subject: [PATCH 131/162] [SPARK-18094][SQL][TESTS] Move group analytics test cases from `SQLQuerySuite` into a query file test. ## What changes were proposed in this pull request? Currently we have several test cases for group analytics(ROLLUP/CUBE/GROUPING SETS) in `SQLQuerySuite`, should better move them into a query file test. 
The following test cases are moved to `group-analytics.sql`: ``` test("rollup") test("grouping sets when aggregate functions containing groupBy columns") test("cube") test("grouping sets") test("grouping and grouping_id") test("grouping and grouping_id in having") test("grouping and grouping_id in sort") ``` This is followup work of #15582 ## How was this patch tested? Modified query file `group-analytics.sql`, which will be tested by `SQLQueryTestSuite`. Author: jiangxingbo Closes #15624 from jiangxb1987/group-analytics-test. --- .../sql-tests/inputs/group-analytics.sql | 46 +++- .../sql-tests/results/group-analytics.sql.out | 247 +++++++++++++++++- .../org/apache/spark/sql/SQLQuerySuite.scala | 189 -------------- 3 files changed, 290 insertions(+), 192 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql index 2f783495ddf96..f8135389a9e5a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql @@ -10,4 +10,48 @@ SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH CUBE; -- ROLLUP on overlapping columns SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP; -SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH ROLLUP; \ No newline at end of file +SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH ROLLUP; + +CREATE OR REPLACE TEMPORARY VIEW courseSales AS SELECT * FROM VALUES +("dotNET", 2012, 10000), ("Java", 2012, 20000), ("dotNET", 2012, 5000), ("dotNET", 2013, 48000), ("Java", 2013, 30000) +AS courseSales(course, year, earnings); + +-- ROLLUP +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY course, year; + +-- CUBE +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, year; + +-- GROUPING SETS +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course, year); +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course); +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(year); + +-- GROUPING SETS with aggregate functions containing groupBy columns +SELECT course, SUM(earnings) AS sum FROM courseSales +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, sum; +SELECT course, SUM(earnings) AS sum, GROUPING_ID(course, earnings) FROM courseSales +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, sum; + +-- GROUPING/GROUPING_ID +SELECT course, year, GROUPING(course), GROUPING(year), GROUPING_ID(course, year) FROM courseSales +GROUP BY CUBE(course, year); +SELECT course, year, GROUPING(course) FROM courseSales GROUP BY course, year; +SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY course, year; +SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year); + +-- GROUPING/GROUPING_ID in having clause +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) +HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0; +SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING(course) > 0; +SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0; +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0; + +-- GROUPING/GROUPING_ID in orderBy clause +SELECT course, year, 
GROUPING(course), GROUPING(year) FROM courseSales GROUP BY CUBE(course, year) +ORDER BY GROUPING(course), GROUPING(year), course, year; +SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY CUBE(course, year) +ORDER BY GROUPING(course), GROUPING(year), course, year; +SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING(course); +SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING_ID(course); +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out index 8ea7de809d19d..825e8f5488c8b 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 5 +-- Number of queries: 26 -- !query 0 @@ -32,7 +32,6 @@ NULL 2 0 NULL NULL 3 - -- !query 2 SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH CUBE -- !query 2 schema @@ -85,3 +84,247 @@ struct 3 2 2 3 NULL 3 NULL NULL 9 + + +-- !query 5 +CREATE OR REPLACE TEMPORARY VIEW courseSales AS SELECT * FROM VALUES +("dotNET", 2012, 10000), ("Java", 2012, 20000), ("dotNET", 2012, 5000), ("dotNET", 2013, 48000), ("Java", 2013, 30000) +AS courseSales(course, year, earnings) +-- !query 5 schema +struct<> +-- !query 5 output + + + +-- !query 6 +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY course, year +-- !query 6 schema +struct +-- !query 6 output +NULL NULL 113000 +Java NULL 50000 +Java 2012 20000 +Java 2013 30000 +dotNET NULL 63000 +dotNET 2012 15000 +dotNET 2013 48000 + + +-- !query 7 +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, year +-- !query 7 schema +struct +-- !query 7 output +NULL NULL 113000 +NULL 2012 35000 +NULL 2013 78000 +Java NULL 50000 +Java 2012 20000 +Java 2013 30000 +dotNET NULL 63000 +dotNET 2012 15000 +dotNET 2013 48000 + + +-- !query 8 +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course, year) +-- !query 8 schema +struct +-- !query 8 output +Java NULL 50000 +NULL 2012 35000 +NULL 2013 78000 +dotNET NULL 63000 + + +-- !query 9 +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course) +-- !query 9 schema +struct +-- !query 9 output +Java NULL 50000 +dotNET NULL 63000 + + +-- !query 10 +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(year) +-- !query 10 schema +struct +-- !query 10 output +NULL 2012 35000 +NULL 2013 78000 + + +-- !query 11 +SELECT course, SUM(earnings) AS sum FROM courseSales +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, sum +-- !query 11 schema +struct +-- !query 11 output +NULL 113000 +Java 20000 +Java 30000 +Java 50000 +dotNET 5000 +dotNET 10000 +dotNET 48000 +dotNET 63000 + + +-- !query 12 +SELECT course, SUM(earnings) AS sum, GROUPING_ID(course, earnings) FROM courseSales +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, sum +-- !query 12 schema +struct +-- !query 12 output +NULL 113000 3 +Java 20000 0 +Java 30000 0 +Java 50000 1 +dotNET 5000 0 +dotNET 10000 0 +dotNET 48000 0 +dotNET 63000 1 + + +-- !query 13 +SELECT course, year, GROUPING(course), 
GROUPING(year), GROUPING_ID(course, year) FROM courseSales +GROUP BY CUBE(course, year) +-- !query 13 schema +struct +-- !query 13 output +Java 2012 0 0 0 +Java 2013 0 0 0 +Java NULL 0 1 1 +NULL 2012 1 0 2 +NULL 2013 1 0 2 +NULL NULL 1 1 3 +dotNET 2012 0 0 0 +dotNET 2013 0 0 0 +dotNET NULL 0 1 1 + + +-- !query 14 +SELECT course, year, GROUPING(course) FROM courseSales GROUP BY course, year +-- !query 14 schema +struct<> +-- !query 14 output +org.apache.spark.sql.AnalysisException +grouping() can only be used with GroupingSets/Cube/Rollup; + + +-- !query 15 +SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY course, year +-- !query 15 schema +struct<> +-- !query 15 output +org.apache.spark.sql.AnalysisException +grouping_id() can only be used with GroupingSets/Cube/Rollup; + + +-- !query 16 +SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year) +-- !query 16 schema +struct<> +-- !query 16 output +org.apache.spark.sql.AnalysisException +grouping__id is deprecated; use grouping_id() instead; + + +-- !query 17 +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) +HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 +-- !query 17 schema +struct +-- !query 17 output +Java NULL +NULL NULL +dotNET NULL + + +-- !query 18 +SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING(course) > 0 +-- !query 18 schema +struct<> +-- !query 18 output +org.apache.spark.sql.AnalysisException +grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; + + +-- !query 19 +SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0 +-- !query 19 schema +struct<> +-- !query 19 output +org.apache.spark.sql.AnalysisException +grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; + + +-- !query 20 +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0 +-- !query 20 schema +struct<> +-- !query 20 output +org.apache.spark.sql.AnalysisException +grouping__id is deprecated; use grouping_id() instead; + + +-- !query 21 +SELECT course, year, GROUPING(course), GROUPING(year) FROM courseSales GROUP BY CUBE(course, year) +ORDER BY GROUPING(course), GROUPING(year), course, year +-- !query 21 schema +struct +-- !query 21 output +Java 2012 0 0 +Java 2013 0 0 +dotNET 2012 0 0 +dotNET 2013 0 0 +Java NULL 0 1 +dotNET NULL 0 1 +NULL 2012 1 0 +NULL 2013 1 0 +NULL NULL 1 1 + + +-- !query 22 +SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY CUBE(course, year) +ORDER BY GROUPING(course), GROUPING(year), course, year +-- !query 22 schema +struct +-- !query 22 output +Java 2012 0 +Java 2013 0 +dotNET 2012 0 +dotNET 2013 0 +Java NULL 1 +dotNET NULL 1 +NULL 2012 2 +NULL 2013 2 +NULL NULL 3 + + +-- !query 23 +SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING(course) +-- !query 23 schema +struct<> +-- !query 23 output +org.apache.spark.sql.AnalysisException +grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; + + +-- !query 24 +SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING_ID(course) +-- !query 24 schema +struct<> +-- !query 24 output +org.apache.spark.sql.AnalysisException +grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; + + +-- !query 25 +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id +-- !query 25 schema +struct<> +-- !query 25 output +org.apache.spark.sql.AnalysisException +grouping__id is 
deprecated; use grouping_id() instead; diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index bd4c25315c311..1a43d0b2205ca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2005,195 +2005,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Row(false) :: Row(true) :: Nil) } - test("rollup") { - checkAnswer( - sql("select course, year, sum(earnings) from courseSales group by rollup(course, year)" + - " order by course, year"), - Row(null, null, 113000.0) :: - Row("Java", null, 50000.0) :: - Row("Java", 2012, 20000.0) :: - Row("Java", 2013, 30000.0) :: - Row("dotNET", null, 63000.0) :: - Row("dotNET", 2012, 15000.0) :: - Row("dotNET", 2013, 48000.0) :: Nil - ) - } - - test("grouping sets when aggregate functions containing groupBy columns") { - checkAnswer( - sql("select course, sum(earnings) as sum from courseSales group by course, earnings " + - "grouping sets((), (course), (course, earnings)) " + - "order by course, sum"), - Row(null, 113000.0) :: - Row("Java", 20000.0) :: - Row("Java", 30000.0) :: - Row("Java", 50000.0) :: - Row("dotNET", 5000.0) :: - Row("dotNET", 10000.0) :: - Row("dotNET", 48000.0) :: - Row("dotNET", 63000.0) :: Nil - ) - - checkAnswer( - sql("select course, sum(earnings) as sum, grouping_id(course, earnings) from courseSales " + - "group by course, earnings grouping sets((), (course), (course, earnings)) " + - "order by course, sum"), - Row(null, 113000.0, 3) :: - Row("Java", 20000.0, 0) :: - Row("Java", 30000.0, 0) :: - Row("Java", 50000.0, 1) :: - Row("dotNET", 5000.0, 0) :: - Row("dotNET", 10000.0, 0) :: - Row("dotNET", 48000.0, 0) :: - Row("dotNET", 63000.0, 1) :: Nil - ) - } - - test("cube") { - checkAnswer( - sql("select course, year, sum(earnings) from courseSales group by cube(course, year)"), - Row("Java", 2012, 20000.0) :: - Row("Java", 2013, 30000.0) :: - Row("Java", null, 50000.0) :: - Row("dotNET", 2012, 15000.0) :: - Row("dotNET", 2013, 48000.0) :: - Row("dotNET", null, 63000.0) :: - Row(null, 2012, 35000.0) :: - Row(null, 2013, 78000.0) :: - Row(null, null, 113000.0) :: Nil - ) - } - - test("grouping sets") { - checkAnswer( - sql("select course, year, sum(earnings) from courseSales group by course, year " + - "grouping sets(course, year)"), - Row("Java", null, 50000.0) :: - Row("dotNET", null, 63000.0) :: - Row(null, 2012, 35000.0) :: - Row(null, 2013, 78000.0) :: Nil - ) - - checkAnswer( - sql("select course, year, sum(earnings) from courseSales group by course, year " + - "grouping sets(course)"), - Row("Java", null, 50000.0) :: - Row("dotNET", null, 63000.0) :: Nil - ) - - checkAnswer( - sql("select course, year, sum(earnings) from courseSales group by course, year " + - "grouping sets(year)"), - Row(null, 2012, 35000.0) :: - Row(null, 2013, 78000.0) :: Nil - ) - } - - test("grouping and grouping_id") { - checkAnswer( - sql("select course, year, grouping(course), grouping(year), grouping_id(course, year)" + - " from courseSales group by cube(course, year)"), - Row("Java", 2012, 0, 0, 0) :: - Row("Java", 2013, 0, 0, 0) :: - Row("Java", null, 0, 1, 1) :: - Row("dotNET", 2012, 0, 0, 0) :: - Row("dotNET", 2013, 0, 0, 0) :: - Row("dotNET", null, 0, 1, 1) :: - Row(null, 2012, 1, 0, 2) :: - Row(null, 2013, 1, 0, 2) :: - Row(null, null, 1, 1, 3) :: Nil - ) - - var error = intercept[AnalysisException] { - sql("select course, year, grouping(course) 
from courseSales group by course, year") - } - assert(error.getMessage contains "grouping() can only be used with GroupingSets/Cube/Rollup") - error = intercept[AnalysisException] { - sql("select course, year, grouping_id(course, year) from courseSales group by course, year") - } - assert(error.getMessage contains "grouping_id() can only be used with GroupingSets/Cube/Rollup") - error = intercept[AnalysisException] { - sql("select course, year, grouping__id from courseSales group by cube(course, year)") - } - assert(error.getMessage contains "grouping__id is deprecated; use grouping_id() instead") - } - - test("grouping and grouping_id in having") { - checkAnswer( - sql("select course, year from courseSales group by cube(course, year)" + - " having grouping(year) = 1 and grouping_id(course, year) > 0"), - Row("Java", null) :: - Row("dotNET", null) :: - Row(null, null) :: Nil - ) - - var error = intercept[AnalysisException] { - sql("select course, year from courseSales group by course, year" + - " having grouping(course) > 0") - } - assert(error.getMessage contains - "grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup") - error = intercept[AnalysisException] { - sql("select course, year from courseSales group by course, year" + - " having grouping_id(course, year) > 0") - } - assert(error.getMessage contains - "grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup") - error = intercept[AnalysisException] { - sql("select course, year from courseSales group by cube(course, year)" + - " having grouping__id > 0") - } - assert(error.getMessage contains "grouping__id is deprecated; use grouping_id() instead") - } - - test("grouping and grouping_id in sort") { - checkAnswer( - sql("select course, year, grouping(course), grouping(year) from courseSales" + - " group by cube(course, year) order by grouping_id(course, year), course, year"), - Row("Java", 2012, 0, 0) :: - Row("Java", 2013, 0, 0) :: - Row("dotNET", 2012, 0, 0) :: - Row("dotNET", 2013, 0, 0) :: - Row("Java", null, 0, 1) :: - Row("dotNET", null, 0, 1) :: - Row(null, 2012, 1, 0) :: - Row(null, 2013, 1, 0) :: - Row(null, null, 1, 1) :: Nil - ) - - checkAnswer( - sql("select course, year, grouping_id(course, year) from courseSales" + - " group by cube(course, year) order by grouping(course), grouping(year), course, year"), - Row("Java", 2012, 0) :: - Row("Java", 2013, 0) :: - Row("dotNET", 2012, 0) :: - Row("dotNET", 2013, 0) :: - Row("Java", null, 1) :: - Row("dotNET", null, 1) :: - Row(null, 2012, 2) :: - Row(null, 2013, 2) :: - Row(null, null, 3) :: Nil - ) - - var error = intercept[AnalysisException] { - sql("select course, year from courseSales group by course, year" + - " order by grouping(course)") - } - assert(error.getMessage contains - "grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup") - error = intercept[AnalysisException] { - sql("select course, year from courseSales group by course, year" + - " order by grouping_id(course, year)") - } - assert(error.getMessage contains - "grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup") - error = intercept[AnalysisException] { - sql("select course, year from courseSales group by cube(course, year)" + - " order by grouping__id") - } - assert(error.getMessage contains "grouping__id is deprecated; use grouping_id() instead") - } - test("filter on a grouping column that is not presented in SELECT") { checkAnswer( sql("select count(1) from (select 1 as a) t group by a having a > 0"), From 
29cea8f332aa3750f8ff7c3b9e705d107278da4b Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Wed, 26 Oct 2016 16:12:55 -0700 Subject: [PATCH 132/162] [SPARK-17157][SPARKR] Add multiclass logistic regression SparkR Wrapper ## What changes were proposed in this pull request? As we discussed in #14818, I added a separate R wrapper spark.logit for logistic regression. This single interface supports both binary and multinomial logistic regression. It also has "predict" and "summary" for binary logistic regression. ## How was this patch tested? New unit tests are added. Author: wm624@hotmail.com Closes #15365 from wangmiao1981/glm. --- R/pkg/NAMESPACE | 3 +- R/pkg/R/generics.R | 4 + R/pkg/R/mllib.R | 192 +++++++++++++++++- R/pkg/inst/tests/testthat/test_mllib.R | 55 +++++ .../ml/r/LogisticRegressionWrapper.scala | 157 ++++++++++++++ .../org/apache/spark/ml/r/RWrappers.scala | 2 + 6 files changed, 410 insertions(+), 3 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index eb314f471893b..7a89c01fee735 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -43,7 +43,8 @@ exportMethods("glm", "spark.isoreg", "spark.gaussianMixture", "spark.als", - "spark.kstest") + "spark.kstest", + "spark.logit") # Job group lifecycle management methods export("setJobGroup", diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 4569fe4890468..107e1c638be71 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1375,6 +1375,10 @@ setGeneric("spark.gaussianMixture", standardGeneric("spark.gaussianMixture") }) +#' @rdname spark.logit +#' @export +setGeneric("spark.logit", function(data, formula, ...) { standardGeneric("spark.logit") }) + #' @param object a fitted ML model object. #' @param path the directory where the model is saved. #' @param ... additional argument(s) passed to the method. diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index bf182be8e23d0..e441db94998bf 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -95,6 +95,13 @@ setClass("ALSModel", representation(jobj = "jobj")) #' @note KSTest since 2.1.0 setClass("KSTest", representation(jobj = "jobj")) +#' S4 class that represents an LogisticRegressionModel +#' +#' @param jobj a Java object reference to the backing Scala LogisticRegressionModel +#' @export +#' @note LogisticRegressionModel since 2.1.0 +setClass("LogisticRegressionModel", representation(jobj = "jobj")) + #' Saves the MLlib model to the input path #' #' Saves the MLlib model to the input path. 
For more information, see the specific @@ -105,7 +112,7 @@ setClass("KSTest", representation(jobj = "jobj")) #' @seealso \link{spark.glm}, \link{glm}, #' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, #' @seealso \link{spark.lda}, \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg} -#' @seealso \link{read.ml} +#' @seealso \link{spark.logit}, \link{read.ml} NULL #' Makes predictions from a MLlib model @@ -117,7 +124,7 @@ NULL #' @export #' @seealso \link{spark.glm}, \link{glm}, #' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, -#' @seealso \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg} +#' @seealso \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg}, \link{spark.logit} NULL write_internal <- function(object, path, overwrite = FALSE) { @@ -647,6 +654,170 @@ setMethod("predict", signature(object = "KMeansModel"), predict_internal(object, newData) }) +#' Logistic Regression Model +#' +#' Fits an logistic regression model against a Spark DataFrame. It supports "binomial": Binary logistic regression +#' with pivoting; "multinomial": Multinomial logistic (softmax) regression without pivoting, similar to glmnet. +#' Users can print, make predictions on the produced model and save the model to the input path. +#' +#' @param data SparkDataFrame for training +#' @param formula A symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', '.', ':', '+', and '-'. +#' @param regParam the regularization parameter. Default is 0.0. +#' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2 penalty. +#' For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0, the penalty is a combination +#' of L1 and L2. Default is 0.0 which is an L2 penalty. +#' @param maxIter maximum iteration number. +#' @param tol convergence tolerance of iterations. +#' @param fitIntercept whether to fit an intercept term. Default is TRUE. +#' @param family the name of family which is a description of the label distribution to be used in the model. +#' Supported options: +#' \itemize{ +#' \item{"auto": Automatically select the family based on the number of classes: +#' If number of classes == 1 || number of classes == 2, set to "binomial". +#' Else, set to "multinomial".} +#' \item{"binomial": Binary logistic regression with pivoting.} +#' \item{"multinomial": Multinomial logistic (softmax) regression without pivoting. +#' Default is "auto".} +#' } +#' @param standardization whether to standardize the training features before fitting the model. The coefficients +#' of models will be always returned on the original scale, so it will be transparent for +#' users. Note that with/without standardization, the models should be always converged +#' to the same solution when no regularization is applied. Default is TRUE, same as glmnet. +#' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of class label 1 +#' is > threshold, then predict 1, else 0. A high threshold encourages the model to predict 0 +#' more often; a low threshold encourages the model to predict 1 more often. Note: Setting this with +#' threshold p is equivalent to setting thresholds c(1-p, p). When threshold is set, any user-set +#' value for thresholds will be cleared. If both threshold and thresholds are set, then they must be +#' equivalent. 
In multiclass (or binary) classification to adjust the probability of +#' predicting each class. Array must have length equal to the number of classes, with values > 0, +#' excepting that at most one value may be 0. The class with largest value p/t is predicted, where p +#' is the original probability of that class and t is the class's threshold. Note: When thresholds +#' is set, any user-set value for threshold will be cleared. If both threshold and thresholds are +#' set, then they must be equivalent. Default is 0.5. +#' @param weightCol The weight column name. +#' @param aggregationDepth depth for treeAggregate (>= 2). If the dimensions of features or the number of partitions +#' are large, this param could be adjusted to a larger size. Default is 2. +#' @param probabilityCol column name for predicted class conditional probabilities. Default is "probability". +#' @param ... additional arguments passed to the method. +#' @return \code{spark.logit} returns a fitted logistic regression model +#' @rdname spark.logit +#' @aliases spark.logit,SparkDataFrame,formula-method +#' @name spark.logit +#' @export +#' @examples +#' \dontrun{ +#' sparkR.session() +#' # binary logistic regression +#' label <- c(1.0, 1.0, 1.0, 0.0, 0.0) +#' feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) +#' binary_data <- as.data.frame(cbind(label, feature)) +#' binary_df <- createDataFrame(binary_data) +#' blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0) +#' blr_predict <- collect(select(predict(blr_model, binary_df), "prediction")) +#' +#' # summary of binary logistic regression +#' blr_summary <- summary(blr_model) +#' blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure")) +#' # save fitted model to input path +#' path <- "path/to/model" +#' write.ml(blr_model, path) +#' +#' # can also read back the saved model and predict +#' Note that summary deos not work on loaded model +#' savedModel <- read.ml(path) +#' blr_predict2 <- collect(select(predict(savedModel, binary_df), "prediction")) +#' +#' # multinomial logistic regression +#' +#' label <- c(0.0, 1.0, 2.0, 0.0, 0.0) +#' feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667) +#' feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987) +#' feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130) +#' feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842) +#' data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4)) +#' df <- createDataFrame(data) +#' +#' Note that summary of multinomial logistic regression is not implemented yet +#' model <- spark.logit(df, label ~ ., family = "multinomial", thresholds=c(0, 1, 1)) +#' predict1 <- collect(select(predict(model, df), "prediction")) +#' } +#' @note spark.logit since 2.1.0 +setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100, + tol = 1E-6, fitIntercept = TRUE, family = "auto", standardization = TRUE, + thresholds = 0.5, weightCol = NULL, aggregationDepth = 2, + probabilityCol = "probability") { + formula <- paste0(deparse(formula), collapse = "") + + if (is.null(weightCol)) { + weightCol <- "" + } + + jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit", + data@sdf, formula, as.numeric(regParam), + as.numeric(elasticNetParam), as.integer(maxIter), + as.numeric(tol), as.logical(fitIntercept), + as.character(family), as.logical(standardization), + 
as.array(thresholds), as.character(weightCol), + as.integer(aggregationDepth), as.character(probabilityCol)) + new("LogisticRegressionModel", jobj = jobj) + }) + +# Predicted values based on an LogisticRegressionModel model + +#' @param newData a SparkDataFrame for testing. +#' @return \code{predict} returns the predicted values based on an LogisticRegressionModel. +#' @rdname spark.logit +#' @aliases predict,LogisticRegressionModel,SparkDataFrame-method +#' @export +#' @note predict(LogisticRegressionModel) since 2.1.0 +setMethod("predict", signature(object = "LogisticRegressionModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Get the summary of an LogisticRegressionModel + +#' @param object an LogisticRegressionModel fitted by \code{spark.logit} +#' @return \code{summary} returns the Binary Logistic regression results of a given model as lists. Note that +#' Multinomial logistic regression summary is not available now. +#' @rdname spark.logit +#' @aliases summary,LogisticRegressionModel-method +#' @export +#' @note summary(LogisticRegressionModel) since 2.1.0 +setMethod("summary", signature(object = "LogisticRegressionModel"), + function(object) { + jobj <- object@jobj + is.loaded <- callJMethod(jobj, "isLoaded") + + if (is.loaded) { + stop("Loaded model doesn't have training summary.") + } + + roc <- dataFrame(callJMethod(jobj, "roc")) + + areaUnderROC <- callJMethod(jobj, "areaUnderROC") + + pr <- dataFrame(callJMethod(jobj, "pr")) + + fMeasureByThreshold <- dataFrame(callJMethod(jobj, "fMeasureByThreshold")) + + precisionByThreshold <- dataFrame(callJMethod(jobj, "precisionByThreshold")) + + recallByThreshold <- dataFrame(callJMethod(jobj, "recallByThreshold")) + + totalIterations <- callJMethod(jobj, "totalIterations") + + objectiveHistory <- callJMethod(jobj, "objectiveHistory") + + list(roc = roc, areaUnderROC = areaUnderROC, pr = pr, + fMeasureByThreshold = fMeasureByThreshold, + precisionByThreshold = precisionByThreshold, + recallByThreshold = recallByThreshold, + totalIterations = totalIterations, objectiveHistory = objectiveHistory) + }) + #' Multilayer Perceptron Classification Model #' #' \code{spark.mlp} fits a multi-layer perceptron neural network model against a SparkDataFrame. @@ -888,6 +1059,21 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char write_internal(object, path, overwrite) }) +# Save fitted LogisticRegressionModel to the input path + +#' @param path The directory where the model is saved +#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE +#' which means throw exception if the output path exists. +#' +#' @rdname spark.logit +#' @aliases write.ml,LogisticRegressionModel,character-method +#' @export +#' @note write.ml(LogisticRegression, character) since 2.1.0 +setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + # Save fitted MLlib model to the input path #' @param path the directory where the model is saved. 
@@ -938,6 +1124,8 @@ read.ml <- function(path) { new("GaussianMixtureModel", jobj = jobj) } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.ALSWrapper")) { new("ALSModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.LogisticRegressionWrapper")) { + new("LogisticRegressionModel", jobj = jobj) } else { stop("Unsupported model: ", jobj) } diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 33cc069f14456..6d1fccc7c0582 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -602,6 +602,61 @@ test_that("spark.isotonicRegression", { unlink(modelPath) }) +test_that("spark.logit", { + # test binary logistic regression + label <- c(1.0, 1.0, 1.0, 0.0, 0.0) + feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) + binary_data <- as.data.frame(cbind(label, feature)) + binary_df <- createDataFrame(binary_data) + + blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0) + blr_predict <- collect(select(predict(blr_model, binary_df), "prediction")) + expect_equal(blr_predict$prediction, c(0, 0, 0, 0, 0)) + blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0) + blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction")) + expect_equal(blr_predict1$prediction, c(1, 1, 1, 1, 1)) + + # test summary of binary logistic regression + blr_summary <- summary(blr_model) + blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure")) + expect_equal(blr_fmeasure$threshold, c(0.8221347, 0.7884005, 0.6674709, 0.3785437, 0.3434487), + tolerance = 1e-4) + expect_equal(blr_fmeasure$"F-Measure", c(0.5000000, 0.8000000, 0.6666667, 0.8571429, 0.7500000), + tolerance = 1e-4) + blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision")) + expect_equal(blr_precision$precision, c(1.0000000, 1.0000000, 0.6666667, 0.7500000, 0.6000000), + tolerance = 1e-4) + blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall")) + expect_equal(blr_recall$recall, c(0.3333333, 0.6666667, 0.6666667, 1.0000000, 1.0000000), + tolerance = 1e-4) + + # test model save and read + modelPath <- tempfile(pattern = "spark-logisticRegression", fileext = ".tmp") + write.ml(blr_model, modelPath) + expect_error(write.ml(blr_model, modelPath)) + write.ml(blr_model, modelPath, overwrite = TRUE) + blr_model2 <- read.ml(modelPath) + blr_predict2 <- collect(select(predict(blr_model2, binary_df), "prediction")) + expect_equal(blr_predict$prediction, blr_predict2$prediction) + expect_error(summary(blr_model2)) + unlink(modelPath) + + # test multinomial logistic regression + label <- c(0.0, 1.0, 2.0, 0.0, 0.0) + feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667) + feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987) + feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130) + feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842) + data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4)) + df <- createDataFrame(data) + + model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1)) + predict1 <- collect(select(predict(model, df), "prediction")) + expect_equal(predict1$prediction, c(0, 0, 0, 0, 0)) + # Summary of multinomial logistic regression is not implemented yet + expect_error(summary(model)) +}) + test_that("spark.gaussianMixture", { # R code to reproduce the result. 
# nolint start diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala new file mode 100644 index 0000000000000..9b352c9863114 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.r + +import org.apache.hadoop.fs.Path +import org.json4s._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.ml.attribute.AttributeGroup +import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel} +import org.apache.spark.ml.feature.RFormula +import org.apache.spark.ml.util._ +import org.apache.spark.sql.{DataFrame, Dataset} + +private[r] class LogisticRegressionWrapper private ( + val pipeline: PipelineModel, + val features: Array[String], + val isLoaded: Boolean = false) extends MLWritable { + + private val logisticRegressionModel: LogisticRegressionModel = + pipeline.stages(1).asInstanceOf[LogisticRegressionModel] + + lazy val totalIterations: Int = logisticRegressionModel.summary.totalIterations + + lazy val objectiveHistory: Array[Double] = logisticRegressionModel.summary.objectiveHistory + + lazy val blrSummary = + logisticRegressionModel.summary.asInstanceOf[BinaryLogisticRegressionSummary] + + lazy val roc: DataFrame = blrSummary.roc + + lazy val areaUnderROC: Double = blrSummary.areaUnderROC + + lazy val pr: DataFrame = blrSummary.pr + + lazy val fMeasureByThreshold: DataFrame = blrSummary.fMeasureByThreshold + + lazy val precisionByThreshold: DataFrame = blrSummary.precisionByThreshold + + lazy val recallByThreshold: DataFrame = blrSummary.recallByThreshold + + def transform(dataset: Dataset[_]): DataFrame = { + pipeline.transform(dataset).drop(logisticRegressionModel.getFeaturesCol) + } + + override def write: MLWriter = new LogisticRegressionWrapper.LogisticRegressionWrapperWriter(this) +} + +private[r] object LogisticRegressionWrapper + extends MLReadable[LogisticRegressionWrapper] { + + def fit( // scalastyle:ignore + data: DataFrame, + formula: String, + regParam: Double, + elasticNetParam: Double, + maxIter: Int, + tol: Double, + fitIntercept: Boolean, + family: String, + standardization: Boolean, + thresholds: Array[Double], + weightCol: String, + aggregationDepth: Int, + probability: String + ): LogisticRegressionWrapper = { + + val rFormula = new RFormula() + .setFormula(formula) + RWrapperUtils.checkDataColumns(rFormula, data) + val rFormulaModel = rFormula.fit(data) + + // get feature names from output schema + val schema = 
rFormulaModel.transform(data).schema + val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol)) + .attributes.get + val features = featureAttrs.map(_.name.get) + + // assemble and fit the pipeline + val logisticRegression = new LogisticRegression() + .setRegParam(regParam) + .setElasticNetParam(elasticNetParam) + .setMaxIter(maxIter) + .setTol(tol) + .setFitIntercept(fitIntercept) + .setFamily(family) + .setStandardization(standardization) + .setWeightCol(weightCol) + .setAggregationDepth(aggregationDepth) + .setFeaturesCol(rFormula.getFeaturesCol) + .setProbabilityCol(probability) + + if (thresholds.length > 1) { + logisticRegression.setThresholds(thresholds) + } else { + logisticRegression.setThreshold(thresholds(0)) + } + + val pipeline = new Pipeline() + .setStages(Array(rFormulaModel, logisticRegression)) + .fit(data) + + new LogisticRegressionWrapper(pipeline, features) + } + + override def read: MLReader[LogisticRegressionWrapper] = new LogisticRegressionWrapperReader + + override def load(path: String): LogisticRegressionWrapper = super.load(path) + + class LogisticRegressionWrapperWriter(instance: LogisticRegressionWrapper) extends MLWriter { + + override protected def saveImpl(path: String): Unit = { + val rMetadataPath = new Path(path, "rMetadata").toString + val pipelinePath = new Path(path, "pipeline").toString + + val rMetadata = ("class" -> instance.getClass.getName) ~ + ("features" -> instance.features.toSeq) + val rMetadataJson: String = compact(render(rMetadata)) + sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath) + + instance.pipeline.save(pipelinePath) + } + } + + class LogisticRegressionWrapperReader extends MLReader[LogisticRegressionWrapper] { + + override def load(path: String): LogisticRegressionWrapper = { + implicit val format = DefaultFormats + val rMetadataPath = new Path(path, "rMetadata").toString + val pipelinePath = new Path(path, "pipeline").toString + + val rMetadataStr = sc.textFile(rMetadataPath, 1).first() + val rMetadata = parse(rMetadataStr) + val features = (rMetadata \ "features").extract[Array[String]] + + val pipeline = PipelineModel.load(pipelinePath) + new LogisticRegressionWrapper(pipeline, features, isLoaded = true) + } + } +} \ No newline at end of file diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/RWrappers.scala b/mllib/src/main/scala/org/apache/spark/ml/r/RWrappers.scala index d64de1b6abb63..1df3662a5822b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/RWrappers.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/RWrappers.scala @@ -54,6 +54,8 @@ private[r] object RWrappers extends MLReader[Object] { GaussianMixtureWrapper.load(path) case "org.apache.spark.ml.r.ALSWrapper" => ALSWrapper.load(path) + case "org.apache.spark.ml.r.LogisticRegressionWrapper" => + LogisticRegressionWrapper.load(path) case _ => throw new SparkException(s"SparkR read.ml does not support load $className") } From a76846cfb1c2d6c8f4d647426030b59de20d9433 Mon Sep 17 00:00:00 2001 From: Miao Wang Date: Thu, 27 Oct 2016 01:17:32 +0200 Subject: [PATCH 133/162] [SPARK-18126][SPARK-CORE] getIteratorZipWithIndex accepts negative value as index ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) `Utils.getIteratorZipWithIndex` was added to deal with number of records > 2147483647 in one partition. method `getIteratorZipWithIndex` accepts `startIndex` < 0, which leads to negative index. 
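A standalone sketch of the failure mode follows; it re-implements the same indexing logic outside Spark purely for illustration and is not Spark's `Utils` code:

```scala
// Minimal reproduction of the pre-fix behavior: the index starts at startIndex - 1 and is
// incremented before each element is returned, so a negative startIndex yields negative indices.
def zipWithIndexFrom[T](it: Iterator[T], startIndex: Long): Iterator[(T, Long)] = {
  var index = startIndex - 1L
  it.map { t => index += 1; (t, index) }
}

zipWithIndexFrom(Iterator("a", "b", "c"), -1L).toArray
// => Array(("a", -1L), ("b", 0L), ("c", 1L))  -- indices go negative when startIndex < 0
```
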
This PR just adds a defensive check on `startIndex` to make sure it is >= 0. ## How was this patch tested? Add a new unit test. Author: Miao Wang Closes #15639 from wangmiao1981/zip. --- core/src/main/scala/org/apache/spark/util/Utils.scala | 1 + core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 3 +++ 2 files changed, 4 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index e57eb0de2689f..6027b07c0fee8 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1765,6 +1765,7 @@ private[spark] object Utils extends Logging { */ def getIteratorZipWithIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] = { new Iterator[(T, Long)] { + require(startIndex >= 0, "startIndex should be >= 0.") var index: Long = startIndex - 1L def hasNext: Boolean = iterator.hasNext def next(): (T, Long) = { diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index aeb2969fd579e..15ef32f21d90c 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -401,6 +401,9 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(iterator.toArray === Array( (0, -1L + Int.MaxValue), (1, 0L + Int.MaxValue), (2, 1L + Int.MaxValue) )) + intercept[IllegalArgumentException] { + Utils.getIteratorZipWithIndex(Iterator(0, 1, 2), -1L) + } } test("doesDirectoryContainFilesNewerThan") { From 5b27598ff50cb08e7570fade458da0a3d4d4eabc Mon Sep 17 00:00:00 2001 From: frreiss Date: Wed, 26 Oct 2016 17:33:08 -0700 Subject: [PATCH 134/162] [SPARK-16963][STREAMING][SQL] Changes to Source trait and related implementation classes ## What changes were proposed in this pull request? This PR contains changes to the Source trait such that the scheduler can notify data sources when it is safe to discard buffered data. Summary of changes: * Added a method `commit(end: Offset)` that tells the Source that is OK to discard all offsets up `end`, inclusive. * Changed the semantics of a `None` value for the `getBatch` method to mean "from the very beginning of the stream"; as opposed to "all data present in the Source's buffer". * Added notes that the upper layers of the system will never call `getBatch` with a start value less than the last value passed to `commit`. * Added a `lastCommittedOffset` method to allow the scheduler to query the status of each Source on restart. This addition is not strictly necessary, but it seemed like a good idea -- Sources will be maintaining their own persistent state, and there may be bugs in the checkpointing code. * The scheduler in `StreamExecution.scala` now calls `commit` on its stream sources after marking each batch as complete in its checkpoint. * `MemoryStream` now cleans committed batches out of its internal buffer. * `TextSocketSource` now cleans committed batches from its internal buffer. ## How was this patch tested? Existing regression tests already exercise the new code. Author: frreiss Closes #14553 from frreiss/fred-16963. 
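The buffering pattern that the updated sources follow can be summarized with a small sketch. This is a toy in-memory source using plain `Long` offsets in place of `LongOffset`; it is illustrative only and is not part of this patch, but it mirrors the `MemoryStream` and `TextSocketSource` changes in the diff below: keep only data the scheduler has not yet durably recorded, and drop everything up to the committed offset.

```scala
import scala.collection.mutable.ListBuffer

// Toy illustration of the commit() contract added to Source.
class ToyBufferingSource[T] {
  private val batches = new ListBuffer[T]()   // holds batches (lastCommitted + 1) .. currentOffset
  private var currentOffset = -1L
  private var lastCommitted = -1L

  def add(batch: T): Unit = synchronized {
    batches += batch
    currentOffset += 1
  }

  // None means the source has never received data, matching the clarified getOffset semantics.
  def getOffset: Option[Long] = synchronized {
    if (currentOffset < 0) None else Some(currentOffset)
  }

  // getBatch(start, end]: translate logical offsets into positions in the trimmed buffer.
  def getBatch(start: Option[Long], end: Long): Seq[T] = synchronized {
    val startOrdinal = start.getOrElse(-1L) + 1
    batches.slice((startOrdinal - lastCommitted - 1).toInt, (end - lastCommitted).toInt).toList
  }

  // commit(end): the scheduler has checkpointed everything up to `end`, so those batches
  // can safely be discarded from the buffer.
  def commit(end: Long): Unit = synchronized {
    require(end >= lastCommitted, s"Offsets committed out of order: $lastCommitted then $end")
    batches.trimStart((end - lastCommitted).toInt)
    lastCommitted = end
  }
}

// Example sequence:
//   val s = new ToyBufferingSource[String]
//   s.add("b0"); s.add("b1"); s.add("b2")   // offsets 0, 1, 2
//   s.getBatch(None, 2)                     // Seq(b0, b1, b2)
//   s.commit(1)                             // b0 and b1 are dropped from the buffer
//   s.getBatch(Some(1), 2)                  // Seq(b2) -- still answerable after the trim
```
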
--- .../streaming/FileStreamSource.scala | 9 +++ .../sql/execution/streaming/Source.scala | 22 ++++-- .../execution/streaming/StreamExecution.scala | 32 ++++++--- .../sql/execution/streaming/memory.scala | 47 ++++++++++-- .../sql/execution/streaming/socket.scala | 72 +++++++++++++++---- .../sql/streaming/StreamingQuerySuite.scala | 8 +-- 6 files changed, 154 insertions(+), 36 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala index 115edf7ab2b61..a392b82999021 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala @@ -176,6 +176,15 @@ class FileStreamSource( override def toString: String = s"FileStreamSource[$qualifiedBasePath]" + /** + * Informs the source that Spark has completed processing all data for offsets less than or + * equal to `end` and will only request offsets greater than `end` in the future. + */ + override def commit(end: Offset): Unit = { + // No-op for now; FileStreamSource currently garbage-collects files based on timestamp + // and the value of the maxFileAge parameter. + } + override def stop() {} } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala index 971147840d2fd..f3bd5bfe23fdf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala @@ -30,16 +30,30 @@ trait Source { /** Returns the schema of the data from this source */ def schema: StructType - /** Returns the maximum available offset for this source. */ + /** + * Returns the maximum available offset for this source. + * Returns `None` if this source has never received any data. + */ def getOffset: Option[Offset] /** - * Returns the data that is between the offsets (`start`, `end`]. When `start` is `None` then - * the batch should begin with the first available record. This method must always return the - * same data for a particular `start` and `end` pair. + * Returns the data that is between the offsets (`start`, `end`]. When `start` is `None`, + * then the batch should begin with the first record. This method must always return the + * same data for a particular `start` and `end` pair; even after the Source has been restarted + * on a different node. + * + * Higher layers will always call this method with a value of `start` greater than or equal + * to the last value passed to `commit` and a value of `end` less than or equal to the + * last value returned by `getOffset` */ def getBatch(start: Option[Offset], end: Offset): DataFrame + /** + * Informs the source that Spark has completed processing all data for offsets less than or + * equal to `end` and will only request offsets greater than `end` in the future. + */ + def commit(end: Offset) : Unit = {} + /** Stop this source and free any resources it has allocated. 
*/ def stop(): Unit } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index ba8cf808e339c..37af1a550aaf1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -73,6 +73,9 @@ class StreamExecution( /** * Tracks how much data we have processed and committed to the sink or state store from each * input source. + * Only the scheduler thread should modify this field, and only in atomic steps. + * Other threads should make a shallow copy if they are going to access this field more than + * once, since the field's value may change at any time. */ @volatile var committedOffsets = new StreamProgress @@ -80,6 +83,9 @@ class StreamExecution( /** * Tracks the offsets that are available to be processed, but have not yet be committed to the * sink. + * Only the scheduler thread should modify this field, and only in atomic steps. + * Other threads should make a shallow copy if they are going to access this field more than + * once, since the field's value may change at any time. */ @volatile private var availableOffsets = new StreamProgress @@ -337,17 +343,27 @@ class StreamExecution( } if (hasNewData) { reportTimeTaken(OFFSET_WAL_WRITE_LATENCY) { - assert( - offsetLog.add(currentBatchId, availableOffsets.toCompositeOffset(sources)), + assert(offsetLog.add(currentBatchId, availableOffsets.toCompositeOffset(sources)), s"Concurrent update to the log. Multiple streaming jobs detected for $currentBatchId") logInfo(s"Committed offsets for batch $currentBatchId.") + // NOTE: The following code is correct because runBatches() processes exactly one + // batch at a time. If we add pipeline parallelism (multiple batches in flight at + // the same time), this cleanup logic will need to change. + + // Now that we've updated the scheduler's persistent checkpoint, it is safe for the + // sources to discard data from the previous batch. + val prevBatchOff = offsetLog.get(currentBatchId - 1) + if (prevBatchOff.isDefined) { + prevBatchOff.get.toStreamProgress(sources).foreach { + case (src, off) => src.commit(off) + } + } + // Now that we have logged the new batch, no further processing will happen for - // the previous batch, and it is safe to discard the old metadata. - // Note that purge is exclusive, i.e. it purges everything before currentBatchId. - // NOTE: If StreamExecution implements pipeline parallelism (multiple batches in - // flight at the same time), this cleanup logic will need to change. - offsetLog.purge(currentBatchId) + // the batch before the previous batch, and it is safe to discard the old metadata. + // Note that purge is exclusive, i.e. it purges everything before the target ID. + offsetLog.purge(currentBatchId - 1) } } else { awaitBatchLock.lock() @@ -455,7 +471,7 @@ class StreamExecution( /** * Blocks the current thread until processing for data from the given `source` has reached at - * least the given `Offset`. This method is indented for use primarily when writing tests. + * least the given `Offset`. This method is intended for use primarily when writing tests. 
*/ private[sql] def awaitOffset(source: Source, newOffset: Offset): Unit = { def notDone = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index 788fcd0361bee..48d9791faf1e9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.streaming import java.util.concurrent.atomic.AtomicInteger import javax.annotation.concurrent.GuardedBy -import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.util.control.NonFatal import org.apache.spark.internal.Logging @@ -51,12 +51,23 @@ case class MemoryStream[A : Encoder](id: Int, sqlContext: SQLContext) protected val logicalPlan = StreamingExecutionRelation(this) protected val output = logicalPlan.output + /** + * All batches from `lastCommittedOffset + 1` to `currentOffset`, inclusive. + * Stored in a ListBuffer to facilitate removing committed batches. + */ @GuardedBy("this") - protected val batches = new ArrayBuffer[Dataset[A]] + protected val batches = new ListBuffer[Dataset[A]] @GuardedBy("this") protected var currentOffset: LongOffset = new LongOffset(-1) + /** + * Last offset that was discarded, or -1 if no commits have occurred. Note that the value + * -1 is used in calculations below and isn't just an arbitrary constant. + */ + @GuardedBy("this") + protected var lastOffsetCommitted : LongOffset = new LongOffset(-1) + def schema: StructType = encoder.schema def toDS()(implicit sqlContext: SQLContext): Dataset[A] = { @@ -85,21 +96,25 @@ case class MemoryStream[A : Encoder](id: Int, sqlContext: SQLContext) override def toString: String = s"MemoryStream[${Utils.truncatedString(output, ",")}]" override def getOffset: Option[Offset] = synchronized { - if (batches.isEmpty) { + if (currentOffset.offset == -1) { None } else { Some(currentOffset) } } - /** - * Returns the data that is between the offsets (`start`, `end`]. - */ override def getBatch(start: Option[Offset], end: Offset): DataFrame = { + // Compute the internal batch numbers to fetch: [startOrdinal, endOrdinal) val startOrdinal = start.map(_.asInstanceOf[LongOffset]).getOrElse(LongOffset(-1)).offset.toInt + 1 val endOrdinal = end.asInstanceOf[LongOffset].offset.toInt + 1 - val newBlocks = synchronized { batches.slice(startOrdinal, endOrdinal) } + + // Internal buffer only holds the batches after lastCommittedOffset. 
+ val newBlocks = synchronized { + val sliceStart = startOrdinal - lastOffsetCommitted.offset.toInt - 1 + val sliceEnd = endOrdinal - lastOffsetCommitted.offset.toInt - 1 + batches.slice(sliceStart, sliceEnd) + } logDebug( s"MemoryBatch [$startOrdinal, $endOrdinal]: ${newBlocks.flatMap(_.collect()).mkString(", ")}") @@ -111,11 +126,29 @@ case class MemoryStream[A : Encoder](id: Int, sqlContext: SQLContext) } } + override def commit(end: Offset): Unit = synchronized { + end match { + case newOffset: LongOffset => + val offsetDiff = (newOffset.offset - lastOffsetCommitted.offset).toInt + + if (offsetDiff < 0) { + sys.error(s"Offsets committed out of order: $lastOffsetCommitted followed by $end") + } + + batches.trimStart(offsetDiff) + lastOffsetCommitted = newOffset + case _ => + sys.error(s"MemoryStream.commit() received an offset ($end) that did not originate with " + + "an instance of this class") + } + } + override def stop() {} def reset(): Unit = synchronized { batches.clear() currentOffset = new LongOffset(-1) + lastOffsetCommitted = new LongOffset(-1) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala index fb15239f9af98..c662e7c6bc775 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala @@ -24,14 +24,15 @@ import java.text.SimpleDateFormat import java.util.Calendar import javax.annotation.concurrent.GuardedBy -import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.ListBuffer import scala.util.{Failure, Success, Try} import org.apache.spark.internal.Logging -import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext} +import org.apache.spark.sql._ import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType} + object TextSocketSource { val SCHEMA_REGULAR = StructType(StructField("value", StringType) :: Nil) val SCHEMA_TIMESTAMP = StructType(StructField("value", StringType) :: @@ -53,8 +54,18 @@ class TextSocketSource(host: String, port: Int, includeTimestamp: Boolean, sqlCo @GuardedBy("this") private var readThread: Thread = null + /** + * All batches from `lastCommittedOffset + 1` to `currentOffset`, inclusive. + * Stored in a ListBuffer to facilitate removing committed batches. + */ + @GuardedBy("this") + protected val batches = new ListBuffer[(String, Timestamp)] + + @GuardedBy("this") + protected var currentOffset: LongOffset = new LongOffset(-1) + @GuardedBy("this") - private var lines = new ArrayBuffer[(String, Timestamp)] + protected var lastOffsetCommitted : LongOffset = new LongOffset(-1) initialize() @@ -74,10 +85,12 @@ class TextSocketSource(host: String, port: Int, includeTimestamp: Boolean, sqlCo return } TextSocketSource.this.synchronized { - lines += ((line, + val newData = (line, Timestamp.valueOf( TextSocketSource.DATE_FORMAT.format(Calendar.getInstance().getTime())) - )) + ) + currentOffset = currentOffset + 1 + batches.append(newData) } } } catch { @@ -92,21 +105,54 @@ class TextSocketSource(host: String, port: Int, includeTimestamp: Boolean, sqlCo override def schema: StructType = if (includeTimestamp) TextSocketSource.SCHEMA_TIMESTAMP else TextSocketSource.SCHEMA_REGULAR - /** Returns the maximum available offset for this source. 
*/ override def getOffset: Option[Offset] = synchronized { - if (lines.isEmpty) None else Some(LongOffset(lines.size - 1)) + if (currentOffset.offset == -1) { + None + } else { + Some(currentOffset) + } } /** Returns the data that is between the offsets (`start`, `end`]. */ override def getBatch(start: Option[Offset], end: Offset): DataFrame = synchronized { - val startIdx = start.map(_.asInstanceOf[LongOffset].offset.toInt + 1).getOrElse(0) - val endIdx = end.asInstanceOf[LongOffset].offset.toInt + 1 - val data = synchronized { lines.slice(startIdx, endIdx) } + val startOrdinal = + start.map(_.asInstanceOf[LongOffset]).getOrElse(LongOffset(-1)).offset.toInt + 1 + val endOrdinal = end.asInstanceOf[LongOffset].offset.toInt + 1 + + // Internal buffer only holds the batches after lastOffsetCommitted + val rawList = synchronized { + val sliceStart = startOrdinal - lastOffsetCommitted.offset.toInt - 1 + val sliceEnd = endOrdinal - lastOffsetCommitted.offset.toInt - 1 + batches.slice(sliceStart, sliceEnd) + } + import sqlContext.implicits._ + val rawBatch = sqlContext.createDataset(rawList) + + // Underlying MemoryStream has schema (String, Timestamp); strip out the timestamp + // if requested. if (includeTimestamp) { - data.toDF("value", "timestamp") + rawBatch.toDF("value", "timestamp") + } else { + // Strip out timestamp + rawBatch.select("_1").toDF("value") + } + } + + override def commit(end: Offset): Unit = synchronized { + if (end.isInstanceOf[LongOffset]) { + val newOffset = end.asInstanceOf[LongOffset] + val offsetDiff = (newOffset.offset - lastOffsetCommitted.offset).toInt + + if (offsetDiff < 0) { + sys.error(s"Offsets committed out of order: $lastOffsetCommitted followed by $end") + } + + batches.trimStart(offsetDiff) + lastOffsetCommitted = newOffset } else { - data.map(_._1).toDF("value") + sys.error(s"TextSocketStream.commit() received an offset ($end) that did not " + + s"originate with an instance of this class") } } @@ -141,7 +187,7 @@ class TextSocketSourceProvider extends StreamSourceProvider with DataSourceRegis providerName: String, parameters: Map[String, String]): (String, StructType) = { logWarning("The socket source should not be used for production applications! " + - "It does not support recovery and stores state indefinitely.") + "It does not support recovery.") if (!parameters.contains("host")) { throw new AnalysisException("Set a host to read from with option(\"host\", ...).") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 92020be9789fe..dad410486ed24 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -252,8 +252,8 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging { val inputData = MemoryStream[Int] val mapped = inputData.toDS().map(6 / _) - // Run 3 batches, and then assert that only 1 metadata file is left at the end - // since the first 2 should have been purged. + // Run 3 batches, and then assert that only 2 metadata files is are at the end + // since the first should have been purged. 
testStream(mapped)( AddData(inputData, 1, 2), CheckAnswer(6, 3), @@ -262,11 +262,11 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging { AddData(inputData, 4, 6), CheckAnswer(6, 3, 6, 3, 1, 1), - AssertOnQuery("metadata log should contain only one file") { q => + AssertOnQuery("metadata log should contain only two files") { q => val metadataLogDir = new java.io.File(q.offsetLog.metadataPath.toString) val logFileNames = metadataLogDir.listFiles().toSeq.map(_.getName()) val toTest = logFileNames.filter(! _.endsWith(".crc")) // Workaround for SPARK-17475 - assert(toTest.size == 1 && toTest.head == "2") + assert(toTest.size == 2 && toTest.head == "1") true } ) From f1aeed8b022e043de2eb38b30187dcc36ee8dcdb Mon Sep 17 00:00:00 2001 From: ALeksander Eskilson Date: Wed, 26 Oct 2016 18:03:31 -0700 Subject: [PATCH 135/162] [SPARK-17770][CATALYST] making ObjectType public ## What changes were proposed in this pull request? In order to facilitate the writing of additional Encoders, I proposed opening up the ObjectType SQL DataType. This DataType is used extensively in the JavaBean Encoder, but would also be useful in writing other custom encoders. As mentioned by marmbrus, it is understood that the Expressions API is subject to potential change. ## How was this patch tested? The change only affects the visibility of the ObjectType class, and the existing SQL test suite still runs without error. Author: ALeksander Eskilson Closes #15453 from bdrillard/master. --- .../org/apache/spark/sql/types/ObjectType.scala | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala index c741a2dd3ea30..b18fba29af0f9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala @@ -19,7 +19,10 @@ package org.apache.spark.sql.types import scala.language.existentials -private[sql] object ObjectType extends AbstractDataType { +import org.apache.spark.annotation.InterfaceStability + +@InterfaceStability.Evolving +object ObjectType extends AbstractDataType { override private[sql] def defaultConcreteType: DataType = throw new UnsupportedOperationException("null literals can't be casted to ObjectType") @@ -32,11 +35,10 @@ private[sql] object ObjectType extends AbstractDataType { } /** - * Represents a JVM object that is passing through Spark SQL expression evaluation. Note this - * is only used internally while converting into the internal format and is not intended for use - * outside of the execution engine. + * Represents a JVM object that is passing through Spark SQL expression evaluation. */ -private[sql] case class ObjectType(cls: Class[_]) extends DataType { +@InterfaceStability.Evolving +case class ObjectType(cls: Class[_]) extends DataType { override def defaultSize: Int = 4096 def asNullable: DataType = this From dd4f088c1df6abd728e5544a17ba85322bedfe4c Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Thu, 27 Oct 2016 13:12:14 +0800 Subject: [PATCH 136/162] [SPARK-18009][SQL] Fix ClassCastException while calling toLocalIterator() on dataframe produced by RunnableCommand ## What changes were proposed in this pull request? A short code snippet that uses toLocalIterator() on a dataframe produced by a RunnableCommand reproduces the problem. 
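For reference, a minimal sketch of such a snippet (a hypothetical spark-shell session; it mirrors the `show databases` repro below):

```scala
import scala.collection.JavaConverters._

// Any DataFrame produced by a RunnableCommand will do, e.g. SHOW DATABASES.
val df = spark.sql("SHOW DATABASES")

// Before this patch, iterating failed with a ClassCastException because the default
// toLocalIterator() path expects UnsafeRow while commands emit GenericInternalRow.
df.toLocalIterator().asScala.foreach(println)
```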
toLocalIterator() is called by thriftserver when `spark.sql.thriftServer.incrementalCollect`is set to handle queries producing large result set. **Before** ```SQL scala> spark.sql("show databases") res0: org.apache.spark.sql.DataFrame = [databaseName: string] scala> res0.toLocalIterator() 16/10/26 03:00:24 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0) java.lang.ClassCastException: org.apache.spark.sql.catalyst.expressions.GenericInternalRow cannot be cast to org.apache.spark.sql.catalyst.expressions.UnsafeRow ``` **After** ```SQL scala> spark.sql("drop database databases") res30: org.apache.spark.sql.DataFrame = [] scala> spark.sql("show databases") res31: org.apache.spark.sql.DataFrame = [databaseName: string] scala> res31.toLocalIterator().asScala foreach println [default] [parquet] ``` ## How was this patch tested? Added a test in DDLSuite Author: Dilip Biswal Closes #15642 from dilipbiswal/SPARK-18009. --- .../org/apache/spark/sql/execution/command/commands.scala | 2 ++ .../org/apache/spark/sql/execution/command/DDLSuite.scala | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala index 698c625d617fc..d82e54e57564c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala @@ -66,6 +66,8 @@ case class ExecutedCommandExec(cmd: RunnableCommand) extends SparkPlan { override def executeCollect(): Array[InternalRow] = sideEffectResult.toArray + override def executeToIterator: Iterator[InternalRow] = sideEffectResult.toIterator + override def executeTake(limit: Int): Array[InternalRow] = sideEffectResult.take(limit).toArray protected override def doExecute(): RDD[InternalRow] = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index de326f80f6598..b989d01ec787a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -1805,4 +1805,11 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } } } + + test("SPARK-18009 calling toLocalIterator on commands") { + import scala.collection.JavaConverters._ + val df = sql("show databases") + val rows: Seq[Row] = df.toLocalIterator().asScala.toSeq + assert(rows.length > 0) + } } From d3b4831d009905185ad74096ce3ecfa934bc191d Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Wed, 26 Oct 2016 22:22:23 -0700 Subject: [PATCH 137/162] [SPARK-18132] Fix checkstyle This PR fixes checkstyle. Author: Yin Huai Closes #15656 from yhuai/fix-format. 
--- .../util/collection/unsafe/sort/UnsafeExternalSorter.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index 7835017910232..dcae4a34c4b0b 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -143,9 +143,10 @@ private UnsafeExternalSorter( this.recordComparator = recordComparator; this.prefixComparator = prefixComparator; // Use getSizeAsKb (not bytes) to maintain backwards compatibility for units - // this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024; + // this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024 this.fileBufferSizeBytes = 32 * 1024; - // The spill metrics are stored in a new ShuffleWriteMetrics, and then discarded (this fixes SPARK-16827). + // The spill metrics are stored in a new ShuffleWriteMetrics, + // and then discarded (this fixes SPARK-16827). // TODO: Instead, separate spill metrics should be stored and reported (tracked in SPARK-3577). this.writeMetrics = new ShuffleWriteMetrics(); From 1dbe9896b7f30538a5fad2f5d718d035c7906936 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Wed, 26 Oct 2016 23:02:54 -0700 Subject: [PATCH 138/162] [SPARK-17157][SPARKR][FOLLOW-UP] doc fixes ## What changes were proposed in this pull request? a couple of small late finding fixes for doc ## How was this patch tested? manually wangmiao1981 Author: Felix Cheung Closes #15650 from felixcheung/logitfix. --- R/pkg/R/mllib.R | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index e441db94998bf..629f284b79f33 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -111,8 +111,9 @@ setClass("LogisticRegressionModel", representation(jobj = "jobj")) #' @export #' @seealso \link{spark.glm}, \link{glm}, #' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, -#' @seealso \link{spark.lda}, \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg} -#' @seealso \link{spark.logit}, \link{read.ml} +#' @seealso \link{spark.lda}, \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, +#' @seealso \link{spark.survreg} +#' @seealso \link{read.ml} NULL #' Makes predictions from a MLlib model @@ -124,7 +125,7 @@ NULL #' @export #' @seealso \link{spark.glm}, \link{glm}, #' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, -#' @seealso \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg}, \link{spark.logit} +#' @seealso \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg} NULL write_internal <- function(object, path, overwrite = FALSE) { @@ -671,14 +672,13 @@ setMethod("predict", signature(object = "KMeansModel"), #' @param tol convergence tolerance of iterations. #' @param fitIntercept whether to fit an intercept term. Default is TRUE. #' @param family the name of family which is a description of the label distribution to be used in the model. -#' Supported options: +#' Supported options: Default is "auto". 
#' \itemize{ #' \item{"auto": Automatically select the family based on the number of classes: #' If number of classes == 1 || number of classes == 2, set to "binomial". #' Else, set to "multinomial".} #' \item{"binomial": Binary logistic regression with pivoting.} -#' \item{"multinomial": Multinomial logistic (softmax) regression without pivoting. -#' Default is "auto".} +#' \item{"multinomial": Multinomial logistic (softmax) regression without pivoting.} #' } #' @param standardization whether to standardize the training features before fitting the model. The coefficients #' of models will be always returned on the original scale, so it will be transparent for @@ -687,14 +687,10 @@ setMethod("predict", signature(object = "KMeansModel"), #' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of class label 1 #' is > threshold, then predict 1, else 0. A high threshold encourages the model to predict 0 #' more often; a low threshold encourages the model to predict 1 more often. Note: Setting this with -#' threshold p is equivalent to setting thresholds c(1-p, p). When threshold is set, any user-set -#' value for thresholds will be cleared. If both threshold and thresholds are set, then they must be -#' equivalent. In multiclass (or binary) classification to adjust the probability of +#' threshold p is equivalent to setting thresholds c(1-p, p). In multiclass (or binary) classification to adjust the probability of #' predicting each class. Array must have length equal to the number of classes, with values > 0, #' excepting that at most one value may be 0. The class with largest value p/t is predicted, where p -#' is the original probability of that class and t is the class's threshold. Note: When thresholds -#' is set, any user-set value for threshold will be cleared. If both threshold and thresholds are -#' set, then they must be equivalent. Default is 0.5. +#' is the original probability of that class and t is the class's threshold. Default is 0.5. #' @param weightCol The weight column name. #' @param aggregationDepth depth for treeAggregate (>= 2). If the dimensions of features or the number of partitions #' are large, this param could be adjusted to a larger size. Default is 2. @@ -724,7 +720,7 @@ setMethod("predict", signature(object = "KMeansModel"), #' write.ml(blr_model, path) #' #' # can also read back the saved model and predict -#' Note that summary deos not work on loaded model +#' # Note that summary deos not work on loaded model #' savedModel <- read.ml(path) #' blr_predict2 <- collect(select(predict(savedModel, binary_df), "prediction")) #' @@ -738,8 +734,8 @@ setMethod("predict", signature(object = "KMeansModel"), #' data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4)) #' df <- createDataFrame(data) #' -#' Note that summary of multinomial logistic regression is not implemented yet -#' model <- spark.logit(df, label ~ ., family = "multinomial", thresholds=c(0, 1, 1)) +#' # Note that summary of multinomial logistic regression is not implemented yet +#' model <- spark.logit(df, label ~ ., family = "multinomial", thresholds = c(0, 1, 1)) #' predict1 <- collect(select(predict(model, df), "prediction")) #' } #' @note spark.logit since 2.1.0 From 44c8bfda793b7655e2bd1da5e9915a09ed9d42ce Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Wed, 26 Oct 2016 23:06:11 -0700 Subject: [PATCH 139/162] [SQL][DOC] updating doc for JSON source to link to jsonlines.org ## What changes were proposed in this pull request? 
API and programming guide doc changes for Scala, Python and R. ## How was this patch tested? manual test Author: Felix Cheung Closes #15629 from felixcheung/jsondoc. --- R/pkg/R/DataFrame.R | 3 ++- R/pkg/R/SQLContext.R | 3 ++- docs/sparkr.md | 2 +- docs/sql-programming-guide.md | 22 +++++++++++-------- python/pyspark/sql/readwriter.py | 5 +++-- python/pyspark/sql/streaming.py | 3 ++- .../apache/spark/sql/DataFrameReader.scala | 14 +++++++----- .../apache/spark/sql/DataFrameWriter.scala | 3 ++- .../sql/streaming/DataStreamReader.scala | 3 ++- 9 files changed, 35 insertions(+), 23 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index be34e4b32f6f9..1df8bbf9fe604 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -761,7 +761,8 @@ setMethod("toJSON", #' Save the contents of SparkDataFrame as a JSON file #' -#' Save the contents of a SparkDataFrame as a JSON file (one object per line). Files written out +#' Save the contents of a SparkDataFrame as a JSON file (\href{http://jsonlines.org/}{ +#' JSON Lines text format or newline-delimited JSON}). Files written out #' with this method can be read back in as a SparkDataFrame using read.json(). #' #' @param x A SparkDataFrame diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 0d6a229e63455..216ca51666ba8 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -324,7 +324,8 @@ setMethod("toDF", signature(x = "RDD"), #' Create a SparkDataFrame from a JSON file. #' -#' Loads a JSON file (one object per line), returning the result as a SparkDataFrame +#' Loads a JSON file (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON} +#' ), returning the result as a SparkDataFrame #' It goes through the entire dataset once to determine the schema. #' #' @param path Path of file to read. A vector of multiple paths is allowed. diff --git a/docs/sparkr.md b/docs/sparkr.md index c1829efd18f44..f30bd4026fed3 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -135,7 +135,7 @@ sparkR.session(sparkPackages = "com.databricks:spark-avro_2.11:3.0.0") {% endhighlight %} -We can see how to use data sources using an example JSON input file. Note that the file that is used here is _not_ a typical JSON file. Each line in the file must contain a separate, self-contained valid JSON object. As a consequence, a regular multi-line JSON file will most often fail. +We can see how to use data sources using an example JSON input file. Note that the file that is used here is _not_ a typical JSON file. Each line in the file must contain a separate, self-contained valid JSON object. For more information, please see [JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a consequence, a regular multi-line JSON file will most often fail.
    {% highlight r %} diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 064af41965b70..b9be7a7545ef8 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -316,7 +316,7 @@ Serializable and has getters and setters for all of its fields. Spark SQL can convert an RDD of Row objects to a DataFrame, inferring the datatypes. Rows are constructed by passing a list of key/value pairs as kwargs to the Row class. The keys of this list define the column names of the table, -and the types are inferred by sampling the whole datase, similar to the inference that is performed on JSON files. +and the types are inferred by sampling the whole dataset, similar to the inference that is performed on JSON files. {% include_example schema_inferring python/sql/basic.py %}
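The hunks that follow document the same JSON Lines requirement across the guide and the API docs; to make it concrete, here is a minimal, hypothetical sketch (assuming a SparkSession named `spark`) in which every element is one self-contained JSON object, i.e. one line of a JSON Lines file:

```scala
import spark.implicits._

// Each string is a complete JSON object on a single line (JSON Lines / newline-delimited JSON).
val jsonLines = Seq(
  """{"name":"Alice","age":29}""",
  """{"name":"Bob","age":31}"""
)

// read.json accepts a path to such a file, or (as here) an RDD[String] of JSON lines.
val df = spark.read.json(jsonLines.toDS().rdd)
df.printSchema()
// A single pretty-printed, multi-line JSON document would not parse this way.
```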
    @@ -832,8 +832,9 @@ This conversion can be done using `SparkSession.read.json()` on either an RDD of or a JSON file. Note that the file that is offered as _a json file_ is not a typical JSON file. Each -line must contain a separate, self-contained valid JSON object. As a consequence, -a regular multi-line JSON file will most often fail. +line must contain a separate, self-contained valid JSON object. For more information, please see +[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a +consequence, a regular multi-line JSON file will most often fail. {% include_example json_dataset scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %} @@ -844,8 +845,9 @@ This conversion can be done using `SparkSession.read().json()` on either an RDD or a JSON file. Note that the file that is offered as _a json file_ is not a typical JSON file. Each -line must contain a separate, self-contained valid JSON object. As a consequence, -a regular multi-line JSON file will most often fail. +line must contain a separate, self-contained valid JSON object. For more information, please see +[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a +consequence, a regular multi-line JSON file will most often fail. {% include_example json_dataset java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %} @@ -855,8 +857,9 @@ Spark SQL can automatically infer the schema of a JSON dataset and load it as a This conversion can be done using `SparkSession.read.json` on a JSON file. Note that the file that is offered as _a json file_ is not a typical JSON file. Each -line must contain a separate, self-contained valid JSON object. As a consequence, -a regular multi-line JSON file will most often fail. +line must contain a separate, self-contained valid JSON object. For more information, please see +[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a +consequence, a regular multi-line JSON file will most often fail. {% include_example json_dataset python/sql/datasource.py %} @@ -867,8 +870,9 @@ the `read.json()` function, which loads data from a directory of JSON files wher files is a JSON object. Note that the file that is offered as _a json file_ is not a typical JSON file. Each -line must contain a separate, self-contained valid JSON object. As a consequence, -a regular multi-line JSON file will most often fail. +line must contain a separate, self-contained valid JSON object. For more information, please see +[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a +consequence, a regular multi-line JSON file will most often fail. {% include_example json_dataset r/RSparkSQLExample.R %} diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 91c2b17049fa1..bc786ef95ed03 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -160,8 +160,9 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None, mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None): """ - Loads a JSON file (one object per line) or an RDD of Strings storing JSON objects - (one object per record) and returns the result as a :class`DataFrame`. 
+ Loads a JSON file (`JSON Lines text format or newline-delimited JSON + <[http://jsonlines.org/>`_) or an RDD of Strings storing JSON objects (one object per + record) and returns the result as a :class`DataFrame`. If the ``schema`` parameter is not specified, this function goes through the input once to determine the input schema. diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 35fc469291684..559647bbabf67 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -640,7 +640,8 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None): """ - Loads a JSON file stream (one object per line) and returns a :class`DataFrame`. + Loads a JSON file stream (`JSON Lines text format or newline-delimited JSON + <[http://jsonlines.org/>`_) and returns a :class`DataFrame`. If the ``schema`` parameter is not specified, this function goes through the input once to determine the input schema. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index b7b2203cdd85b..a77937efd7e15 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -239,7 +239,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads a JSON file (one object per line) and returns the result as a [[DataFrame]]. + * Loads a JSON file ([[http://jsonlines.org/ JSON Lines text format or newline-delimited JSON]]) + * and returns the result as a [[DataFrame]]. * See the documentation on the overloaded `json()` method with varargs for more details. * * @since 1.4.0 @@ -250,7 +251,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads a JSON file (one object per line) and returns the result as a [[DataFrame]]. + * Loads a JSON file ([[http://jsonlines.org/ JSON Lines text format or newline-delimited JSON]]) + * and returns the result as a [[DataFrame]]. * * This function goes through the input once to determine the input schema. If you know the * schema in advance, use the version that specifies the schema to avoid the extra scan. @@ -295,8 +297,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { def json(paths: String*): DataFrame = format("json").load(paths : _*) /** - * Loads a `JavaRDD[String]` storing JSON objects (one object per record) and - * returns the result as a [[DataFrame]]. + * Loads a `JavaRDD[String]` storing JSON objects ([[http://jsonlines.org/ JSON Lines text format + * or newline-delimited JSON]]) and returns the result as a [[DataFrame]]. * * Unless the schema is specified using [[schema]] function, this function goes through the * input once to determine the input schema. @@ -307,8 +309,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { def json(jsonRDD: JavaRDD[String]): DataFrame = json(jsonRDD.rdd) /** - * Loads an `RDD[String]` storing JSON objects (one object per record) and - * returns the result as a [[DataFrame]]. + * Loads an `RDD[String]` storing JSON objects ([[http://jsonlines.org/ JSON Lines text format or + * newline-delimited JSON]]) and returns the result as a [[DataFrame]]. 
* * Unless the schema is specified using [[schema]] function, this function goes through the * input once to determine the input schema. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 5be3277651d02..4b5f0246b9a1b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -434,7 +434,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] in JSON format at the specified path. + * Saves the content of the [[DataFrame]] in JSON format ([[http://jsonlines.org/ JSON Lines text + * format or newline-delimited JSON]]) at the specified path. * This is equivalent to: * {{{ * format("json").save(path) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 87b73062180e4..40b482e4c01a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -134,7 +134,8 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo } /** - * Loads a JSON file stream (one object per line) and returns the result as a [[DataFrame]]. + * Loads a JSON file stream ([[http://jsonlines.org/ JSON Lines text format or newline-delimited + * JSON]]) and returns the result as a [[DataFrame]]. * * This function goes through the input once to determine the input schema. If you know the * schema in advance, use the version that specifies the schema to avoid the extra scan. From 701a9d361b3045a25c42b3c0e44e7755d45ff78c Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Thu, 27 Oct 2016 10:00:37 +0200 Subject: [PATCH 140/162] [SPARK-CORE][TEST][MINOR] Fix the wrong comment in test ## What changes were proposed in this pull request? While learning core scheduler code, I found two lines of wrong comments. This PR simply corrects the comments. ## How was this patch tested? Author: wm624@hotmail.com Closes #15631 from wangmiao1981/Rbug. 
--- .../org/apache/spark/scheduler/TaskSetManagerSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index b49ba085ca5d2..1b1a764ceff95 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -261,14 +261,14 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg assert(manager.resourceOffer("exec1", "host1", PROCESS_LOCAL) == None) clock.advance(LOCALITY_WAIT_MS) - // Offer host1, exec1 again, at NODE_LOCAL level: the node local (task 2) should + // Offer host1, exec1 again, at NODE_LOCAL level: the node local (task 3) should // get chosen before the noPref task assert(manager.resourceOffer("exec1", "host1", NODE_LOCAL).get.index == 2) - // Offer host2, exec3 again, at NODE_LOCAL level: we should choose task 2 + // Offer host2, exec2, at NODE_LOCAL level: we should choose task 2 assert(manager.resourceOffer("exec2", "host2", NODE_LOCAL).get.index == 1) - // Offer host2, exec3 again, at NODE_LOCAL level: we should get noPref task + // Offer host2, exec2 again, at NODE_LOCAL level: we should get noPref task // after failing to find a node_Local task assert(manager.resourceOffer("exec2", "host2", NODE_LOCAL) == None) clock.advance(LOCALITY_WAIT_MS) From 104232580528c097a284d753adb5795f6de8b0a5 Mon Sep 17 00:00:00 2001 From: cody koeninger Date: Thu, 27 Oct 2016 10:30:59 -0700 Subject: [PATCH 141/162] [SPARK-17813][SQL][KAFKA] Maximum data per trigger ## What changes were proposed in this pull request? maxOffsetsPerTrigger option for rate limiting, proportionally based on volume of different topicpartitions. ## How was this patch tested? Added unit test Author: cody koeninger Closes #15527 from koeninger/SPARK-17813. --- .../structured-streaming-kafka-integration.md | 6 + .../spark/sql/kafka010/KafkaSource.scala | 107 ++++++++++++++---- .../spark/sql/kafka010/KafkaSourceSuite.scala | 71 +++++++++++- 3 files changed, 157 insertions(+), 27 deletions(-) diff --git a/docs/structured-streaming-kafka-integration.md b/docs/structured-streaming-kafka-integration.md index e851f210c92c4..a6c3b3a9024d8 100644 --- a/docs/structured-streaming-kafka-integration.md +++ b/docs/structured-streaming-kafka-integration.md @@ -221,6 +221,12 @@ The following configurations are optional:
    + + + + + +
<tr><th>Option</th><th>value</th><th>default</th><th>meaning</th></tr>
<tr>
-  <td>startingOffset</td>
-  <td>["earliest", "latest"]</td>
-  <td>"latest"</td>
-  <td>The start point when a query is started, either "earliest" which is from the earliest offset,
-  or "latest" which is just from the latest offset. Note: This only applies when a new Streaming q
-  uery is started, and that resuming will always pick up from where the query left off.</td>
+  <td>startingOffsets</td>
+  <td>earliest, latest, or json string
+  {"topicA":{"0":23,"1":-1},"topicB":{"0":-2}}
+  </td>
+  <td>latest</td>
+  <td>The start point when a query is started, either "earliest" which is from the earliest offsets,
+  "latest" which is just from the latest offsets, or a json string specifying a starting offset for
+  each TopicPartition. In the json, -2 as an offset can be used to refer to earliest, -1 to latest.
+  Note: This only applies when a new Streaming query is started, and that resuming will always pick
+  up from where the query left off. Newly discovered partitions during a query will start at
+  earliest.</td>
</tr>
    failOnDataLoss[true, false]true or false true Whether to fail the query when it's possible that data is lost (e.g., topics are deleted, or offsets are out of range). This may be a false alarm. You can disable it when it doesn't work @@ -215,10 +229,10 @@ Kafka's own configurations can be set via `DataStreamReader.option` with `kafka. Note that the following Kafka params cannot be set and the Kafka source will throw an exception: - **group.id**: Kafka source will create a unique group id for each query automatically. -- **auto.offset.reset**: Set the source option `startingOffset` to `earliest` or `latest` to specify +- **auto.offset.reset**: Set the source option `startingOffsets` to specify where to start instead. Structured Streaming manages which offsets are consumed internally, rather than rely on the kafka Consumer to do it. This will ensure that no data is missed when when new - topics/partitions are dynamically subscribed. Note that `startingOffset` only applies when a new + topics/partitions are dynamically subscribed. Note that `startingOffsets` only applies when a new Streaming query is started, and that resuming will always pick up from where the query left off. - **key.deserializer**: Keys are always deserialized as byte arrays with ByteArrayDeserializer. Use DataFrame operations to explicitly deserialize the keys. diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/JsonUtils.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/JsonUtils.scala new file mode 100644 index 0000000000000..40d568a12c25d --- /dev/null +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/JsonUtils.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.io.Writer + +import scala.collection.mutable.HashMap +import scala.util.control.NonFatal + +import org.apache.kafka.common.TopicPartition +import org.json4s.NoTypeHints +import org.json4s.jackson.Serialization + +/** + * Utilities for converting Kafka related objects to and from json. + */ +private object JsonUtils { + private implicit val formats = Serialization.formats(NoTypeHints) + + /** + * Read TopicPartitions from json string + */ + def partitions(str: String): Array[TopicPartition] = { + try { + Serialization.read[Map[String, Seq[Int]]](str).flatMap { case (topic, parts) => + parts.map { part => + new TopicPartition(topic, part) + } + }.toArray + } catch { + case NonFatal(x) => + throw new IllegalArgumentException( + s"""Expected e.g. 
{"topicA":[0,1],"topicB":[0,1]}, got $str""") + } + } + + /** + * Write TopicPartitions as json string + */ + def partitions(partitions: Iterable[TopicPartition]): String = { + val result = new HashMap[String, List[Int]] + partitions.foreach { tp => + val parts: List[Int] = result.getOrElse(tp.topic, Nil) + result += tp.topic -> (tp.partition::parts) + } + Serialization.write(result) + } + + /** + * Read per-TopicPartition offsets from json string + */ + def partitionOffsets(str: String): Map[TopicPartition, Long] = { + try { + Serialization.read[Map[String, Map[Int, Long]]](str).flatMap { case (topic, partOffsets) => + partOffsets.map { case (part, offset) => + new TopicPartition(topic, part) -> offset + } + }.toMap + } catch { + case NonFatal(x) => + throw new IllegalArgumentException( + s"""Expected e.g. {"topicA":{"0":23,"1":-1},"topicB":{"0":-2}}, got $str""") + } + } + + /** + * Write per-TopicPartition offsets as json string + */ + def partitionOffsets(partitionOffsets: Map[TopicPartition, Long]): String = { + val result = new HashMap[String, HashMap[Int, Long]]() + partitionOffsets.foreach { case (tp, off) => + val parts = result.getOrElse(tp.topic, new HashMap[Int, Long]) + parts += tp.partition -> off + result += tp.topic -> parts + } + Serialization.write(result) + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index 4b0bb0a0f725c..537b7b0baa1b1 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -22,7 +22,7 @@ import java.{util => ju} import scala.collection.JavaConverters._ import scala.util.control.NonFatal -import org.apache.kafka.clients.consumer.{Consumer, KafkaConsumer} +import org.apache.kafka.clients.consumer.{Consumer, KafkaConsumer, OffsetOutOfRangeException} import org.apache.kafka.clients.consumer.internals.NoOpConsumerRebalanceListener import org.apache.kafka.common.TopicPartition @@ -82,7 +82,7 @@ private[kafka010] case class KafkaSource( executorKafkaParams: ju.Map[String, Object], sourceOptions: Map[String, String], metadataPath: String, - startFromEarliestOffset: Boolean, + startingOffsets: StartingOffsets, failOnDataLoss: Boolean) extends Source with Logging { @@ -110,10 +110,10 @@ private[kafka010] case class KafkaSource( private lazy val initialPartitionOffsets = { val metadataLog = new HDFSMetadataLog[KafkaSourceOffset](sqlContext.sparkSession, metadataPath) metadataLog.get(0).getOrElse { - val offsets = if (startFromEarliestOffset) { - KafkaSourceOffset(fetchEarliestOffsets()) - } else { - KafkaSourceOffset(fetchLatestOffsets()) + val offsets = startingOffsets match { + case EarliestOffsets => KafkaSourceOffset(fetchEarliestOffsets()) + case LatestOffsets => KafkaSourceOffset(fetchLatestOffsets()) + case SpecificOffsets(p) => KafkaSourceOffset(fetchSpecificStartingOffsets(p)) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") @@ -231,6 +231,43 @@ private[kafka010] case class KafkaSource( override def toString(): String = s"KafkaSource[$consumerStrategy]" + /** + * Set consumer position to specified offsets, making sure all assignments are set. 
+ */ + private def fetchSpecificStartingOffsets( + partitionOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = { + val result = withRetriesWithoutInterrupt { + // Poll to get the latest assigned partitions + consumer.poll(0) + val partitions = consumer.assignment() + consumer.pause(partitions) + assert(partitions.asScala == partitionOffsets.keySet, + "If startingOffsets contains specific offsets, you must specify all TopicPartitions.\n" + + "Use -1 for latest, -2 for earliest, if you don't care.\n" + + s"Specified: ${partitionOffsets.keySet} Assigned: ${partitions.asScala}") + logDebug(s"Partitions assigned to consumer: $partitions. Seeking to $partitionOffsets") + + partitionOffsets.foreach { + case (tp, -1) => consumer.seekToEnd(ju.Arrays.asList(tp)) + case (tp, -2) => consumer.seekToBeginning(ju.Arrays.asList(tp)) + case (tp, off) => consumer.seek(tp, off) + } + partitionOffsets.map { + case (tp, _) => tp -> consumer.position(tp) + } + } + partitionOffsets.foreach { + case (tp, off) if off != -1 && off != -2 => + if (result(tp) != off) { + reportDataLoss( + s"startingOffsets for $tp was $off but consumer reset to ${result(tp)}") + } + case _ => + // no real way to check that beginning or end is reasonable + } + result + } + /** * Fetch the earliest offsets of partitions. */ @@ -273,7 +310,7 @@ private[kafka010] case class KafkaSource( consumer.poll(0) val partitions = consumer.assignment() consumer.pause(partitions) - logDebug(s"\tPartitioned assigned to consumer: $partitions") + logDebug(s"\tPartitions assigned to consumer: $partitions") // Get the earliest offset of each partition consumer.seekToBeginning(partitions) @@ -317,6 +354,8 @@ private[kafka010] case class KafkaSource( try { result = Some(body) } catch { + case x: OffsetOutOfRangeException => + reportDataLoss(x.getMessage) case NonFatal(e) => lastException = e logWarning(s"Error in attempt $attempt getting Kafka offsets: ", e) @@ -373,6 +412,17 @@ private[kafka010] object KafkaSource { def createConsumer(): Consumer[Array[Byte], Array[Byte]] } + case class AssignStrategy(partitions: Array[TopicPartition], kafkaParams: ju.Map[String, Object]) + extends ConsumerStrategy { + override def createConsumer(): Consumer[Array[Byte], Array[Byte]] = { + val consumer = new KafkaConsumer[Array[Byte], Array[Byte]](kafkaParams) + consumer.assign(ju.Arrays.asList(partitions: _*)) + consumer + } + + override def toString: String = s"Assign[${partitions.mkString(", ")}]" + } + case class SubscribeStrategy(topics: Seq[String], kafkaParams: ju.Map[String, Object]) extends ConsumerStrategy { override def createConsumer(): Consumer[Array[Byte], Array[Byte]] = { diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 23b1b60f3bcaa..585ced875caa7 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -77,14 +77,12 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider // id. Hence, we should generate a unique id for each query. 
val uniqueGroupId = s"spark-kafka-source-${UUID.randomUUID}-${metadataPath.hashCode}" - val startFromEarliestOffset = - caseInsensitiveParams.get(STARTING_OFFSET_OPTION_KEY).map(_.trim.toLowerCase) match { - case Some("latest") => false - case Some("earliest") => true - case Some(pos) => - // This should not happen since we have already checked the options. - throw new IllegalStateException(s"Invalid $STARTING_OFFSET_OPTION_KEY: $pos") - case None => false + val startingOffsets = + caseInsensitiveParams.get(STARTING_OFFSETS_OPTION_KEY).map(_.trim.toLowerCase) match { + case Some("latest") => LatestOffsets + case Some("earliest") => EarliestOffsets + case Some(json) => SpecificOffsets(JsonUtils.partitionOffsets(json)) + case None => LatestOffsets } val kafkaParamsForStrategy = @@ -95,9 +93,9 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider // So that consumers in Kafka source do not mess with any existing group id .set(ConsumerConfig.GROUP_ID_CONFIG, s"$uniqueGroupId-driver") - // Set to "latest" to avoid exceptions. However, KafkaSource will fetch the initial offsets - // by itself instead of counting on KafkaConsumer. - .set(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest") + // Set to "earliest" to avoid exceptions. However, KafkaSource will fetch the initial + // offsets by itself instead of counting on KafkaConsumer. + .set(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest") // So that consumers in the driver does not commit offsets unnecessarily .set(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false") @@ -130,6 +128,10 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider .build() val strategy = caseInsensitiveParams.find(x => STRATEGY_OPTION_KEYS.contains(x._1)).get match { + case ("assign", value) => + AssignStrategy( + JsonUtils.partitions(value), + kafkaParamsForStrategy) case ("subscribe", value) => SubscribeStrategy( value.split(",").map(_.trim()).filter(_.nonEmpty), @@ -153,7 +155,7 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider kafkaParamsForExecutors, parameters, metadataPath, - startFromEarliestOffset, + startingOffsets, failOnDataLoss) } @@ -175,6 +177,13 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider } val strategy = caseInsensitiveParams.find(x => STRATEGY_OPTION_KEYS.contains(x._1)).get match { + case ("assign", value) => + if (!value.trim.startsWith("{")) { + throw new IllegalArgumentException( + "No topicpartitions to assign as specified value for option " + + s"'assign' is '$value'") + } + case ("subscribe", value) => val topics = value.split(",").map(_.trim).filter(_.nonEmpty) if (topics.isEmpty) { @@ -195,14 +204,6 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider throw new IllegalArgumentException("Unknown option") } - caseInsensitiveParams.get(STARTING_OFFSET_OPTION_KEY) match { - case Some(pos) if !STARTING_OFFSET_OPTION_VALUES.contains(pos.trim.toLowerCase) => - throw new IllegalArgumentException( - s"Illegal value '$pos' for option '$STARTING_OFFSET_OPTION_KEY', " + - s"acceptable values are: ${STARTING_OFFSET_OPTION_VALUES.mkString(", ")}") - case _ => - } - // Validate user-specified Kafka options if (caseInsensitiveParams.contains(s"kafka.${ConsumerConfig.GROUP_ID_CONFIG}")) { @@ -215,11 +216,11 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider throw new IllegalArgumentException( s""" |Kafka option '${ConsumerConfig.AUTO_OFFSET_RESET_CONFIG}' is not supported. 
- |Instead set the source option '$STARTING_OFFSET_OPTION_KEY' to 'earliest' or 'latest' to - |specify where to start. Structured Streaming manages which offsets are consumed + |Instead set the source option '$STARTING_OFFSETS_OPTION_KEY' to 'earliest' or 'latest' + |to specify where to start. Structured Streaming manages which offsets are consumed |internally, rather than relying on the kafkaConsumer to do it. This will ensure that no |data is missed when when new topics/partitions are dynamically subscribed. Note that - |'$STARTING_OFFSET_OPTION_KEY' only applies when a new Streaming query is started, and + |'$STARTING_OFFSETS_OPTION_KEY' only applies when a new Streaming query is started, and |that resuming will always pick up from where the query left off. See the docs for more |details. """.stripMargin) @@ -282,8 +283,7 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider } private[kafka010] object KafkaSourceProvider { - private val STRATEGY_OPTION_KEYS = Set("subscribe", "subscribepattern") - private val STARTING_OFFSET_OPTION_KEY = "startingoffset" - private val STARTING_OFFSET_OPTION_VALUES = Set("earliest", "latest") + private val STRATEGY_OPTION_KEYS = Set("subscribe", "subscribepattern", "assign") + private val STARTING_OFFSETS_OPTION_KEY = "startingoffsets" private val FAIL_ON_DATA_LOSS_OPTION_KEY = "failondataloss" } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/StartingOffsets.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/StartingOffsets.scala new file mode 100644 index 0000000000000..83959e597171a --- /dev/null +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/StartingOffsets.scala @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import org.apache.kafka.common.TopicPartition + +/* + * Values that can be specified for config startingOffsets + */ +private[kafka010] sealed trait StartingOffsets + +private[kafka010] case object EarliestOffsets extends StartingOffsets + +private[kafka010] case object LatestOffsets extends StartingOffsets + +private[kafka010] case class SpecificOffsets( + partitionOffsets: Map[TopicPartition, Long]) extends StartingOffsets diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/JsonUtilsSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/JsonUtilsSuite.scala new file mode 100644 index 0000000000000..54b980049d1a2 --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/JsonUtilsSuite.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import org.apache.kafka.common.TopicPartition + +import org.apache.spark.SparkFunSuite + +class JsonUtilsSuite extends SparkFunSuite { + + test("parsing partitions") { + val parsed = JsonUtils.partitions("""{"topicA":[0,1],"topicB":[4,6]}""") + val expected = Array( + new TopicPartition("topicA", 0), + new TopicPartition("topicA", 1), + new TopicPartition("topicB", 4), + new TopicPartition("topicB", 6) + ) + assert(parsed.toSeq === expected.toSeq) + } + + test("parsing partitionOffsets") { + val parsed = JsonUtils.partitionOffsets( + """{"topicA":{"0":23,"1":-1},"topicB":{"0":-2}}""") + + assert(parsed(new TopicPartition("topicA", 0)) === 23) + assert(parsed(new TopicPartition("topicA", 1)) === -1) + assert(parsed(new TopicPartition("topicB", 0)) === -2) + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 8b5296ea135c7..b50688ecb7743 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -22,6 +22,7 @@ import java.util.concurrent.atomic.AtomicInteger import scala.util.Random import org.apache.kafka.clients.producer.RecordMetadata +import org.apache.kafka.common.TopicPartition import org.scalatest.time.SpanSugar._ import org.apache.spark.sql.execution.streaming._ @@ -52,7 +53,7 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { protected def makeSureGetOffsetCalled = AssertOnQuery { q => // Because KafkaSource's initialPartitionOffsets is set lazily, we need to make sure // its "getOffset" is called before pushing any data. Otherwise, because of the race contion, - // we don't know which data should be fetched when `startingOffset` is latest. + // we don't know which data should be fetched when `startingOffsets` is latest. 
q.processAllAvailable() true } @@ -155,26 +156,52 @@ class KafkaSourceSuite extends KafkaSourceTest { ) } + test("assign from latest offsets") { + val topic = newTopic() + testFromLatestOffsets(topic, false, "assign" -> assignString(topic, 0 to 4)) + } + + test("assign from earliest offsets") { + val topic = newTopic() + testFromEarliestOffsets(topic, false, "assign" -> assignString(topic, 0 to 4)) + } + + test("assign from specific offsets") { + val topic = newTopic() + testFromSpecificOffsets(topic, "assign" -> assignString(topic, 0 to 4)) + } + test("subscribing topic by name from latest offsets") { val topic = newTopic() - testFromLatestOffsets(topic, "subscribe" -> topic) + testFromLatestOffsets(topic, true, "subscribe" -> topic) } test("subscribing topic by name from earliest offsets") { val topic = newTopic() - testFromEarliestOffsets(topic, "subscribe" -> topic) + testFromEarliestOffsets(topic, true, "subscribe" -> topic) + } + + test("subscribing topic by name from specific offsets") { + val topic = newTopic() + testFromSpecificOffsets(topic, "subscribe" -> topic) } test("subscribing topic by pattern from latest offsets") { val topicPrefix = newTopic() val topic = topicPrefix + "-suffix" - testFromLatestOffsets(topic, "subscribePattern" -> s"$topicPrefix-.*") + testFromLatestOffsets(topic, true, "subscribePattern" -> s"$topicPrefix-.*") } test("subscribing topic by pattern from earliest offsets") { val topicPrefix = newTopic() val topic = topicPrefix + "-suffix" - testFromEarliestOffsets(topic, "subscribePattern" -> s"$topicPrefix-.*") + testFromEarliestOffsets(topic, true, "subscribePattern" -> s"$topicPrefix-.*") + } + + test("subscribing topic by pattern from specific offsets") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-suffix" + testFromSpecificOffsets(topic, "subscribePattern" -> s"$topicPrefix-.*") } test("subscribing topic by pattern with topic deletions") { @@ -233,6 +260,10 @@ class KafkaSourceSuite extends KafkaSourceTest { testBadOptions("subscribe" -> "t", "subscribePattern" -> "t.*")( "only one", "options can be specified") + testBadOptions("subscribe" -> "t", "assign" -> """{"a":[0]}""")( + "only one", "options can be specified") + + testBadOptions("assign" -> "")("no topicpartitions to assign") testBadOptions("subscribe" -> "")("no topics to subscribe") testBadOptions("subscribePattern" -> "")("pattern to subscribe is empty") } @@ -293,7 +324,61 @@ class KafkaSourceSuite extends KafkaSourceTest { private def newTopic(): String = s"topic-${topicId.getAndIncrement()}" - private def testFromLatestOffsets(topic: String, options: (String, String)*): Unit = { + private def assignString(topic: String, partitions: Iterable[Int]): String = { + JsonUtils.partitions(partitions.map(p => new TopicPartition(topic, p))) + } + + private def testFromSpecificOffsets(topic: String, options: (String, String)*): Unit = { + val partitionOffsets = Map( + new TopicPartition(topic, 0) -> -2L, + new TopicPartition(topic, 1) -> -1L, + new TopicPartition(topic, 2) -> 0L, + new TopicPartition(topic, 3) -> 1L, + new TopicPartition(topic, 4) -> 2L + ) + val startingOffsets = JsonUtils.partitionOffsets(partitionOffsets) + + testUtils.createTopic(topic, partitions = 5) + // part 0 starts at earliest, these should all be seen + testUtils.sendMessages(topic, Array(-20, -21, -22).map(_.toString), Some(0)) + // part 1 starts at latest, these should all be skipped + testUtils.sendMessages(topic, Array(-10, -11, -12).map(_.toString), Some(1)) + // part 2 starts at 0, these should all 
be seen + testUtils.sendMessages(topic, Array(0, 1, 2).map(_.toString), Some(2)) + // part 3 starts at 1, first should be skipped + testUtils.sendMessages(topic, Array(10, 11, 12).map(_.toString), Some(3)) + // part 4 starts at 2, first and second should be skipped + testUtils.sendMessages(topic, Array(20, 21, 22).map(_.toString), Some(4)) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("startingOffsets", startingOffsets) + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + options.foreach { case (k, v) => reader.option(k, v) } + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) + + testStream(mapped)( + makeSureGetOffsetCalled, + CheckAnswer(-20, -21, -22, 0, 1, 2, 11, 12, 22), + StopStream, + StartStream(), + CheckAnswer(-20, -21, -22, 0, 1, 2, 11, 12, 22), // Should get the data back on recovery + AddKafkaData(Set(topic), 30, 31, 32, 33, 34)(ensureDataInMultiplePartition = true), + CheckAnswer(-20, -21, -22, 0, 1, 2, 11, 12, 22, 30, 31, 32, 33, 34), + StopStream + ) + } + + private def testFromLatestOffsets( + topic: String, + addPartitions: Boolean, + options: (String, String)*): Unit = { testUtils.createTopic(topic, partitions = 5) testUtils.sendMessages(topic, Array("-1")) require(testUtils.getLatestOffsets(Set(topic)).size === 5) @@ -301,7 +386,7 @@ class KafkaSourceSuite extends KafkaSourceTest { val reader = spark .readStream .format("kafka") - .option("startingOffset", s"latest") + .option("startingOffsets", s"latest") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") options.foreach { case (k, v) => reader.option(k, v) } @@ -324,7 +409,9 @@ class KafkaSourceSuite extends KafkaSourceTest { AddKafkaData(Set(topic), 7, 8), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - testUtils.addPartitions(topic, 10) + if (addPartitions) { + testUtils.addPartitions(topic, 10) + } true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -332,7 +419,10 @@ class KafkaSourceSuite extends KafkaSourceTest { ) } - private def testFromEarliestOffsets(topic: String, options: (String, String)*): Unit = { + private def testFromEarliestOffsets( + topic: String, + addPartitions: Boolean, + options: (String, String)*): Unit = { testUtils.createTopic(topic, partitions = 5) testUtils.sendMessages(topic, (1 to 3).map { _.toString }.toArray) require(testUtils.getLatestOffsets(Set(topic)).size === 5) @@ -340,7 +430,7 @@ class KafkaSourceSuite extends KafkaSourceTest { val reader = spark.readStream reader .format(classOf[KafkaSourceProvider].getCanonicalName.stripSuffix("$")) - .option("startingOffset", s"earliest") + .option("startingOffsets", s"earliest") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") options.foreach { case (k, v) => reader.option(k, v) } @@ -360,7 +450,9 @@ class KafkaSourceSuite extends KafkaSourceTest { StartStream(), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - testUtils.addPartitions(topic, 10) + if (addPartitions) { + testUtils.addPartitions(topic, 10) + } true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), diff --git 
a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala index 3eb8a737ba4c8..9b24ccdd560e8 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala @@ -201,11 +201,23 @@ class KafkaTestUtils extends Logging { /** Send the array of messages to the Kafka broker */ def sendMessages(topic: String, messages: Array[String]): Seq[(String, RecordMetadata)] = { + sendMessages(topic, messages, None) + } + + /** Send the array of messages to the Kafka broker using specified partition */ + def sendMessages( + topic: String, + messages: Array[String], + partition: Option[Int]): Seq[(String, RecordMetadata)] = { producer = new KafkaProducer[String, String](producerConfiguration) val offsets = try { messages.map { m => + val record = partition match { + case Some(p) => new ProducerRecord[String, String](topic, p, null, m) + case None => new ProducerRecord[String, String](topic, m) + } val metadata = - producer.send(new ProducerRecord[String, String](topic, m)).get(10, TimeUnit.SECONDS) + producer.send(record).get(10, TimeUnit.SECONDS) logInfo(s"\tSent $m to partition ${metadata.partition}, offset ${metadata.offset}") (m, metadata) } From c9720b2195a465653690b3e221ce789142217b0d Mon Sep 17 00:00:00 2001 From: cody koeninger Date: Fri, 21 Oct 2016 16:27:19 -0700 Subject: [PATCH 086/162] [STREAMING][KAFKA][DOC] clarify kafka settings needed for larger batches ## What changes were proposed in this pull request? Minor doc change to mention kafka configuration for larger spark batches. ## How was this patch tested? Doc change only, confirmed via jekyll. The configuration issue was discussed / confirmed with users on the mailing list. Author: cody koeninger Closes #15570 from koeninger/kafka-doc-heartbeat. --- docs/streaming-kafka-0-10-integration.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/streaming-kafka-0-10-integration.md b/docs/streaming-kafka-0-10-integration.md index 456b8453383db..de95ea90137eb 100644 --- a/docs/streaming-kafka-0-10-integration.md +++ b/docs/streaming-kafka-0-10-integration.md @@ -48,6 +48,7 @@ Each item in the stream is a [ConsumerRecord](http://kafka.apache.org/0100/javad For possible kafkaParams, see [Kafka consumer config docs](http://kafka.apache.org/documentation.html#newconsumerconfigs). +If your Spark batch duration is larger than the default Kafka heartbeat session timeout (30 seconds), increase heartbeat.interval.ms and session.timeout.ms appropriately. For batches larger than 5 minutes, this will require changing group.max.session.timeout.ms on the broker. Note that the example sets enable.auto.commit to false, for discussion see [Storing Offsets](streaming-kafka-0-10-integration.html#storing-offsets) below. ### LocationStrategies From 3fbf5a58c236fc5d5fee39cb29e7f5c7e01c0ee7 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Fri, 21 Oct 2016 17:27:18 -0700 Subject: [PATCH 087/162] [SPARK-18042][SQL] OutputWriter should expose file path written ## What changes were proposed in this pull request? This patch adds a new "path" method on OutputWriter that returns the path of the file written by the OutputWriter. This is part of the necessary work to consolidate structured streaming and batch write paths. 
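Concretely, the new contract only asks each writer to expose the location it writes to. A simplified, hypothetical sketch of an implementation (the real interface change and the concrete writers are in the diff below; `MyTextWriter` and its fields are illustrative only):

```scala
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.datasources.OutputWriter

// Sketch only: a writer that reports the file it writes via the new `path` method.
class MyTextWriter(stagingDir: String, fileNamePrefix: String) extends OutputWriter {
  // Built from the staging directory and file name prefix handed to
  // OutputWriterFactory.newInstance, so callers can collect the files written.
  override val path: String = new Path(stagingDir, fileNamePrefix + ".txt").toString

  override def write(row: Row): Unit = {
    // append `row` to the file at `path` (omitted in this sketch)
  }

  override def close(): Unit = {
    // flush and close the underlying stream (omitted in this sketch)
  }
}
```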
The batch write path has a nice feature that each data source can define the extension of the files, and allow Spark to specify the staging directory and the prefix for the files. However, in the streaming path we need to collect the list of files written, and there is no interface right now to do that. ## How was this patch tested? N/A - there is no behavior change and this should be covered by existing tests. Author: Reynold Xin Closes #15580 from rxin/SPARK-18042. --- .../ml/source/libsvm/LibSVMRelation.scala | 8 ++++- .../execution/datasources/OutputWriter.scala | 17 ++++++----- .../datasources/csv/CSVRelation.scala | 8 ++++- .../datasources/json/JsonFileFormat.scala | 8 ++++- .../parquet/ParquetFileFormat.scala | 2 +- .../datasources/parquet/ParquetOptions.scala | 2 +- .../parquet/ParquetOutputWriter.scala | 24 ++++++++------- .../datasources/text/TextFileFormat.scala | 25 ++++++++++++++-- .../spark/sql/hive/orc/OrcFileFormat.scala | 29 +++++++++---------- .../sql/sources/CommitFailureTestSource.scala | 3 ++ .../sql/sources/SimpleTextRelation.scala | 3 ++ 11 files changed, 90 insertions(+), 39 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index fff86686b550c..5e9e6ff1a5690 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -35,6 +35,7 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.text.TextOutputWriter import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration @@ -46,12 +47,17 @@ private[libsvm] class LibSVMOutputWriter( context: TaskAttemptContext) extends OutputWriter { + override val path: String = { + val compressionExtension = TextOutputWriter.getCompressionExtension(context) + new Path(stagingDir, fileNamePrefix + ".libsvm" + compressionExtension).toString + } + private[this] val buffer = new Text() private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - new Path(stagingDir, fileNamePrefix + extension) + new Path(path) } }.getRecordWriter(context) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala index f4cefdab077e9..fbf6e96d3f850 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala @@ -42,11 +42,12 @@ abstract class OutputWriterFactory extends Serializable { * @param fileNamePrefix Prefix of the file name. The returned OutputWriter must make sure this * prefix is used in the actual file name. For example, if the prefix is * "part-1-2-3", then the file name must start with "part_1_2_3" but can - * end in arbitrary extension. + * end in arbitrary extension that is deterministic given the configuration + * (i.e. 
the suffix extension should not depend on any task id, attempt id, + * or partition id). * @param dataSchema Schema of the rows to be written. Partition columns are not included in the * schema if the relation being written is partitioned. * @param context The Hadoop MapReduce task context. - * @since 1.4.0 */ def newInstance( stagingDir: String, @@ -62,7 +63,6 @@ abstract class OutputWriterFactory extends Serializable { * and not modify it (do not add subdirectories, extensions, etc.). All other * file-format-specific information needed to create the writer must be passed * through the [[OutputWriterFactory]] implementation. - * @since 2.0.0 */ def newWriter(path: String): OutputWriter = { throw new UnsupportedOperationException("newInstance with just path not supported") @@ -77,19 +77,22 @@ abstract class OutputWriterFactory extends Serializable { * executor side. This instance is used to persist rows to this single output file. */ abstract class OutputWriter { + + /** + * The path of the file to be written out. This path should include the staging directory and + * the file name prefix passed into the associated createOutputWriter function. + */ + def path: String + /** * Persists a single row. Invoked on the executor side. When writing to dynamically partitioned * tables, dynamic partition columns are not included in rows to be written. - * - * @since 1.4.0 */ def write(row: Row): Unit /** * Closes the [[OutputWriter]]. Invoked on the executor side after all rows are persisted, before * the task output is committed. - * - * @since 1.4.0 */ def close(): Unit diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala index eefacbf05ba0d..a35cfdb2c234f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory, PartitionedFile} +import org.apache.spark.sql.execution.datasources.text.TextOutputWriter import org.apache.spark.sql.types._ object CSVRelation extends Logging { @@ -185,6 +186,11 @@ private[csv] class CsvOutputWriter( context: TaskAttemptContext, params: CSVOptions) extends OutputWriter with Logging { + override val path: String = { + val compressionExtension = TextOutputWriter.getCompressionExtension(context) + new Path(stagingDir, fileNamePrefix + ".csv" + compressionExtension).toString + } + // create the Generator without separator inserted between 2 records private[this] val text = new Text() @@ -199,7 +205,7 @@ private[csv] class CsvOutputWriter( private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - new Path(stagingDir, s"$fileNamePrefix.csv$extension") + new Path(path) } }.getRecordWriter(context) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index cdbb2f7292613..651fa78a4e924 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala @@ -35,6 +35,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.json.{JacksonParser, JSONOptions} import org.apache.spark.sql.catalyst.util.CompressionCodecs import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.text.TextOutputWriter import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration @@ -160,6 +161,11 @@ private[json] class JsonOutputWriter( context: TaskAttemptContext) extends OutputWriter with Logging { + override val path: String = { + val compressionExtension = TextOutputWriter.getCompressionExtension(context) + new Path(stagingDir, fileNamePrefix + ".json" + compressionExtension).toString + } + private[this] val writer = new CharArrayWriter() // create the Generator without separator inserted between 2 records private[this] val gen = new JacksonGenerator(dataSchema, writer, options) @@ -168,7 +174,7 @@ private[json] class JsonOutputWriter( private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - new Path(stagingDir, s"$fileNamePrefix.json$extension") + new Path(path) } }.getRecordWriter(context) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 87b944ba523ca..502dd0e8d4cf9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -121,7 +121,7 @@ class ParquetFileFormat sparkSession.sessionState.conf.writeLegacyParquetFormat.toString) // Sets compression scheme - conf.set(ParquetOutputFormat.COMPRESSION, parquetOptions.compressionCodec) + conf.set(ParquetOutputFormat.COMPRESSION, parquetOptions.compressionCodecClassName) // SPARK-15719: Disables writing Parquet summary files by default. if (conf.get(ParquetOutputFormat.ENABLE_JOB_SUMMARY) == null) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala index 615731889dfad..d0fd23605bea8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala @@ -35,7 +35,7 @@ private[parquet] class ParquetOptions( * Compression codec to use. By default use the value specified in SQLConf. * Acceptable values are defined in [[shortParquetCompressionCodecNames]]. 
*/ - val compressionCodec: String = { + val compressionCodecClassName: String = { val codecName = parameters.getOrElse("compression", sqlConf.parquetCompressionCodec).toLowerCase if (!shortParquetCompressionCodecNames.contains(codecName)) { val availableCodecs = shortParquetCompressionCodecNames.keys.map(_.toLowerCase) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala index 39c199784cd6d..1300069c42b05 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala @@ -22,6 +22,7 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.parquet.hadoop.{ParquetOutputFormat, ParquetRecordWriter} +import org.apache.parquet.hadoop.codec.CodecConfig import org.apache.parquet.hadoop.util.ContextUtil import org.apache.spark.sql.Row @@ -80,7 +81,7 @@ private[parquet] class ParquetOutputWriterFactory( sqlConf.writeLegacyParquetFormat.toString) // Sets compression scheme - conf.set(ParquetOutputFormat.COMPRESSION, parquetOptions.compressionCodec) + conf.set(ParquetOutputFormat.COMPRESSION, parquetOptions.compressionCodecClassName) new SerializableConfiguration(conf) } @@ -88,7 +89,7 @@ private[parquet] class ParquetOutputWriterFactory( * Returns a [[OutputWriter]] that writes data to the give path without using * [[OutputCommitter]]. */ - override def newWriter(path: String): OutputWriter = new OutputWriter { + override def newWriter(path1: String): OutputWriter = new OutputWriter { // Create TaskAttemptContext that is used to pass on Configuration to the ParquetRecordWriter private val hadoopTaskAttemptId = new TaskAttemptID(new TaskID(new JobID, TaskType.MAP, 0), 0) @@ -98,6 +99,8 @@ private[parquet] class ParquetOutputWriterFactory( // Instance of ParquetRecordWriter that does not use OutputCommitter private val recordWriter = createNoCommitterRecordWriter(path, hadoopAttemptContext) + override def path: String = path1 + override def write(row: Row): Unit = { throw new UnsupportedOperationException("call writeInternal") } @@ -140,16 +143,17 @@ private[parquet] class ParquetOutputWriter( context: TaskAttemptContext) extends OutputWriter { + override val path: String = { + val filename = fileNamePrefix + CodecConfig.from(context).getCodec.getExtension + ".parquet" + new Path(stagingDir, filename).toString + } + private val recordWriter: RecordWriter[Void, InternalRow] = { - val outputFormat = { - new ParquetOutputFormat[InternalRow]() { - override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - new Path(stagingDir, fileNamePrefix + extension) - } + new ParquetOutputFormat[InternalRow]() { + override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { + new Path(path) } - } - - outputFormat.getRecordWriter(context) + }.getRecordWriter(context) } override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala index 6cd2351c5749a..d40b5725199a8 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala @@ -20,8 +20,10 @@ package org.apache.spark.sql.execution.datasources.text import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.{NullWritable, Text} +import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext} -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat +import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat} +import org.apache.hadoop.util.ReflectionUtils import org.apache.spark.TaskContext import org.apache.spark.sql.{AnalysisException, Row, SparkSession} @@ -128,12 +130,17 @@ class TextOutputWriter( context: TaskAttemptContext) extends OutputWriter { + override val path: String = { + val compressionExtension = TextOutputWriter.getCompressionExtension(context) + new Path(stagingDir, fileNamePrefix + ".txt" + compressionExtension).toString + } + private[this] val buffer = new Text() private val recordWriter: RecordWriter[NullWritable, Text] = { new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - new Path(stagingDir, s"$fileNamePrefix.txt$extension") + new Path(path) } }.getRecordWriter(context) } @@ -150,3 +157,17 @@ class TextOutputWriter( recordWriter.close(context) } } + + +object TextOutputWriter { + /** Returns the compression codec extension to be used in a file name, e.g. ".gzip"). */ + def getCompressionExtension(context: TaskAttemptContext): String = { + // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile + if (FileOutputFormat.getCompressOutput(context)) { + val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec]) + ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension + } else { + "" + } + } +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index 1ceacb458ae6e..eba7aa386ade2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -216,9 +216,18 @@ private[orc] class OrcOutputWriter( context: TaskAttemptContext) extends OutputWriter { - private[this] val conf = context.getConfiguration + override val path: String = { + val compressionExtension: String = { + val name = context.getConfiguration.get(OrcRelation.ORC_COMPRESSION) + OrcRelation.extensionsForCompressionCodecNames.getOrElse(name, "") + } + // It has the `.orc` extension at the end because (de)compression tools + // such as gunzip would not be able to decompress this as the compression + // is not applied on this whole file but on each "stream" in ORC format. + new Path(stagingDir, fileNamePrefix + compressionExtension + ".orc").toString + } - private[this] val serializer = new OrcSerializer(dataSchema, conf) + private[this] val serializer = new OrcSerializer(dataSchema, context.getConfiguration) // `OrcRecordWriter.close()` creates an empty file if no rows are written at all. We use this // flag to decide whether `OrcRecordWriter.close()` needs to be called. 
@@ -226,20 +235,10 @@ private[orc] class OrcOutputWriter( private lazy val recordWriter: RecordWriter[NullWritable, Writable] = { recordWriterInstantiated = true - - val compressionExtension = { - val name = conf.get(OrcRelation.ORC_COMPRESSION) - OrcRelation.extensionsForCompressionCodecNames.getOrElse(name, "") - } - // It has the `.orc` extension at the end because (de)compression tools - // such as gunzip would not be able to decompress this as the compression - // is not applied on this whole file but on each "stream" in ORC format. - val filename = s"$fileNamePrefix$compressionExtension.orc" - new OrcOutputFormat().getRecordWriter( - new Path(stagingDir, filename).getFileSystem(conf), - conf.asInstanceOf[JobConf], - new Path(stagingDir, filename).toString, + new Path(path).getFileSystem(context.getConfiguration), + context.getConfiguration.asInstanceOf[JobConf], + path, Reporter.NULL ).asInstanceOf[RecordWriter[NullWritable, Writable]] } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestSource.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestSource.scala index d5044684020e2..731540db17eeb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestSource.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestSource.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.sources +import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.TaskContext @@ -50,6 +51,8 @@ class CommitFailureTestSource extends SimpleTextSource { SimpleTextRelation.callbackCalled = true } + override val path: String = new Path(stagingDir, fileNamePrefix).toString + override def write(row: Row): Unit = { if (SimpleTextRelation.failWriter) { sys.error("Intentional task writer failure for testing purpose.") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala index 9e13b217ec305..9896b9bde99c8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala @@ -123,6 +123,9 @@ class SimpleTextSource extends TextBasedFileFormat with DataSourceRegister { class SimpleTextOutputWriter( stagingDir: String, fileNamePrefix: String, context: TaskAttemptContext) extends OutputWriter { + + override val path: String = new Path(stagingDir, fileNamePrefix).toString + private val recordWriter: RecordWriter[NullWritable, Text] = new AppendingTextOutputFormat(new Path(stagingDir), fileNamePrefix).getRecordWriter(context) From 7178c56433cd138dae53db9194c55e3f4fa0fa69 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Fri, 21 Oct 2016 22:20:52 -0700 Subject: [PATCH 088/162] [SPARK-16606][MINOR] Tiny follow-up to , to correct more instances of the same log message typo ## What changes were proposed in this pull request? Tiny follow-up to SPARK-16606 / https://github.com/apache/spark/pull/14533 , to correct more instances of the same log message typo ## How was this patch tested? Existing tests (no functional change anyway) Author: Sean Owen Closes #15586 from srowen/SPARK-16606.2. 
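As an aside on the OutputWriter contract introduced by SPARK-18042 above: the sketch below shows, under stated assumptions, how a file format's writer could satisfy the new `path` requirement (the path must contain the staging directory and the file name prefix, plus an extension that is deterministic given the configuration). The class name `ExampleTextOutputWriter` and its write logic are illustrative only and are not part of these patches.

```scala
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.TaskAttemptContext

import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.datasources.OutputWriter

// Hypothetical writer illustrating the new contract: expose the full output
// path, built from the staging directory and the given file name prefix.
class ExampleTextOutputWriter(
    stagingDir: String,
    fileNamePrefix: String,
    context: TaskAttemptContext) extends OutputWriter {

  // Deterministic extension; no task, attempt, or partition id is appended here.
  override val path: String = new Path(stagingDir, fileNamePrefix + ".txt").toString

  private val out =
    new Path(path).getFileSystem(context.getConfiguration).create(new Path(path))

  override def write(row: Row): Unit = out.writeBytes(row.mkString(",") + "\n")

  override def close(): Unit = out.close()
}
```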
--- .../src/main/scala/org/apache/spark/sql/SparkSession.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index baae55013787d..3045eb69f427f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -814,7 +814,7 @@ object SparkSession { if ((session ne null) && !session.sparkContext.isStopped) { options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) } if (options.nonEmpty) { - logWarning("Use an existing SparkSession, some configuration may not take effect.") + logWarning("Using an existing SparkSession; some configuration may not take effect.") } return session } @@ -826,7 +826,7 @@ object SparkSession { if ((session ne null) && !session.sparkContext.isStopped) { options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) } if (options.nonEmpty) { - logWarning("Use an existing SparkSession, some configuration may not take effect.") + logWarning("Using an existing SparkSession; some configuration may not take effect.") } return session } From 625fdddacd58ad54fdbb17409987812176abc812 Mon Sep 17 00:00:00 2001 From: Erik O'Shaughnessy Date: Sat, 22 Oct 2016 09:37:53 +0100 Subject: [PATCH 089/162] [SPARK-17944][DEPLOY] sbin/start-* scripts use of `hostname -f` fail with Solaris ## What changes were proposed in this pull request? Modify sbin/start-master.sh, sbin/start-mesos-dispatcher.sh and sbin/start-slaves.sh to use the output of 'uname' to select which OS-specific command-line is used to determine the host's fully qualified host name. ## How was this patch tested? Tested by hand; starting on Solaris, Linux and macOS. Author: Erik O'Shaughnessy Closes #15557 from JnyJny/SPARK-17944. 
--- sbin/start-master.sh | 9 ++++++++- sbin/start-mesos-dispatcher.sh | 9 ++++++++- sbin/start-slaves.sh | 9 ++++++++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/sbin/start-master.sh b/sbin/start-master.sh index d970fcc45e2c1..97ee32159b6de 100755 --- a/sbin/start-master.sh +++ b/sbin/start-master.sh @@ -48,7 +48,14 @@ if [ "$SPARK_MASTER_PORT" = "" ]; then fi if [ "$SPARK_MASTER_HOST" = "" ]; then - SPARK_MASTER_HOST=`hostname -f` + case `uname` in + (SunOS) + SPARK_MASTER_HOST="`/usr/sbin/check-hostname | awk '{print $NF}'`" + ;; + (*) + SPARK_MASTER_HOST="`hostname -f`" + ;; + esac fi if [ "$SPARK_MASTER_WEBUI_PORT" = "" ]; then diff --git a/sbin/start-mesos-dispatcher.sh b/sbin/start-mesos-dispatcher.sh index ef65fb9539146..ecaad7ad09634 100755 --- a/sbin/start-mesos-dispatcher.sh +++ b/sbin/start-mesos-dispatcher.sh @@ -34,7 +34,14 @@ if [ "$SPARK_MESOS_DISPATCHER_PORT" = "" ]; then fi if [ "$SPARK_MESOS_DISPATCHER_HOST" = "" ]; then - SPARK_MESOS_DISPATCHER_HOST=`hostname -f` + case `uname` in + (SunOS) + SPARK_MESOS_DISPATCHER_HOST="`/usr/sbin/check-hostname | awk '{print $NF}'`" + ;; + (*) + SPARK_MESOS_DISPATCHER_HOST="`hostname -f`" + ;; + esac fi if [ "$SPARK_MESOS_DISPATCHER_NUM" = "" ]; then diff --git a/sbin/start-slaves.sh b/sbin/start-slaves.sh index 7d8871251f81b..f5269df523dac 100755 --- a/sbin/start-slaves.sh +++ b/sbin/start-slaves.sh @@ -32,7 +32,14 @@ if [ "$SPARK_MASTER_PORT" = "" ]; then fi if [ "$SPARK_MASTER_HOST" = "" ]; then - SPARK_MASTER_HOST="`hostname -f`" + case `uname` in + (SunOS) + SPARK_MASTER_HOST="`/usr/sbin/check-hostname | awk '{print $NF}'`" + ;; + (*) + SPARK_MASTER_HOST="`hostname -f`" + ;; + esac fi # Launch the slaves From 01b26a06436b4c8020f22be3e1da4995b44c9b03 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 22 Oct 2016 09:39:07 +0100 Subject: [PATCH 090/162] [SPARK-17898][DOCS] repositories needs username and password ## What changes were proposed in this pull request? Document `user:password` syntax as possible means of specifying credentials for password-protected `--repositories` ## How was this patch tested? Doc build Author: Sean Owen Closes #15584 from srowen/SPARK-17898. --- docs/programming-guide.md | 8 ++++---- docs/submitting-applications.md | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/programming-guide.md b/docs/programming-guide.md index 20b4bee0f58e1..7516579ec6dbf 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -182,7 +182,7 @@ variable called `sc`. Making your own SparkContext will not work. You can set wh context connects to using the `--master` argument, and you can add JARs to the classpath by passing a comma-separated list to the `--jars` argument. You can also add dependencies (e.g. Spark Packages) to your shell session by supplying a comma-separated list of maven coordinates -to the `--packages` argument. Any additional repositories where dependencies might exist (e.g. SonaType) +to the `--packages` argument. Any additional repositories where dependencies might exist (e.g. Sonatype) can be passed to the `--repositories` argument. For example, to run `bin/spark-shell` on exactly four cores, use: @@ -214,9 +214,9 @@ variable called `sc`. Making your own SparkContext will not work. You can set wh context connects to using the `--master` argument, and you can add Python .zip, .egg or .py files to the runtime path by passing a comma-separated list to `--py-files`. You can also add dependencies (e.g. 
Spark Packages) to your shell session by supplying a comma-separated list of maven coordinates -to the `--packages` argument. Any additional repositories where dependencies might exist (e.g. SonaType) -can be passed to the `--repositories` argument. Any python dependencies a Spark Package has (listed in -the requirements.txt of that package) must be manually installed using pip when necessary. +to the `--packages` argument. Any additional repositories where dependencies might exist (e.g. Sonatype) +can be passed to the `--repositories` argument. Any Python dependencies a Spark package has (listed in +the requirements.txt of that package) must be manually installed using `pip` when necessary. For example, to run `bin/pyspark` on exactly four cores, use: {% highlight bash %} diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md index 6fe3049995876..b738194eac9aa 100644 --- a/docs/submitting-applications.md +++ b/docs/submitting-applications.md @@ -190,6 +190,8 @@ is handled automatically, and with Spark standalone, automatic cleanup can be co Users may also include any other dependencies by supplying a comma-delimited list of maven coordinates with `--packages`. All transitive dependencies will be handled when using this command. Additional repositories (or resolvers in SBT) can be added in a comma-delimited fashion with the flag `--repositories`. +(Note that credentials for password-protected repositories can be supplied in some cases in the repository URI, +such as in `https://user:password@host/...`. Be careful when supplying credentials this way.) These commands can be used with `pyspark`, `spark-shell`, and `spark-submit` to include Spark Packages. For Python, the equivalent `--py-files` option can be used to distribute `.egg`, `.zip` and `.py` libraries From ab3363e9f6b1f7fc26682509fe7382c570f91778 Mon Sep 17 00:00:00 2001 From: Drew Robb Date: Sat, 22 Oct 2016 01:59:36 -0700 Subject: [PATCH 091/162] [SPARK-17986][ML] SQLTransformer should remove temporary tables ## What changes were proposed in this pull request? A call to the method `SQLTransformer.transform` previously would create a temporary table and never delete it. This change adds a call to `dropTempView()` that deletes this temporary table before returning the result so that the table will not remain in spark's table catalog. Because `tableName` is randomized and not exposed, there should be no expected use of this table outside of the `transform` method. ## How was this patch tested? A single new assertion was added to the existing test of the `SQLTransformer.transform` method that all temporary tables are removed. Without the corresponding code change, this new assertion fails. I am not aware of any circumstances in which removing this temporary view would be bad for performance or correctness in other ways, but some expertise here would be helpful. Author: Drew Robb Closes #15526 from drewrobb/SPARK-17986. 
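To make the new behavior concrete, here is a small usage sketch (the data and column names are made up; only the final assertion mirrors the test added by this patch). After `transform()` returns, no temporary view is left registered in the session catalog:

```scala
import org.apache.spark.ml.feature.SQLTransformer

val df = spark.createDataFrame(Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")

val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

sqlTrans.transform(df).show()

// With this change, the randomly named temporary view created internally
// by transform() has already been dropped.
assert(spark.catalog.listTables().count() == 0)
```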
--- .../scala/org/apache/spark/ml/feature/SQLTransformer.scala | 4 +++- .../org/apache/spark/ml/feature/SQLTransformerSuite.scala | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala index 259be2679ce19..b25fff973c441 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala @@ -67,7 +67,9 @@ class SQLTransformer @Since("1.6.0") (@Since("1.6.0") override val uid: String) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) - dataset.sparkSession.sql(realStatement) + val result = dataset.sparkSession.sql(realStatement) + dataset.sparkSession.catalog.dropTempView(tableName) + result } @Since("1.6.0") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala index 23464073e6edb..753f890c48301 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala @@ -43,6 +43,7 @@ class SQLTransformerSuite assert(result.schema.toString == resultSchema.toString) assert(resultSchema == expected.schema) assert(result.collect().toSeq == expected.collect().toSeq) + assert(original.sparkSession.catalog.listTables().count() == 0) } test("read/write") { From 3eca283aca68ac81c127d60ad5699f854d5f14b7 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 22 Oct 2016 22:08:28 +0800 Subject: [PATCH 092/162] [SPARK-17994][SQL] Add back a file status cache for catalog tables ## What changes were proposed in this pull request? In SPARK-16980, we removed the full in-memory cache of table partitions in favor of loading only needed partitions from the metastore. This greatly improves the initial latency of queries that only read a small fraction of table partitions. However, since the metastore does not store file statistics, we need to discover those from remote storage. With the loss of the in-memory file status cache this has to happen on each query, increasing the latency of repeated queries over the same partitions. The proposal is to add back a per-table cache of partition contents, i.e. Map[Path, Array[FileStatus]]. This cache would be retained per-table, and can be invalidated through refreshTable() and refreshByPath(). Unlike the prior cache, it can be incrementally updated as new partitions are read. ## How was this patch tested? Existing tests and new tests in `HiveTablePerfStatsSuite`. cc mallman Author: Eric Liang Author: Michael Allman Author: Eric Liang Closes #15539 from ericl/meta-cache. 
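The sketch below shows how the two settings involved here would be used from a Spark session; the table name, path, and size value are illustrative assumptions, not recommendations from this patch.

```scala
// Partition pruning for converted (filesource) Hive tables; the patch keeps this on by default.
spark.conf.set("spark.sql.hive.filesourcePartitionPruning", "true")

// Share up to ~512 MB across tables for cached file status metadata
// (the patch's default is 250 MB; setting 0 disables the cache).
spark.conf.set("spark.sql.hive.filesourcePartitionFileCacheSize", (512L * 1024 * 1024).toString)

// "events" and "partCol1" are hypothetical. The first scan lists files from
// remote storage; the second can be served from the per-table file status cache.
spark.sql("SELECT count(*) FROM events WHERE partCol1 < 10").show()
spark.sql("SELECT count(*) FROM events WHERE partCol1 < 10").show()

// Refreshing the table (or a path) invalidates its cached file statuses.
spark.catalog.refreshTable("events")
spark.catalog.refreshByPath("/warehouse/events")
```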
--- .../spark/metrics/source/StaticSources.scala | 7 + .../datasources/FileStatusCache.scala | 149 ++++++++++++++++++ .../datasources/ListingFileCatalog.scala | 13 +- .../PartitioningAwareFileCatalog.scala | 115 ++++++++------ .../datasources/TableFileCatalog.scala | 36 ++--- .../apache/spark/sql/internal/SQLConf.scala | 16 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- .../spark/sql/hive/HiveDDLCommandSuite.scala | 16 +- ...te.scala => HiveTablePerfStatsSuite.scala} | 127 +++++++++++++-- 9 files changed, 385 insertions(+), 96 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala rename sql/hive/src/test/scala/org/apache/spark/sql/hive/{HiveDataFrameSuite.scala => HiveTablePerfStatsSuite.scala} (50%) diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala index cf92a10deabd5..b54885b7ff8b0 100644 --- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala +++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala @@ -80,15 +80,22 @@ object HiveCatalogMetrics extends Source { */ val METRIC_FILES_DISCOVERED = metricRegistry.counter(MetricRegistry.name("filesDiscovered")) + /** + * Tracks the total number of files served from the file status cache instead of discovered. + */ + val METRIC_FILE_CACHE_HITS = metricRegistry.counter(MetricRegistry.name("fileCacheHits")) + /** * Resets the values of all metrics to zero. This is useful in tests. */ def reset(): Unit = { METRIC_PARTITIONS_FETCHED.dec(METRIC_PARTITIONS_FETCHED.getCount()) METRIC_FILES_DISCOVERED.dec(METRIC_FILES_DISCOVERED.getCount()) + METRIC_FILE_CACHE_HITS.dec(METRIC_FILE_CACHE_HITS.getCount()) } // clients can use these to avoid classloader issues with the codahale classes def incrementFetchedPartitions(n: Int): Unit = METRIC_PARTITIONS_FETCHED.inc(n) def incrementFilesDiscovered(n: Int): Unit = METRIC_FILES_DISCOVERED.inc(n) + def incrementFileCacheHits(n: Int): Unit = METRIC_FILE_CACHE_HITS.inc(n) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala new file mode 100644 index 0000000000000..e0ec748a0b34d --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources + +import java.util.concurrent.ConcurrentHashMap +import java.util.concurrent.atomic.AtomicBoolean + +import scala.collection.JavaConverters._ + +import com.google.common.cache._ +import org.apache.hadoop.fs.{FileStatus, Path} + +import org.apache.spark.internal.Logging +import org.apache.spark.metrics.source.HiveCatalogMetrics +import org.apache.spark.sql.SparkSession +import org.apache.spark.util.{SerializableConfiguration, SizeEstimator} + +/** + * A cache of the leaf files of partition directories. We cache these files in order to speed + * up iterated queries over the same set of partitions. Otherwise, each query would have to + * hit remote storage in order to gather file statistics for physical planning. + * + * Each resolved catalog table has its own FileStatusCache. When the backing relation for the + * table is refreshed via refreshTable() or refreshByPath(), this cache will be invalidated. + */ +abstract class FileStatusCache { + /** + * @return the leaf files for the specified path from this cache, or None if not cached. + */ + def getLeafFiles(path: Path): Option[Array[FileStatus]] = None + + /** + * Saves the given set of leaf files for a path in this cache. + */ + def putLeafFiles(path: Path, leafFiles: Array[FileStatus]): Unit + + /** + * Invalidates all data held by this cache. + */ + def invalidateAll(): Unit +} + +object FileStatusCache { + private var sharedCache: SharedInMemoryCache = null + + /** + * @return a new FileStatusCache based on session configuration. Cache memory quota is + * shared across all clients. + */ + def newCache(session: SparkSession): FileStatusCache = { + synchronized { + if (session.sqlContext.conf.filesourcePartitionPruning && + session.sqlContext.conf.filesourcePartitionFileCacheSize > 0) { + if (sharedCache == null) { + sharedCache = new SharedInMemoryCache( + session.sqlContext.conf.filesourcePartitionFileCacheSize) + } + sharedCache.getForNewClient() + } else { + NoopCache + } + } + } + + def resetForTesting(): Unit = synchronized { + sharedCache = null + } +} + +/** + * An implementation that caches partition file statuses in memory. + * + * @param maxSizeInBytes max allowable cache size before entries start getting evicted + */ +private class SharedInMemoryCache(maxSizeInBytes: Long) extends Logging { + import FileStatusCache._ + + // Opaque object that uniquely identifies a shared cache user + private type ClientId = Object + + private val warnedAboutEviction = new AtomicBoolean(false) + + // we use a composite cache key in order to distinguish entries inserted by different clients + private val cache: Cache[(ClientId, Path), Array[FileStatus]] = CacheBuilder.newBuilder() + .weigher(new Weigher[(ClientId, Path), Array[FileStatus]] { + override def weigh(key: (ClientId, Path), value: Array[FileStatus]): Int = { + (SizeEstimator.estimate(key) + SizeEstimator.estimate(value)).toInt + }}) + .removalListener(new RemovalListener[(ClientId, Path), Array[FileStatus]]() { + override def onRemoval(removed: RemovalNotification[(ClientId, Path), Array[FileStatus]]) = { + if (removed.getCause() == RemovalCause.SIZE && + warnedAboutEviction.compareAndSet(false, true)) { + logWarning( + "Evicting cached table partition metadata from memory due to size constraints " + + "(spark.sql.hive.filesourcePartitionFileCacheSize = " + maxSizeInBytes + " bytes). 
" + + "This may impact query planning performance.") + } + }}) + .maximumWeight(maxSizeInBytes) + .build() + + /** + * @return a FileStatusCache that does not share any entries with any other client, but does + * share memory resources for the purpose of cache eviction. + */ + def getForNewClient(): FileStatusCache = new FileStatusCache { + val clientId = new Object() + + override def getLeafFiles(path: Path): Option[Array[FileStatus]] = { + Option(cache.getIfPresent((clientId, path))) + } + + override def putLeafFiles(path: Path, leafFiles: Array[FileStatus]): Unit = { + cache.put((clientId, path), leafFiles.toArray) + } + + override def invalidateAll(): Unit = { + cache.asMap.asScala.foreach { case (key, value) => + if (key._1 == clientId) { + cache.invalidate(key) + } + } + } + } +} + +/** + * A non-caching implementation used when partition file status caching is disabled. + */ +object NoopCache extends FileStatusCache { + override def getLeafFiles(path: Path): Option[Array[FileStatus]] = None + override def putLeafFiles(path: Path, leafFiles: Array[FileStatus]): Unit = {} + override def invalidateAll(): Unit = {} +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala index 6d10501b7265d..d9d588388aaf1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala @@ -38,14 +38,16 @@ class ListingFileCatalog( sparkSession: SparkSession, override val rootPaths: Seq[Path], parameters: Map[String, String], - partitionSchema: Option[StructType]) - extends PartitioningAwareFileCatalog(sparkSession, parameters, partitionSchema) { + partitionSchema: Option[StructType], + fileStatusCache: FileStatusCache = NoopCache) + extends PartitioningAwareFileCatalog( + sparkSession, parameters, partitionSchema, fileStatusCache) { @volatile private var cachedLeafFiles: mutable.LinkedHashMap[Path, FileStatus] = _ @volatile private var cachedLeafDirToChildrenFiles: Map[Path, Array[FileStatus]] = _ @volatile private var cachedPartitionSpec: PartitionSpec = _ - refresh() + refresh0() override def partitionSpec(): PartitionSpec = { if (cachedPartitionSpec == null) { @@ -64,6 +66,11 @@ class ListingFileCatalog( } override def refresh(): Unit = { + refresh0() + fileStatusCache.invalidateAll() + } + + private def refresh0(): Unit = { val files = listLeafFiles(rootPaths) cachedLeafFiles = new mutable.LinkedHashMap[Path, FileStatus]() ++= files.map(f => f.getPath -> f) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala index 5c8eff7ec46b4..9b1903c47119e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala @@ -33,7 +33,6 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.util.SerializableConfiguration - /** * An abstract class that represents [[FileCatalog]]s that are aware of partitioned tables. * It provides the necessary methods to parse partition data based on a set of files. 
@@ -45,7 +44,8 @@ import org.apache.spark.util.SerializableConfiguration abstract class PartitioningAwareFileCatalog( sparkSession: SparkSession, parameters: Map[String, String], - partitionSchema: Option[StructType]) extends FileCatalog with Logging { + partitionSchema: Option[StructType], + fileStatusCache: FileStatusCache = NoopCache) extends FileCatalog with Logging { import PartitioningAwareFileCatalog.BASE_PATH_PARAM /** Returns the specification of the partitions inferred from the data. */ @@ -238,15 +238,29 @@ abstract class PartitioningAwareFileCatalog( * This is publicly visible for testing. */ def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { - val files = - if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { - PartitioningAwareFileCatalog.listLeafFilesInParallel(paths, hadoopConf, sparkSession) - } else { - PartitioningAwareFileCatalog.listLeafFilesInSerial(paths, hadoopConf) + val output = mutable.LinkedHashSet[FileStatus]() + val pathsToFetch = mutable.ArrayBuffer[Path]() + for (path <- paths) { + fileStatusCache.getLeafFiles(path) match { + case Some(files) => + HiveCatalogMetrics.incrementFileCacheHits(files.length) + output ++= files + case None => + pathsToFetch += path } - - HiveCatalogMetrics.incrementFilesDiscovered(files.size) - mutable.LinkedHashSet(files: _*) + } + val discovered = if (pathsToFetch.length >= + sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { + PartitioningAwareFileCatalog.listLeafFilesInParallel(pathsToFetch, hadoopConf, sparkSession) + } else { + PartitioningAwareFileCatalog.listLeafFilesInSerial(pathsToFetch, hadoopConf) + } + discovered.foreach { case (path, leafFiles) => + HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size) + fileStatusCache.putLeafFiles(path, leafFiles.toArray) + output ++= leafFiles + } + output } } @@ -276,14 +290,14 @@ object PartitioningAwareFileCatalog extends Logging { */ private def listLeafFilesInSerial( paths: Seq[Path], - hadoopConf: Configuration): Seq[FileStatus] = { + hadoopConf: Configuration): Seq[(Path, Seq[FileStatus])] = { // Dummy jobconf to get to the pathFilter defined in configuration val jobConf = new JobConf(hadoopConf, this.getClass) val filter = FileInputFormat.getInputPathFilter(jobConf) - paths.flatMap { path => + paths.map { path => val fs = path.getFileSystem(hadoopConf) - listLeafFiles0(fs, path, filter) + (path, listLeafFiles0(fs, path, filter)) } } @@ -294,7 +308,7 @@ object PartitioningAwareFileCatalog extends Logging { private def listLeafFilesInParallel( paths: Seq[Path], hadoopConf: Configuration, - sparkSession: SparkSession): Seq[FileStatus] = { + sparkSession: SparkSession): Seq[(Path, Seq[FileStatus])] = { assert(paths.size >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") @@ -306,47 +320,54 @@ object PartitioningAwareFileCatalog extends Logging { // in case of large #defaultParallelism. 
val numParallelism = Math.min(paths.size, 10000) - val statuses = sparkContext + val statusMap = sparkContext .parallelize(serializedPaths, numParallelism) .mapPartitions { paths => val hadoopConf = serializableConfiguration.value listLeafFilesInSerial(paths.map(new Path(_)).toSeq, hadoopConf).iterator - }.map { status => - // Turn FileStatus into SerializableFileStatus so we can send it back to the driver - val blockLocations = status match { - case f: LocatedFileStatus => - f.getBlockLocations.map { loc => - SerializableBlockLocation( - loc.getNames, - loc.getHosts, - loc.getOffset, - loc.getLength) - } - - case _ => - Array.empty[SerializableBlockLocation] - } + }.map { case (path, statuses) => + val serializableStatuses = statuses.map { status => + // Turn FileStatus into SerializableFileStatus so we can send it back to the driver + val blockLocations = status match { + case f: LocatedFileStatus => + f.getBlockLocations.map { loc => + SerializableBlockLocation( + loc.getNames, + loc.getHosts, + loc.getOffset, + loc.getLength) + } + + case _ => + Array.empty[SerializableBlockLocation] + } - SerializableFileStatus( - status.getPath.toString, - status.getLen, - status.isDirectory, - status.getReplication, - status.getBlockSize, - status.getModificationTime, - status.getAccessTime, - blockLocations) + SerializableFileStatus( + status.getPath.toString, + status.getLen, + status.isDirectory, + status.getReplication, + status.getBlockSize, + status.getModificationTime, + status.getAccessTime, + blockLocations) + } + (path.toString, serializableStatuses) }.collect() - // Turn SerializableFileStatus back to Status - statuses.map { f => - val blockLocations = f.blockLocations.map { loc => - new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) + // turn SerializableFileStatus back to Status + statusMap.map { case (path, serializableStatuses) => + val statuses = serializableStatuses.map { f => + val blockLocations = f.blockLocations.map { loc => + new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) + } + new LocatedFileStatus( + new FileStatus( + f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, + new Path(f.path)), + blockLocations) } - new LocatedFileStatus( - new FileStatus( - f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path)), - blockLocations) + (new Path(path), statuses) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala index fc08c3798ee06..31a01bc6db082 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.types.StructType * @param table the table's (unqualified) name * @param partitionSchema the schema of a partitioned table's partition columns * @param sizeInBytes the table's data size in bytes + * @param fileStatusCache optional cache implementation to use for file listing */ class TableFileCatalog( sparkSession: SparkSession, @@ -42,24 +43,21 @@ class TableFileCatalog( protected val hadoopConf = sparkSession.sessionState.newHadoopConf + private val fileStatusCache = FileStatusCache.newCache(sparkSession) + private val externalCatalog = sparkSession.sharedState.externalCatalog private val catalogTable = externalCatalog.getTable(db, table) private val baseLocation = 
catalogTable.storage.locationUri - // Populated on-demand by calls to cachedAllPartitions - private var cachedAllPartitions: ListingFileCatalog = null - override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq override def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] = { filterPartitions(filters).listFiles(Nil) } - override def refresh(): Unit = synchronized { - cachedAllPartitions = null - } + override def refresh(): Unit = fileStatusCache.invalidateAll() /** * Returns a [[ListingFileCatalog]] for this table restricted to the subset of partitions @@ -68,14 +66,6 @@ class TableFileCatalog( * @param filters partition-pruning filters */ def filterPartitions(filters: Seq[Expression]): ListingFileCatalog = { - if (filters.isEmpty) { - allPartitions - } else { - filterPartitions0(filters) - } - } - - private def filterPartitions0(filters: Seq[Expression]): ListingFileCatalog = { val parameters = baseLocation .map(loc => Map(PartitioningAwareFileCatalog.BASE_PATH_PARAM -> loc)) .getOrElse(Map.empty) @@ -87,21 +77,13 @@ class TableFileCatalog( } val partitionSpec = PartitionSpec(schema, partitions) new PrunedTableFileCatalog( - sparkSession, new Path(baseLocation.get), partitionSpec) + sparkSession, new Path(baseLocation.get), fileStatusCache, partitionSpec) case None => - new ListingFileCatalog(sparkSession, rootPaths, parameters, None) - } - } - - // Not used in the hot path of queries when metastore partition pruning is enabled - def allPartitions: ListingFileCatalog = synchronized { - if (cachedAllPartitions == null) { - cachedAllPartitions = filterPartitions0(Nil) + new ListingFileCatalog(sparkSession, rootPaths, parameters, None, fileStatusCache) } - cachedAllPartitions } - override def inputFiles: Array[String] = allPartitions.inputFiles + override def inputFiles: Array[String] = filterPartitions(Nil).inputFiles } /** @@ -114,9 +96,11 @@ class TableFileCatalog( private class PrunedTableFileCatalog( sparkSession: SparkSession, tableBasePath: Path, + fileStatusCache: FileStatusCache, override val partitionSpec: PartitionSpec) extends ListingFileCatalog( sparkSession, partitionSpec.partitions.map(_.path), Map.empty, - Some(partitionSpec.partitionColumns)) + Some(partitionSpec.partitionColumns), + fileStatusCache) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index ebf4fad5cbcff..a6e2fa26cb5ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -265,17 +265,27 @@ object SQLConf { val HIVE_METASTORE_PARTITION_PRUNING = SQLConfigBuilder("spark.sql.hive.metastorePartitionPruning") .doc("When true, some predicates will be pushed down into the Hive metastore so that " + - "unmatching partitions can be eliminated earlier.") + "unmatching partitions can be eliminated earlier. This only affects Hive tables " + + "not converted to filesource relations (see HiveUtils.CONVERT_METASTORE_PARQUET and " + + "HiveUtils.CONVERT_METASTORE_ORC for more information).") .booleanConf .createWithDefault(true) val HIVE_FILESOURCE_PARTITION_PRUNING = SQLConfigBuilder("spark.sql.hive.filesourcePartitionPruning") - .doc("When true, enable metastore partition pruning for file source tables as well. " + + .doc("When true, enable metastore partition pruning for filesource relations as well. 
" + "This is currently implemented for converted Hive tables only.") .booleanConf .createWithDefault(true) + val HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE = + SQLConfigBuilder("spark.sql.hive.filesourcePartitionFileCacheSize") + .doc("When nonzero, enable caching of partition file metadata in memory. All table share " + + "a cache that can use up to specified num bytes for file metadata. This conf only " + + "applies if filesource partition pruning is also enabled.") + .longConf + .createWithDefault(250 * 1024 * 1024) + val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly") .doc("When true, enable the metadata-only query optimization that use the table's metadata " + "to produce the partition columns instead of table scans. It applies when all the columns " + @@ -670,6 +680,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { def filesourcePartitionPruning: Boolean = getConf(HIVE_FILESOURCE_PARTITION_PRUNING) + def filesourcePartitionFileCacheSize: Long = getConf(HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE) + def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT) def optimizerMetadataOnly: Boolean = getConf(OPTIMIZER_METADATA_ONLY) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index c909eb5d20bcd..44089335e1a1d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -235,7 +235,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log if (lazyPruningEnabled) { catalog } else { - catalog.allPartitions + catalog.filterPartitions(Nil) // materialize all the partitions in memory } } val partitionSchemaColumnNames = partitionSchema.map(_.name.toLowerCase).toSet diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala index 81337493c7f28..d13e29b3029b1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDDLCommandSuite.scala @@ -577,5 +577,19 @@ class HiveDDLCommandSuite extends PlanTest with SQLTestUtils with TestHiveSingle assert(output == Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat")) assert(serde == Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")) } - } + } + + test("table name with schema") { + // regression test for SPARK-11778 + spark.sql("create schema usrdb") + spark.sql("create table usrdb.test(c int)") + spark.read.table("usrdb.test") + spark.sql("drop table usrdb.test") + spark.sql("drop schema usrdb") + } + + test("SPARK-15887: hive-site.xml should be loaded") { + val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client + assert(hiveClient.getConf("hive.in.test", "") == "true") + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala similarity index 50% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala rename to sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala index 15523437a3404..82ee813c6a95f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala +++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala @@ -19,25 +19,26 @@ package org.apache.spark.sql.hive import java.io.File +import org.scalatest.BeforeAndAfterEach + import org.apache.spark.metrics.source.HiveCatalogMetrics +import org.apache.spark.sql.execution.datasources.FileStatusCache import org.apache.spark.sql.QueryTest import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils -class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { - test("table name with schema") { - // regression test for SPARK-11778 - spark.sql("create schema usrdb") - spark.sql("create table usrdb.test(c int)") - spark.read.table("usrdb.test") - spark.sql("drop table usrdb.test") - spark.sql("drop schema usrdb") +class HiveTablePerfStatsSuite + extends QueryTest with TestHiveSingleton with SQLTestUtils with BeforeAndAfterEach { + + override def beforeEach(): Unit = { + super.beforeEach() + FileStatusCache.resetForTesting() } - test("SPARK-15887: hive-site.xml should be loaded") { - val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client - assert(hiveClient.getConf("hive.in.test", "") == "true") + override def afterEach(): Unit = { + super.afterEach() + FileStatusCache.resetForTesting() } private def setupPartitionedTable(tableName: String, dir: File): Unit = { @@ -79,7 +80,9 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt } test("lazy partition pruning reads only necessary partition data") { - withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_PRUNING.key -> "true") { + withSQLConf( + SQLConf.HIVE_FILESOURCE_PARTITION_PRUNING.key -> "true", + SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "0") { withTable("test") { withTempDir { dir => setupPartitionedTable("test", dir) @@ -104,11 +107,103 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5) assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) - // read all should be cached + // read all should not be cached HiveCatalogMetrics.reset() spark.sql("select * from test").count() + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) + + // cache should be disabled + assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0) + } + } + } + } + + test("lazy partition pruning with file status caching enabled") { + withSQLConf( + "spark.sql.hive.filesourcePartitionPruning" -> "true", + "spark.sql.hive.filesourcePartitionFileCacheSize" -> "9999999") { + withTable("test") { + withTempDir { dir => + setupPartitionedTable("test", dir) + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test where partCol1 = 999").count() == 0) assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0) assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0) + assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0) + + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test where partCol1 < 2").count() == 2) + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 2) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2) + assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0) + + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test where partCol1 < 3").count() == 3) 
+ assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 3) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 1) + assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 2) + + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test").count() == 5) + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2) + assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 3) + + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test").count() == 5) + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0) + assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 5) + } + } + } + } + + test("file status caching respects refresh table and refreshByPath") { + withSQLConf( + "spark.sql.hive.filesourcePartitionPruning" -> "true", + "spark.sql.hive.filesourcePartitionFileCacheSize" -> "9999999") { + withTable("test") { + withTempDir { dir => + setupPartitionedTable("test", dir) + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test").count() == 5) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) + assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0) + + HiveCatalogMetrics.reset() + spark.sql("refresh table test") + assert(spark.sql("select * from test").count() == 5) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) + assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0) + + spark.catalog.cacheTable("test") + HiveCatalogMetrics.reset() + spark.catalog.refreshByPath(dir.getAbsolutePath) + assert(spark.sql("select * from test").count() == 5) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) + assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0) + } + } + } + } + + test("file status cache respects size limit") { + withSQLConf( + "spark.sql.hive.filesourcePartitionPruning" -> "true", + "spark.sql.hive.filesourcePartitionFileCacheSize" -> "1" /* 1 byte */) { + withTable("test") { + withTempDir { dir => + setupPartitionedTable("test", dir) + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test").count() == 5) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) + assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0) + assert(spark.sql("select * from test").count() == 5) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 10) + assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0) } } } @@ -124,18 +219,18 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton with SQLTestUt // mode. This is kind of terrible, but is needed to preserve the legacy behavior // of doing plan cache validation based on the entire partition set. 
HiveCatalogMetrics.reset() - spark.sql("select * from test where partCol1 = 999").count() + assert(spark.sql("select * from test where partCol1 = 999").count() == 0) // 5 from table resolution, another 5 from ListingFileCatalog assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 10) assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) HiveCatalogMetrics.reset() - spark.sql("select * from test where partCol1 < 2").count() + assert(spark.sql("select * from test where partCol1 < 2").count() == 2) assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5) assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0) HiveCatalogMetrics.reset() - spark.sql("select * from test").count() + assert(spark.sql("select * from test").count() == 5) assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 5) assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0) } From 5fa9f8795a71e08bcbef5975ba8c072db5be8866 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 22 Oct 2016 20:09:04 +0200 Subject: [PATCH 093/162] [SPARK-17123][SQL] Use type-widened encoder for DataFrame rather than existing encoder to allow type-widening from set operations ## What changes were proposed in this pull request? This PR fixes set operations in `DataFrame` so that they run without exceptions when the column types are non-Scala-native types (e.g. `TimestampType`, `DateType` and `DecimalType`). The problem is that set operations such as `union`, `intersect` and `except` use the encoder belonging to the calling `Dataset`, so the caller's `ExpressionEncoder[Row]` is kept as-is when the set operation is performed. However, the result types can actually be widened, so we should use an `ExpressionEncoder[Row]` constructed from the executed plan rather than the existing one. Otherwise, incorrect code is generated via `StaticInvoke`.
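In spirit the fix simply calls `Dataset.ofRows` (see the `withSetOperator` change below). A minimal sketch of that idea follows — illustrative only, not the patch itself; it assumes Spark 2.x internals and code living in `org.apache.spark.sql`, where `Dataset.logicalPlan`, `sessionState` and the `Dataset` constructor are accessible:

```scala
// Illustrative sketch only: build the row encoder from the analyzed plan of the
// set operation, whose schema already reflects the widened column types, instead
// of reusing the calling Dataset's ExpressionEncoder[Row].
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.plans.logical.Union

def unionWithWidenedEncoder(
    spark: SparkSession, left: Dataset[Row], right: Dataset[Row]): Dataset[Row] = {
  val qe = spark.sessionState.executePlan(Union(left.logicalPlan, right.logicalPlan))
  qe.assertAnalyzed()
  // RowEncoder(qe.analyzed.schema) carries the widened types, unlike the callers' encoders.
  new Dataset[Row](spark, qe, RowEncoder(qe.analyzed.schema))
}
```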
Running the codes below: ```scala val dates = Seq( (new Date(0), BigDecimal.valueOf(1), new Timestamp(2)), (new Date(3), BigDecimal.valueOf(4), new Timestamp(5)) ).toDF("date", "timestamp", "decimal") val widenTypedRows = Seq( (new Timestamp(2), 10.5D, "string") ).toDF("date", "timestamp", "decimal") val results = dates.union(widenTypedRows).collect() results.foreach(println) ``` prints below: **Before** ```java 23:08:54.490 ERROR org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 28, Column 107: No applicable constructor/method found for actual parameters "long"; candidates are: "public static java.sql.Date org.apache.spark.sql.catalyst.util.DateTimeUtils.toJavaDate(int)" /* 001 */ public java.lang.Object generate(Object[] references) { /* 002 */ return new SpecificSafeProjection(references); /* 003 */ } /* 004 */ /* 005 */ class SpecificSafeProjection extends org.apache.spark.sql.catalyst.expressions.codegen.BaseProjection { /* 006 */ /* 007 */ private Object[] references; /* 008 */ private MutableRow mutableRow; /* 009 */ private Object[] values; /* 010 */ private org.apache.spark.sql.types.StructType schema; /* 011 */ /* 012 */ /* 013 */ public SpecificSafeProjection(Object[] references) { /* 014 */ this.references = references; /* 015 */ mutableRow = (MutableRow) references[references.length - 1]; /* 016 */ /* 017 */ this.schema = (org.apache.spark.sql.types.StructType) references[0]; /* 018 */ } /* 019 */ /* 020 */ public java.lang.Object apply(java.lang.Object _i) { /* 021 */ InternalRow i = (InternalRow) _i; /* 022 */ /* 023 */ values = new Object[3]; /* 024 */ /* 025 */ boolean isNull2 = i.isNullAt(0); /* 026 */ long value2 = isNull2 ? -1L : (i.getLong(0)); /* 027 */ boolean isNull1 = isNull2; /* 028 */ final java.sql.Date value1 = isNull1 ? null : org.apache.spark.sql.catalyst.util.DateTimeUtils.toJavaDate(value2); /* 029 */ isNull1 = value1 == null; /* 030 */ if (isNull1) { /* 031 */ values[0] = null; /* 032 */ } else { /* 033 */ values[0] = value1; /* 034 */ } /* 035 */ /* 036 */ boolean isNull4 = i.isNullAt(1); /* 037 */ double value4 = isNull4 ? -1.0 : (i.getDouble(1)); /* 038 */ /* 039 */ boolean isNull3 = isNull4; /* 040 */ java.math.BigDecimal value3 = null; /* 041 */ if (!isNull3) { /* 042 */ /* 043 */ Object funcResult = null; /* 044 */ funcResult = value4.toJavaBigDecimal(); /* 045 */ if (funcResult == null) { /* 046 */ isNull3 = true; /* 047 */ } else { /* 048 */ value3 = (java.math.BigDecimal) funcResult; /* 049 */ } /* 050 */ /* 051 */ } /* 052 */ isNull3 = value3 == null; /* 053 */ if (isNull3) { /* 054 */ values[1] = null; /* 055 */ } else { /* 056 */ values[1] = value3; /* 057 */ } /* 058 */ /* 059 */ boolean isNull6 = i.isNullAt(2); /* 060 */ UTF8String value6 = isNull6 ? null : (i.getUTF8String(2)); /* 061 */ boolean isNull5 = isNull6; /* 062 */ final java.sql.Timestamp value5 = isNull5 ? 
null : org.apache.spark.sql.catalyst.util.DateTimeUtils.toJavaTimestamp(value6); /* 063 */ isNull5 = value5 == null; /* 064 */ if (isNull5) { /* 065 */ values[2] = null; /* 066 */ } else { /* 067 */ values[2] = value5; /* 068 */ } /* 069 */ /* 070 */ final org.apache.spark.sql.Row value = new org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema(values, schema); /* 071 */ if (false) { /* 072 */ mutableRow.setNullAt(0); /* 073 */ } else { /* 074 */ /* 075 */ mutableRow.update(0, value); /* 076 */ } /* 077 */ /* 078 */ return mutableRow; /* 079 */ } /* 080 */ } ``` **After** ```bash [1969-12-31 00:00:00.0,1.0,1969-12-31 16:00:00.002] [1969-12-31 00:00:00.0,4.0,1969-12-31 16:00:00.005] [1969-12-31 16:00:00.002,10.5,string] ``` ## How was this patch tested? Unit tests in `DataFrameSuite` Author: hyukjinkwon Closes #15072 from HyukjinKwon/SPARK-17123. --- .../scala/org/apache/spark/sql/Dataset.scala | 18 ++++++++++++++---- .../org/apache/spark/sql/DataFrameSuite.scala | 16 ++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 073d2b1512b95..286d8549bfe27 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -556,7 +556,7 @@ class Dataset[T] private[sql]( * 1983 03 0.410516 0.442194 * 1984 04 0.450090 0.483521 * }}} - * + * * @param numRows Number of rows to show * @param truncate If set to more than 0, truncates strings to `truncate` characters and * all cells will be aligned right. @@ -1524,7 +1524,7 @@ class Dataset[T] private[sql]( * @group typedrel * @since 2.0.0 */ - def union(other: Dataset[T]): Dataset[T] = withTypedPlan { + def union(other: Dataset[T]): Dataset[T] = withSetOperator { // This breaks caching, but it's usually ok because it addresses a very specific use case: // using union to union many files or partitions. CombineUnions(Union(logicalPlan, other.logicalPlan)) @@ -1540,7 +1540,7 @@ class Dataset[T] private[sql]( * @group typedrel * @since 1.6.0 */ - def intersect(other: Dataset[T]): Dataset[T] = withTypedPlan { + def intersect(other: Dataset[T]): Dataset[T] = withSetOperator { Intersect(logicalPlan, other.logicalPlan) } @@ -1554,7 +1554,7 @@ class Dataset[T] private[sql]( * @group typedrel * @since 2.0.0 */ - def except(other: Dataset[T]): Dataset[T] = withTypedPlan { + def except(other: Dataset[T]): Dataset[T] = withSetOperator { Except(logicalPlan, other.logicalPlan) } @@ -2725,4 +2725,14 @@ class Dataset[T] private[sql]( @inline private def withTypedPlan[U : Encoder](logicalPlan: => LogicalPlan): Dataset[U] = { Dataset(sparkSession, logicalPlan) } + + /** A convenient function to wrap a set based logical plan and produce a Dataset. */ + @inline private def withSetOperator[U : Encoder](logicalPlan: => LogicalPlan): Dataset[U] = { + if (classTag.runtimeClass.isAssignableFrom(classOf[Row])) { + // Set operators widen types (change the schema), so we cannot reuse the row encoder. 
+ Dataset.ofRows(sparkSession, logicalPlan).asInstanceOf[Dataset[U]] + } else { + Dataset(sparkSession, logicalPlan) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 16cc368208485..e87baa454c8b3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import java.io.File import java.nio.charset.StandardCharsets +import java.sql.{Date, Timestamp} import java.util.UUID import scala.util.Random @@ -1615,4 +1616,19 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { qe.assertAnalyzed() } } + + test("SPARK-17123: Performing set operations that combine non-scala native types") { + val dates = Seq( + (new Date(0), BigDecimal.valueOf(1), new Timestamp(2)), + (new Date(3), BigDecimal.valueOf(4), new Timestamp(5)) + ).toDF("date", "timestamp", "decimal") + + val widenTypedRows = Seq( + (new Timestamp(2), 10.5D, "string") + ).toDF("date", "timestamp", "decimal") + + dates.union(widenTypedRows).collect() + dates.except(widenTypedRows).collect() + dates.intersect(widenTypedRows).collect() + } } From 4f1dcd3dce270268b42fbe59409790364fa5c5df Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Sat, 22 Oct 2016 11:59:28 -0700 Subject: [PATCH 094/162] [SPARK-18051][SPARK CORE] fix bug of custom PartitionCoalescer causing serialization exception ## What changes were proposed in this pull request? add a require check in `CoalescedRDD` to make sure the passed in `partitionCoalescer` to be `serializable`. and update the document for api `RDD.coalesce` ## How was this patch tested? Manual.(test code in jira [SPARK-18051]) Author: WeichenXu Closes #15587 from WeichenXu123/fix_coalescer_bug. --- core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala | 4 ++++ core/src/main/scala/org/apache/spark/rdd/RDD.scala | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala index 9c198a61f37af..2cba1febe8759 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala @@ -80,6 +80,10 @@ private[spark] class CoalescedRDD[T: ClassTag]( require(maxPartitions > 0 || maxPartitions == prev.partitions.length, s"Number of partitions ($maxPartitions) must be positive.") + if (partitionCoalescer.isDefined) { + require(partitionCoalescer.get.isInstanceOf[Serializable], + "The partition coalescer passed in must be serializable.") + } override def getPartitions: Array[Partition] = { val pc = partitionCoalescer.getOrElse(new DefaultPartitionCoalescer()) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index be119578d2c35..db535de9e9bb3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -432,7 +432,8 @@ abstract class RDD[T: ClassTag]( * of partitions. This is useful if you have a small number of partitions, * say 100, potentially with a few partitions being abnormally large. Calling * coalesce(1000, shuffle = true) will result in 1000 partitions with the - * data distributed using a hash partitioner. + * data distributed using a hash partitioner. The optional partition coalescer + * passed in must be serializable. 
*/ def coalesce(numPartitions: Int, shuffle: Boolean = false, partitionCoalescer: Option[PartitionCoalescer] = Option.empty) From bc167a2a53f5a795d089e8a884569b1b3e2cd439 Mon Sep 17 00:00:00 2001 From: Sandeep Singh Date: Sat, 22 Oct 2016 12:03:37 -0700 Subject: [PATCH 095/162] [SPARK-928][CORE] Add support for Unsafe-based serializer in Kryo ## What changes were proposed in this pull request? Now since we have migrated to Kryo-3.0.0 in https://issues.apache.org/jira/browse/SPARK-11416, we can gives users option to use unsafe SerDer. It can turned by setting `spark.kryo.useUnsafe` to `true` ## How was this patch tested? Ran existing tests ``` Benchmark Kryo Unsafe vs safe Serialization: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ basicTypes: Int unsafe:true 160 / 178 98.5 10.1 1.0X basicTypes: Long unsafe:true 210 / 218 74.9 13.4 0.8X basicTypes: Float unsafe:true 203 / 213 77.5 12.9 0.8X basicTypes: Double unsafe:true 226 / 235 69.5 14.4 0.7X Array: Int unsafe:true 1087 / 1101 14.5 69.1 0.1X Array: Long unsafe:true 2758 / 2844 5.7 175.4 0.1X Array: Float unsafe:true 1511 / 1552 10.4 96.1 0.1X Array: Double unsafe:true 2942 / 2972 5.3 187.0 0.1X Map of string->Double unsafe:true 2645 / 2739 5.9 168.2 0.1X basicTypes: Int unsafe:false 211 / 218 74.7 13.4 0.8X basicTypes: Long unsafe:false 247 / 253 63.6 15.7 0.6X basicTypes: Float unsafe:false 211 / 216 74.5 13.4 0.8X basicTypes: Double unsafe:false 227 / 233 69.2 14.4 0.7X Array: Int unsafe:false 3012 / 3032 5.2 191.5 0.1X Array: Long unsafe:false 4463 / 4515 3.5 283.8 0.0X Array: Float unsafe:false 2788 / 2868 5.6 177.2 0.1X Array: Double unsafe:false 3558 / 3752 4.4 226.2 0.0X Map of string->Double unsafe:false 2806 / 2933 5.6 178.4 0.1X ``` Author: Sandeep Singh Author: Sandeep Singh Closes #12913 from techaddict/SPARK-928. 
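As a usage illustration only (not part of the patch): the option is read by `KryoSerializer` below under the key `spark.kryo.unsafe`, so enabling the unsafe-based IO from an application looks roughly like this; the master and app name are placeholders.

```scala
// Minimal sketch: run a job with Kryo and the unsafe-based IO enabled.
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("kryo-unsafe-demo")   // placeholder app name
  .setMaster("local[2]")            // placeholder master
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryo.unsafe", "true") // key read by KryoSerializer in this patch

val sc = new SparkContext(conf)
// A shuffle exercises the configured serializer end to end.
val counts = sc.parallelize(1 to 1000).map(x => (x % 10, 1L)).reduceByKey(_ + _).collect()
sc.stop()
```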
--- .../spark/serializer/KryoSerializer.scala | 36 +++-- .../spark/serializer/KryoBenchmark.scala | 139 ++++++++++++++++++ .../serializer/KryoSerializerSuite.scala | 1 + .../UnsafeKryoSerializerSuite.scala | 33 +++++ docs/configuration.md | 8 + 5 files changed, 206 insertions(+), 11 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/serializer/KryoBenchmark.scala create mode 100644 core/src/test/scala/org/apache/spark/serializer/UnsafeKryoSerializerSuite.scala diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 1fba552f70501..0d26281fe1076 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -27,6 +27,7 @@ import scala.reflect.ClassTag import com.esotericsoftware.kryo.{Kryo, KryoException, Serializer => KryoClassSerializer} import com.esotericsoftware.kryo.io.{Input => KryoInput, Output => KryoOutput} +import com.esotericsoftware.kryo.io.{UnsafeInput => KryoUnsafeInput, UnsafeOutput => KryoUnsafeOutput} import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer} import com.twitter.chill.{AllScalaRegistrar, EmptyScalaKryoInstantiator} import org.apache.avro.generic.{GenericData, GenericRecord} @@ -78,8 +79,15 @@ class KryoSerializer(conf: SparkConf) .filter(!_.isEmpty) private val avroSchemas = conf.getAvroSchema + // whether to use unsafe based IO for serialization + private val useUnsafe = conf.getBoolean("spark.kryo.unsafe", false) - def newKryoOutput(): KryoOutput = new KryoOutput(bufferSize, math.max(bufferSize, maxBufferSize)) + def newKryoOutput(): KryoOutput = + if (useUnsafe) { + new KryoUnsafeOutput(bufferSize, math.max(bufferSize, maxBufferSize)) + } else { + new KryoOutput(bufferSize, math.max(bufferSize, maxBufferSize)) + } def newKryo(): Kryo = { val instantiator = new EmptyScalaKryoInstantiator @@ -172,7 +180,7 @@ class KryoSerializer(conf: SparkConf) } override def newInstance(): SerializerInstance = { - new KryoSerializerInstance(this) + new KryoSerializerInstance(this, useUnsafe) } private[spark] override lazy val supportsRelocationOfSerializedObjects: Boolean = { @@ -186,9 +194,12 @@ class KryoSerializer(conf: SparkConf) private[spark] class KryoSerializationStream( serInstance: KryoSerializerInstance, - outStream: OutputStream) extends SerializationStream { + outStream: OutputStream, + useUnsafe: Boolean) extends SerializationStream { + + private[this] var output: KryoOutput = + if (useUnsafe) new KryoUnsafeOutput(outStream) else new KryoOutput(outStream) - private[this] var output: KryoOutput = new KryoOutput(outStream) private[this] var kryo: Kryo = serInstance.borrowKryo() override def writeObject[T: ClassTag](t: T): SerializationStream = { @@ -219,9 +230,12 @@ class KryoSerializationStream( private[spark] class KryoDeserializationStream( serInstance: KryoSerializerInstance, - inStream: InputStream) extends DeserializationStream { + inStream: InputStream, + useUnsafe: Boolean) extends DeserializationStream { + + private[this] var input: KryoInput = + if (useUnsafe) new KryoUnsafeInput(inStream) else new KryoInput(inStream) - private[this] var input: KryoInput = new KryoInput(inStream) private[this] var kryo: Kryo = serInstance.borrowKryo() override def readObject[T: ClassTag](): T = { @@ -248,8 +262,8 @@ class KryoDeserializationStream( } } -private[spark] class KryoSerializerInstance(ks: KryoSerializer) extends 
SerializerInstance { - +private[spark] class KryoSerializerInstance(ks: KryoSerializer, useUnsafe: Boolean) + extends SerializerInstance { /** * A re-used [[Kryo]] instance. Methods will borrow this instance by calling `borrowKryo()`, do * their work, then release the instance by calling `releaseKryo()`. Logically, this is a caching @@ -288,7 +302,7 @@ private[spark] class KryoSerializerInstance(ks: KryoSerializer) extends Serializ // Make these lazy vals to avoid creating a buffer unless we use them. private lazy val output = ks.newKryoOutput() - private lazy val input = new KryoInput() + private lazy val input = if (useUnsafe) new KryoUnsafeInput() else new KryoInput() override def serialize[T: ClassTag](t: T): ByteBuffer = { output.clear() @@ -329,11 +343,11 @@ private[spark] class KryoSerializerInstance(ks: KryoSerializer) extends Serializ } override def serializeStream(s: OutputStream): SerializationStream = { - new KryoSerializationStream(this, s) + new KryoSerializationStream(this, s, useUnsafe) } override def deserializeStream(s: InputStream): DeserializationStream = { - new KryoDeserializationStream(this, s) + new KryoDeserializationStream(this, s, useUnsafe) } /** diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoBenchmark.scala b/core/src/test/scala/org/apache/spark/serializer/KryoBenchmark.scala new file mode 100644 index 0000000000000..64be966276140 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/serializer/KryoBenchmark.scala @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.serializer + +import scala.reflect.ClassTag +import scala.util.Random + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.serializer.KryoTest._ +import org.apache.spark.util.Benchmark + +class KryoBenchmark extends SparkFunSuite { + val benchmark = new Benchmark("Benchmark Kryo Unsafe vs safe Serialization", 1024 * 1024 * 15, 10) + + ignore(s"Benchmark Kryo Unsafe vs safe Serialization") { + Seq (true, false).foreach (runBenchmark) + benchmark.run() + + // scalastyle:off + /* + Benchmark Kryo Unsafe vs safe Serialization: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + basicTypes: Int with unsafe:true 151 / 170 104.2 9.6 1.0X + basicTypes: Long with unsafe:true 175 / 191 89.8 11.1 0.9X + basicTypes: Float with unsafe:true 177 / 184 88.8 11.3 0.9X + basicTypes: Double with unsafe:true 193 / 216 81.4 12.3 0.8X + Array: Int with unsafe:true 513 / 587 30.7 32.6 0.3X + Array: Long with unsafe:true 1211 / 1358 13.0 77.0 0.1X + Array: Float with unsafe:true 890 / 964 17.7 56.6 0.2X + Array: Double with unsafe:true 1335 / 1428 11.8 84.9 0.1X + Map of string->Double with unsafe:true 931 / 988 16.9 59.2 0.2X + basicTypes: Int with unsafe:false 197 / 217 79.9 12.5 0.8X + basicTypes: Long with unsafe:false 219 / 240 71.8 13.9 0.7X + basicTypes: Float with unsafe:false 208 / 217 75.7 13.2 0.7X + basicTypes: Double with unsafe:false 208 / 225 75.6 13.2 0.7X + Array: Int with unsafe:false 2559 / 2681 6.1 162.7 0.1X + Array: Long with unsafe:false 3425 / 3516 4.6 217.8 0.0X + Array: Float with unsafe:false 2025 / 2134 7.8 128.7 0.1X + Array: Double with unsafe:false 2241 / 2358 7.0 142.5 0.1X + Map of string->Double with unsafe:false 1044 / 1085 15.1 66.4 0.1X + */ + // scalastyle:on + } + + private def runBenchmark(useUnsafe: Boolean): Unit = { + def check[T: ClassTag](t: T, ser: SerializerInstance): Int = { + if (ser.deserialize[T](ser.serialize(t)) === t) 1 else 0 + } + + // Benchmark Primitives + val basicTypeCount = 1000000 + def basicTypes[T: ClassTag](name: String, gen: () => T): Unit = { + lazy val ser = createSerializer(useUnsafe) + val arrayOfBasicType: Array[T] = Array.fill(basicTypeCount)(gen()) + + benchmark.addCase(s"basicTypes: $name with unsafe:$useUnsafe") { _ => + var sum = 0L + var i = 0 + while (i < basicTypeCount) { + sum += check(arrayOfBasicType(i), ser) + i += 1 + } + sum + } + } + basicTypes("Int", Random.nextInt) + basicTypes("Long", Random.nextLong) + basicTypes("Float", Random.nextFloat) + basicTypes("Double", Random.nextDouble) + + // Benchmark Array of Primitives + val arrayCount = 10000 + def basicTypeArray[T: ClassTag](name: String, gen: () => T): Unit = { + lazy val ser = createSerializer(useUnsafe) + val arrayOfArrays: Array[Array[T]] = + Array.fill(arrayCount)(Array.fill[T](Random.nextInt(arrayCount))(gen())) + + benchmark.addCase(s"Array: $name with unsafe:$useUnsafe") { _ => + var sum = 0L + var i = 0 + while (i < arrayCount) { + val arr = arrayOfArrays(i) + sum += check(arr, ser) + i += 1 + } + sum + } + } + basicTypeArray("Int", Random.nextInt) + basicTypeArray("Long", Random.nextLong) + basicTypeArray("Float", Random.nextFloat) + basicTypeArray("Double", Random.nextDouble) + + // Benchmark Maps + val mapsCount = 1000 + lazy val ser = createSerializer(useUnsafe) + val arrayOfMaps: Array[Map[String, Double]] = Array.fill(mapsCount) { + Array.fill(Random.nextInt(mapsCount)) { + (Random.nextString(mapsCount / 
10), Random.nextDouble()) + }.toMap + } + + benchmark.addCase(s"Map of string->Double with unsafe:$useUnsafe") { _ => + var sum = 0L + var i = 0 + while (i < mapsCount) { + val map = arrayOfMaps(i) + sum += check(map, ser) + i += 1 + } + sum + } + } + + def createSerializer(useUnsafe: Boolean): SerializerInstance = { + val conf = new SparkConf() + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + conf.set("spark.kryo.registrator", classOf[MyRegistrator].getName) + conf.set("spark.kryo.unsafe", useUnsafe.toString) + + new KryoSerializer(conf).newInstance() + } + +} diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala index bc6e98365daef..5040841811054 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala @@ -36,6 +36,7 @@ import org.apache.spark.util.Utils class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryo.registrator", classOf[MyRegistrator].getName) + conf.set("spark.kryo.unsafe", "false") test("SPARK-7392 configuration limits") { val kryoBufferProperty = "spark.kryoserializer.buffer" diff --git a/core/src/test/scala/org/apache/spark/serializer/UnsafeKryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/UnsafeKryoSerializerSuite.scala new file mode 100644 index 0000000000000..d63a45ae4a6a9 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/serializer/UnsafeKryoSerializerSuite.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.serializer + +class UnsafeKryoSerializerSuite extends KryoSerializerSuite { + + // This test suite should run all tests in KryoSerializerSuite with kryo unsafe. + + override def beforeAll() { + conf.set("spark.kryo.unsafe", "true") + super.beforeAll() + } + + override def afterAll() { + conf.set("spark.kryo.unsafe", "false") + super.afterAll() + } +} diff --git a/docs/configuration.md b/docs/configuration.md index a4a99d6fa4630..b07867d99aa9d 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -799,6 +799,14 @@ Apart from these, the following properties are also available, and may be useful See the tuning guide for more details.
    spark.kryo.unsafe false + Whether to use unsafe based Kryo serializer. Can be + substantially faster by using Unsafe Based IO. +
    spark.kryoserializer.buffer.max 64m
    {job.jobId} {job.jobGroup.map(id => s"($id)").getOrElse("")} - {jobTableRow.jobDescription} + {jobTableRow.jobDescription} {killLink} {jobTableRow.lastStageName} diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala index 7b00b558d591a..620c54c2dc0a5 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala @@ -17,6 +17,8 @@ package org.apache.spark.ui.jobs +import javax.servlet.http.HttpServletRequest + import org.apache.spark.scheduler.SchedulingMode import org.apache.spark.ui.{SparkUI, SparkUITab} @@ -35,4 +37,19 @@ private[ui] class JobsTab(parent: SparkUI) extends SparkUITab(parent, "jobs") { attachPage(new AllJobsPage(this)) attachPage(new JobPage(this)) + + def handleKillRequest(request: HttpServletRequest): Unit = { + if (killEnabled && parent.securityManager.checkModifyPermissions(request.getRemoteUser)) { + val jobId = Option(request.getParameter("id")).map(_.toInt) + jobId.foreach { id => + if (jobProgresslistener.activeJobs.contains(id)) { + sc.foreach(_.cancelJob(id)) + // Do a quick pause here to give Spark time to kill the job so it shows up as + // killed after the refresh. Note that this will block the serving thread so the + // time should be limited in duration. + Thread.sleep(100) + } + } + } + } } diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala index 9b9b4681ba5db..c9d0431e2d2f7 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala @@ -353,12 +353,13 @@ private[ui] class StagePagedTable( val killLinkUri = s"$basePathUri/stages/stage/kill/"
    - (kill)
    */ - val killLinkUri = s"$basePathUri/stages/stage/kill/?id=${s.stageId}&terminate=true" + val killLinkUri = s"$basePathUri/stages/stage/kill/?id=${s.stageId}" (kill) + } else { + Seq.empty } val nameLinkUri = s"$basePathUri/stages/stage?id=${s.stageId}&attempt=${s.attemptId}" diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala index 573192ac17d45..c1f25114371f1 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala @@ -39,15 +39,16 @@ private[ui] class StagesTab(parent: SparkUI) extends SparkUITab(parent, "stages" def handleKillRequest(request: HttpServletRequest): Unit = { if (killEnabled && parent.securityManager.checkModifyPermissions(request.getRemoteUser)) { - val killFlag = Option(request.getParameter("terminate")).getOrElse("false").toBoolean - val stageId = Option(request.getParameter("id")).getOrElse("-1").toInt - if (stageId >= 0 && killFlag && progressListener.activeStages.contains(stageId)) { - sc.get.cancelStage(stageId) + val stageId = Option(request.getParameter("id")).map(_.toInt) + stageId.foreach { id => + if (progressListener.activeStages.contains(id)) { + sc.foreach(_.cancelStage(id)) + // Do a quick pause here to give Spark time to kill the stage so it shows up as + // killed after the refresh. Note that this will block the serving thread so the + // time should be limited in duration. + Thread.sleep(100) + } } - // Do a quick pause here to give Spark time to kill the stage so it shows up as - // killed after the refresh. Note that this will block the serving thread so the - // time should be limited in duration. - Thread.sleep(100) } } diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala index fd12a21b7927e..e5d408a167361 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala @@ -194,6 +194,22 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B sc.parallelize(1 to 10).map{x => Thread.sleep(10000); x}.countAsync() } + withSpark(newSparkContext(killEnabled = true)) { sc => + runSlowJob(sc) + eventually(timeout(5 seconds), interval(50 milliseconds)) { + goToUi(sc, "/jobs") + assert(hasKillLink) + } + } + + withSpark(newSparkContext(killEnabled = false)) { sc => + runSlowJob(sc) + eventually(timeout(5 seconds), interval(50 milliseconds)) { + goToUi(sc, "/jobs") + assert(!hasKillLink) + } + } + withSpark(newSparkContext(killEnabled = true)) { sc => runSlowJob(sc) eventually(timeout(5 seconds), interval(50 milliseconds)) { @@ -453,20 +469,24 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B } test("kill stage POST/GET response is correct") { - def getResponseCode(url: URL, method: String): Int = { - val connection = url.openConnection().asInstanceOf[HttpURLConnection] - connection.setRequestMethod(method) - connection.connect() - val code = connection.getResponseCode() - connection.disconnect() - code + withSpark(newSparkContext(killEnabled = true)) { sc => + sc.parallelize(1 to 10).map{x => Thread.sleep(10000); x}.countAsync() + eventually(timeout(5 seconds), interval(50 milliseconds)) { + val url = new URL( + sc.ui.get.appUIAddress.stripSuffix("/") + "/stages/stage/kill/?id=0") + // SPARK-6846: should be POST only but YARN AM doesn't proxy POST + 
getResponseCode(url, "GET") should be (200) + getResponseCode(url, "POST") should be (200) + } } + } + test("kill job POST/GET response is correct") { withSpark(newSparkContext(killEnabled = true)) { sc => sc.parallelize(1 to 10).map{x => Thread.sleep(10000); x}.countAsync() eventually(timeout(5 seconds), interval(50 milliseconds)) { val url = new URL( - sc.ui.get.appUIAddress.stripSuffix("/") + "/stages/stage/kill/?id=0&terminate=true") + sc.ui.get.appUIAddress.stripSuffix("/") + "/jobs/job/kill/?id=0") // SPARK-6846: should be POST only but YARN AM doesn't proxy POST getResponseCode(url, "GET") should be (200) getResponseCode(url, "POST") should be (200) @@ -651,6 +671,17 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B } } + def getResponseCode(url: URL, method: String): Int = { + val connection = url.openConnection().asInstanceOf[HttpURLConnection] + connection.setRequestMethod(method) + try { + connection.connect() + connection.getResponseCode() + } finally { + connection.disconnect() + } + } + def goToUi(sc: SparkContext, path: String): Unit = { goToUi(sc.ui.get, path) } diff --git a/docs/configuration.md b/docs/configuration.md index b07867d99aa9d..6600cb6c0ac09 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -632,7 +632,7 @@ Apart from these, the following properties are also available, and may be useful
    spark.ui.killEnabled true - Allows stages and corresponding jobs to be killed from the web ui. + Allows jobs and stages to be killed from the web UI.
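For a rough manual check of the two kill endpoints exercised in `UISeleniumSuite` above — illustrative only; port 4040 (the default UI port) and the ids are assumptions, and `spark.ui.killEnabled` must be true with a job/stage of id 0 actually running:

```scala
// Illustrative only: hit the kill endpoints the same way the test's helper does.
// Both also accept GET only because the YARN AM proxy does not forward POST (SPARK-6846).
import java.net.{HttpURLConnection, URL}

def post(url: String): Int = {
  val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection]
  conn.setRequestMethod("POST")
  try { conn.connect(); conn.getResponseCode } finally { conn.disconnect() }
}

post("http://localhost:4040/jobs/job/kill/?id=0")     // new job-kill endpoint
post("http://localhost:4040/stages/stage/kill/?id=0") // stage-kill endpoint, no more terminate flag
```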
    10 milliseconds to wait before retrying to fetch Kafka offsets
    maxOffsetsPerTrigger long none Rate limit on maximum number of offsets processed per trigger interval. The specified total number of offsets will be proportionally split across topicPartitions of different volume.
    Kafka's own configurations can be set via `DataStreamReader.option` with `kafka.` prefix, e.g, diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index 537b7b0baa1b1..61cba737d148a 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -96,6 +96,9 @@ private[kafka010] case class KafkaSource( private val offsetFetchAttemptIntervalMs = sourceOptions.getOrElse("fetchOffset.retryIntervalMs", "10").toLong + private val maxOffsetsPerTrigger = + sourceOptions.get("maxOffsetsPerTrigger").map(_.toLong) + /** * A KafkaConsumer used in the driver to query the latest Kafka offsets. This only queries the * offsets and never commits them. @@ -121,6 +124,8 @@ private[kafka010] case class KafkaSource( }.partitionToOffsets } + private var currentPartitionOffsets: Option[Map[TopicPartition, Long]] = None + override def schema: StructType = KafkaSource.kafkaSchema /** Returns the maximum available offset for this source. */ @@ -128,9 +133,54 @@ private[kafka010] case class KafkaSource( // Make sure initialPartitionOffsets is initialized initialPartitionOffsets - val offset = KafkaSourceOffset(fetchLatestOffsets()) - logDebug(s"GetOffset: ${offset.partitionToOffsets.toSeq.map(_.toString).sorted}") - Some(offset) + val latest = fetchLatestOffsets() + val offsets = maxOffsetsPerTrigger match { + case None => + latest + case Some(limit) if currentPartitionOffsets.isEmpty => + rateLimit(limit, initialPartitionOffsets, latest) + case Some(limit) => + rateLimit(limit, currentPartitionOffsets.get, latest) + } + + currentPartitionOffsets = Some(offsets) + logDebug(s"GetOffset: ${offsets.toSeq.map(_.toString).sorted}") + Some(KafkaSourceOffset(offsets)) + } + + /** Proportionally distribute limit number of offsets among topicpartitions */ + private def rateLimit( + limit: Long, + from: Map[TopicPartition, Long], + until: Map[TopicPartition, Long]): Map[TopicPartition, Long] = { + val fromNew = fetchNewPartitionEarliestOffsets(until.keySet.diff(from.keySet).toSeq) + val sizes = until.flatMap { + case (tp, end) => + // If begin isn't defined, something's wrong, but let alert logic in getBatch handle it + from.get(tp).orElse(fromNew.get(tp)).flatMap { begin => + val size = end - begin + logDebug(s"rateLimit $tp size is $size") + if (size > 0) Some(tp -> size) else None + } + } + val total = sizes.values.sum.toDouble + if (total < 1) { + until + } else { + until.map { + case (tp, end) => + tp -> sizes.get(tp).map { size => + val begin = from.get(tp).getOrElse(fromNew(tp)) + val prorate = limit * (size / total) + logDebug(s"rateLimit $tp prorated amount is $prorate") + // Don't completely starve small topicpartitions + val off = begin + (if (prorate < 1) Math.ceil(prorate) else Math.floor(prorate)).toLong + logDebug(s"rateLimit $tp new offset is $off") + // Paranoia, make sure not to return an offset that's past end + Math.min(end, off) + }.getOrElse(end) + } + } } /** @@ -153,11 +203,7 @@ private[kafka010] case class KafkaSource( // Find the new partitions, and get their earliest offsets val newPartitions = untilPartitionOffsets.keySet.diff(fromPartitionOffsets.keySet) - val newPartitionOffsets = if (newPartitions.nonEmpty) { - fetchNewPartitionEarliestOffsets(newPartitions.toSeq) - } else { - Map.empty[TopicPartition, Long] - } + val 
newPartitionOffsets = fetchNewPartitionEarliestOffsets(newPartitions.toSeq) if (newPartitionOffsets.keySet != newPartitions) { // We cannot get from offsets for some partitions. It means they got deleted. val deletedPartitions = newPartitions.diff(newPartitionOffsets.keySet) @@ -221,6 +267,12 @@ private[kafka010] case class KafkaSource( logInfo("GetBatch generating RDD of offset range: " + offsetRanges.sortBy(_.topicPartition.toString).mkString(", ")) + + // On recovery, getBatch will get called before getOffset + if (currentPartitionOffsets.isEmpty) { + currentPartitionOffsets = Some(untilPartitionOffsets) + } + sqlContext.createDataFrame(rdd, schema) } @@ -305,23 +357,28 @@ private[kafka010] case class KafkaSource( * some partitions if they are deleted. */ private def fetchNewPartitionEarliestOffsets( - newPartitions: Seq[TopicPartition]): Map[TopicPartition, Long] = withRetriesWithoutInterrupt { - // Poll to get the latest assigned partitions - consumer.poll(0) - val partitions = consumer.assignment() - consumer.pause(partitions) - logDebug(s"\tPartitions assigned to consumer: $partitions") - - // Get the earliest offset of each partition - consumer.seekToBeginning(partitions) - val partitionOffsets = newPartitions.filter { p => - // When deleting topics happen at the same time, some partitions may not be in `partitions`. - // So we need to ignore them - partitions.contains(p) - }.map(p => p -> consumer.position(p)).toMap - logDebug(s"Got earliest offsets for new partitions: $partitionOffsets") - partitionOffsets - } + newPartitions: Seq[TopicPartition]): Map[TopicPartition, Long] = + if (newPartitions.isEmpty) { + Map.empty[TopicPartition, Long] + } else { + withRetriesWithoutInterrupt { + // Poll to get the latest assigned partitions + consumer.poll(0) + val partitions = consumer.assignment() + consumer.pause(partitions) + logDebug(s"\tPartitions assigned to consumer: $partitions") + + // Get the earliest offset of each partition + consumer.seekToBeginning(partitions) + val partitionOffsets = newPartitions.filter { p => + // When deleting topics happen at the same time, some partitions may not be in + // `partitions`. So we need to ignore them + partitions.contains(p) + }.map(p => p -> consumer.position(p)).toMap + logDebug(s"Got earliest offsets for new partitions: $partitionOffsets") + partitionOffsets + } + } /** * Helper function that does multiple retries on the a body of code that returns offsets. 
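A quick worked example of the proration implemented in `rateLimit` above, using the data set up in the `maxOffsetsPerTrigger` test that follows (three partitions holding 101, 11 and 1 records, limit of 10; the partition names are illustrative):

```scala
// Worked example only: reproduces the proration arithmetic from rateLimit.
val limit = 10L
val sizes = Map("p0" -> 101L, "p1" -> 11L, "p2" -> 1L)
val total = sizes.values.sum.toDouble                    // 113.0
val taken = sizes.map { case (tp, size) =>
  val prorate = limit * (size / total)                   // ~8.94, ~0.97, ~0.09
  // small partitions are never completely starved: anything below 1 is rounded up
  tp -> (if (prorate < 1) math.ceil(prorate) else math.floor(prorate)).toLong
}
// taken == Map(p0 -> 8, p1 -> 1, p2 -> 1): "1 from smallest, 1 from middle, 8 from biggest"
```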
diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index b50688ecb7743..ed4cc75920e8e 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -23,13 +23,14 @@ import scala.util.Random import org.apache.kafka.clients.producer.RecordMetadata import org.apache.kafka.common.TopicPartition +import org.scalatest.concurrent.Eventually._ +import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.streaming.StreamTest +import org.apache.spark.sql.streaming.{ ProcessingTime, StreamTest } import org.apache.spark.sql.test.SharedSQLContext - abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { protected var testUtils: KafkaTestUtils = _ @@ -133,6 +134,72 @@ class KafkaSourceSuite extends KafkaSourceTest { private val topicId = new AtomicInteger(0) + test("maxOffsetsPerTrigger") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 3) + testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) + testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) + testUtils.sendMessages(topic, Array("1"), Some(2)) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("maxOffsetsPerTrigger", 10) + .option("subscribe", topic) + .option("startingOffsets", "earliest") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) + + val clock = new StreamManualClock + + val waitUntilBatchProcessed = AssertOnQuery { q => + eventually(Timeout(streamingTimeout)) { + if (!q.exception.isDefined) { + assert(clock.isStreamWaitingAt(clock.getTimeMillis())) + } + } + if (q.exception.isDefined) { + throw q.exception.get + } + true + } + + testStream(mapped)( + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // 1 from smallest, 1 from middle, 8 from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 + ), + StopStream, + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 + ), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, + 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 + ) + ) + } + test("cannot stop Kafka stream") { val topic = newTopic() testUtils.createTopic(newTopic(), partitions = 5) From 
0b076d4cb6afde2946124e6411ed6a6ce7b8b1a7 Mon Sep 17 00:00:00 2001 From: VinceShieh Date: Thu, 27 Oct 2016 11:52:15 -0700 Subject: [PATCH 142/162] [SPARK-17219][ML] enhanced NaN value handling in Bucketizer ## What changes were proposed in this pull request? This PR is an enhancement of PR with commit ID:57dc326bd00cf0a49da971e9c573c48ae28acaa2. NaN is a special type of value which is commonly seen as invalid. But We find that there are certain cases where NaN are also valuable, thus need special handling. We provided user when dealing NaN values with 3 options, to either reserve an extra bucket for NaN values, or remove the NaN values, or report an error, by setting handleNaN "keep", "skip", or "error"(default) respectively. '''Before: val bucketizer: Bucketizer = new Bucketizer() .setInputCol("feature") .setOutputCol("result") .setSplits(splits) '''After: val bucketizer: Bucketizer = new Bucketizer() .setInputCol("feature") .setOutputCol("result") .setSplits(splits) .setHandleNaN("keep") ## How was this patch tested? Tests added in QuantileDiscretizerSuite, BucketizerSuite and DataFrameStatSuite Signed-off-by: VinceShieh Author: VinceShieh Author: Vincent Xie Author: Joseph K. Bradley Closes #15428 from VinceShieh/spark-17219_followup. --- docs/ml-features.md | 15 ++-- .../apache/spark/ml/feature/Bucketizer.scala | 71 +++++++++++++++++-- .../ml/feature/QuantileDiscretizer.scala | 47 ++++++++++-- .../spark/ml/feature/BucketizerSuite.scala | 26 +++++-- .../ml/feature/QuantileDiscretizerSuite.scala | 35 ++++++--- python/pyspark/ml/feature.py | 5 -- .../apache/spark/sql/DataFrameStatSuite.scala | 4 ++ 7 files changed, 161 insertions(+), 42 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index a7f710fa52e64..64c6a160239cc 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1103,11 +1103,16 @@ for more details on the API. `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned categorical features. The number of bins is set by the `numBuckets` parameter. It is possible -that the number of buckets used will be less than this value, for example, if there are too few -distinct values of the input to create enough distinct quantiles. Note also that NaN values are -handled specially and placed into their own bucket. For example, if 4 buckets are used, then -non-NaN data will be put into buckets[0-3], but NaNs will be counted in a special bucket[4]. -The bin ranges are chosen using an approximate algorithm (see the documentation for +that the number of buckets used will be smaller than this value, for example, if there are too few +distinct values of the input to create enough distinct quantiles. + +NaN values: Note also that QuantileDiscretizer +will raise an error when it finds NaN values in the dataset, but the user can also choose to either +keep or remove NaN values within the dataset by setting `handleInvalid`. If the user chooses to keep +NaN values, they will be handled specially and placed into their own bucket, for example, if 4 buckets +are used, then non-NaN data will be put into buckets[0-3], but NaNs will be counted in a special bucket[4]. + +Algorithm: The bin ranges are chosen using an approximate algorithm (see the documentation for [approxQuantile](api/scala/index.html#org.apache.spark.sql.DataFrameStatFunctions) for a detailed description). The precision of the approximation can be controlled with the `relativeError` parameter. 
When set to zero, exact quantiles are calculated diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index ec0ea05f9e1b1..1143f0f565ebd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -27,6 +27,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql._ +import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} @@ -46,6 +47,9 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String * also includes y. Splits should be of length >= 3 and strictly increasing. * Values at -inf, inf must be explicitly provided to cover all Double values; * otherwise, values outside the splits specified will be treated as errors. + * + * See also [[handleInvalid]], which can optionally create an additional bucket for NaN values. + * * @group param */ @Since("1.4.0") @@ -73,15 +77,47 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String @Since("1.4.0") def setOutputCol(value: String): this.type = set(outputCol, value) + /** + * Param for how to handle invalid entries. Options are skip (filter out rows with + * invalid values), error (throw an error), or keep (keep invalid values in a special additional + * bucket). + * Default: "error" + * @group param + */ + @Since("2.1.0") + val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" + + "invalid entries. Options are skip (filter out rows with invalid values), " + + "error (throw an error), or keep (keep invalid values in a special additional bucket).", + ParamValidators.inArray(Bucketizer.supportedHandleInvalid)) + + /** @group getParam */ + @Since("2.1.0") + def getHandleInvalid: String = $(handleInvalid) + + /** @group setParam */ + @Since("2.1.0") + def setHandleInvalid(value: String): this.type = set(handleInvalid, value) + setDefault(handleInvalid, Bucketizer.ERROR_INVALID) + @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema) - val bucketizer = udf { feature: Double => - Bucketizer.binarySearchForBuckets($(splits), feature) + val (filteredDataset, keepInvalid) = { + if (getHandleInvalid == Bucketizer.SKIP_INVALID) { + // "skip" NaN option is set, will filter out NaN values in the dataset + (dataset.na.drop().toDF(), false) + } else { + (dataset.toDF(), getHandleInvalid == Bucketizer.KEEP_INVALID) + } + } + + val bucketizer: UserDefinedFunction = udf { (feature: Double) => + Bucketizer.binarySearchForBuckets($(splits), feature, keepInvalid) } - val newCol = bucketizer(dataset($(inputCol))) - val newField = prepOutputField(dataset.schema) - dataset.withColumn($(outputCol), newCol, newField.metadata) + + val newCol = bucketizer(filteredDataset($(inputCol))) + val newField = prepOutputField(filteredDataset.schema) + filteredDataset.withColumn($(outputCol), newCol, newField.metadata) } private def prepOutputField(schema: StructType): StructField = { @@ -106,6 +142,12 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String @Since("1.6.0") object Bucketizer extends DefaultParamsReadable[Bucketizer] { + private[feature] val SKIP_INVALID: String = "skip" + private[feature] 
val ERROR_INVALID: String = "error" + private[feature] val KEEP_INVALID: String = "keep" + private[feature] val supportedHandleInvalid: Array[String] = + Array(SKIP_INVALID, ERROR_INVALID, KEEP_INVALID) + /** * We require splits to be of length >= 3 and to be in strictly increasing order. * No NaN split should be accepted. @@ -126,11 +168,26 @@ object Bucketizer extends DefaultParamsReadable[Bucketizer] { /** * Binary searching in several buckets to place each data point. + * @param splits array of split points + * @param feature data point + * @param keepInvalid NaN flag. + * Set "true" to make an extra bucket for NaN values; + * Set "false" to report an error for NaN values + * @return bucket for each data point * @throws SparkException if a feature is < splits.head or > splits.last */ - private[feature] def binarySearchForBuckets(splits: Array[Double], feature: Double): Double = { + + private[feature] def binarySearchForBuckets( + splits: Array[Double], + feature: Double, + keepInvalid: Boolean): Double = { if (feature.isNaN) { - splits.length - 1 + if (keepInvalid) { + splits.length - 1 + } else { + throw new SparkException("Bucketizer encountered NaN value. To handle or skip NaNs," + + " try setting Bucketizer.handleInvalid.") + } } else if (feature == splits.last) { splits.length - 2 } else { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index 05e034d90f6a3..b9e01dde70d85 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -36,6 +36,9 @@ private[feature] trait QuantileDiscretizerBase extends Params /** * Number of buckets (quantiles, or categories) into which data points are grouped. Must * be >= 2. + * + * See also [[handleInvalid]], which can optionally create an additional bucket for NaN values. + * * default: 2 * @group param */ @@ -61,17 +64,41 @@ private[feature] trait QuantileDiscretizerBase extends Params /** @group getParam */ def getRelativeError: Double = getOrDefault(relativeError) + + /** + * Param for how to handle invalid entries. Options are skip (filter out rows with + * invalid values), error (throw an error), or keep (keep invalid values in a special additional + * bucket). + * Default: "error" + * @group param + */ + @Since("2.1.0") + val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" + + "invalid entries. Options are skip (filter out rows with invalid values), " + + "error (throw an error), or keep (keep invalid values in a special additional bucket).", + ParamValidators.inArray(Bucketizer.supportedHandleInvalid)) + setDefault(handleInvalid, Bucketizer.ERROR_INVALID) + + /** @group getParam */ + @Since("2.1.0") + def getHandleInvalid: String = $(handleInvalid) + } /** * `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned * categorical features. The number of bins can be set using the `numBuckets` parameter. It is - * possible that the number of buckets used will be less than this value, for example, if there - * are too few distinct values of the input to create enough distinct quantiles. Note also that - * NaN values are handled specially and placed into their own bucket. For example, if 4 buckets - * are used, then non-NaN data will be put into buckets(0-3), but NaNs will be counted in a special - * bucket(4). 
- * The bin ranges are chosen using an approximate algorithm (see the documentation for + * possible that the number of buckets used will be smaller than this value, for example, if there + * are too few distinct values of the input to create enough distinct quantiles. + * + * NaN handling: Note also that + * QuantileDiscretizer will raise an error when it finds NaN values in the dataset, but the user can + * also choose to either keep or remove NaN values within the dataset by setting `handleInvalid`. + * If the user chooses to keep NaN values, they will be handled specially and placed into their own + * bucket, for example, if 4 buckets are used, then non-NaN data will be put into buckets[0-3], + * but NaNs will be counted in a special bucket[4]. + * + * Algorithm: The bin ranges are chosen using an approximate algorithm (see the documentation for * [[org.apache.spark.sql.DataFrameStatFunctions.approxQuantile approxQuantile]] * for a detailed description). The precision of the approximation can be controlled with the * `relativeError` parameter. The lower and upper bin bounds will be `-Infinity` and `+Infinity`, @@ -100,6 +127,10 @@ final class QuantileDiscretizer @Since("1.6.0") (@Since("1.6.0") override val ui @Since("1.6.0") def setOutputCol(value: String): this.type = set(outputCol, value) + /** @group setParam */ + @Since("2.1.0") + def setHandleInvalid(value: String): this.type = set(handleInvalid, value) + @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { SchemaUtils.checkNumericType(schema, $(inputCol)) @@ -124,7 +155,9 @@ final class QuantileDiscretizer @Since("1.6.0") (@Since("1.6.0") override val ui log.warn(s"Some quantiles were identical. Bucketing to ${distinctSplits.length - 1}" + s" buckets as a result.") } - val bucketizer = new Bucketizer(uid).setSplits(distinctSplits.sorted) + val bucketizer = new Bucketizer(uid) + .setSplits(distinctSplits.sorted) + .setHandleInvalid($(handleInvalid)) copyValues(bucketizer.setParent(this)) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala index 87cdceb267387..aac29137d7911 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala @@ -99,21 +99,32 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa .setOutputCol("result") .setSplits(splits) + bucketizer.setHandleInvalid("keep") bucketizer.transform(dataFrame).select("result", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, s"The feature value is not correct after bucketing. 
Expected $y but found $x") } + + bucketizer.setHandleInvalid("skip") + val skipResults: Array[Double] = bucketizer.transform(dataFrame) + .select("result").as[Double].collect() + assert(skipResults.length === 7) + assert(skipResults.forall(_ !== 4.0)) + + bucketizer.setHandleInvalid("error") + withClue("Bucketizer should throw error when setHandleInvalid=error and given NaN values") { + intercept[SparkException] { + bucketizer.transform(dataFrame).collect() + } + } } test("Bucket continuous features, with NaN splits") { val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity, Double.NaN) - withClue("Invalid NaN split was not caught as an invalid split!") { + withClue("Invalid NaN split was not caught during Bucketizer initialization") { intercept[IllegalArgumentException] { - val bucketizer: Bucketizer = new Bucketizer() - .setInputCol("feature") - .setOutputCol("result") - .setSplits(splits) + new Bucketizer().setSplits(splits) } } } @@ -138,7 +149,8 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa val data = Array.fill(100)(Random.nextDouble()) val splits: Array[Double] = Double.NegativeInfinity +: Array.fill(10)(Random.nextDouble()).sorted :+ Double.PositiveInfinity - val bsResult = Vectors.dense(data.map(x => Bucketizer.binarySearchForBuckets(splits, x))) + val bsResult = Vectors.dense(data.map(x => + Bucketizer.binarySearchForBuckets(splits, x, false))) val lsResult = Vectors.dense(data.map(x => BucketizerSuite.linearSearchForBuckets(splits, x))) assert(bsResult ~== lsResult absTol 1e-5) } @@ -169,7 +181,7 @@ private object BucketizerSuite extends SparkFunSuite { /** Check all values in splits, plus values between all splits. */ def checkBinarySearch(splits: Array[Double]): Unit = { def testFeature(feature: Double, expectedBucket: Double): Unit = { - assert(Bucketizer.binarySearchForBuckets(splits, feature) === expectedBucket, + assert(Bucketizer.binarySearchForBuckets(splits, feature, false) === expectedBucket, s"Expected feature value $feature to be in bucket $expectedBucket with splits:" + s" ${splits.mkString(", ")}") } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala index 6822594044a56..f219f775b2186 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala @@ -17,10 +17,10 @@ package org.apache.spark.ml.feature -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql._ import org.apache.spark.sql.functions.udf class QuantileDiscretizerSuite @@ -76,20 +76,33 @@ class QuantileDiscretizerSuite import spark.implicits._ val numBuckets = 3 - val df = sc.parallelize(Array(1.0, 1.0, 1.0, Double.NaN)) - .map(Tuple1.apply).toDF("input") + val validData = Array(-0.9, -0.5, -0.3, 0.0, 0.2, 0.5, 0.9, Double.NaN, Double.NaN, Double.NaN) + val expectedKeep = Array(0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0) + val expectedSkip = Array(0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 2.0) + val discretizer = new QuantileDiscretizer() .setInputCol("input") .setOutputCol("result") .setNumBuckets(numBuckets) - // Reserve extra one bucket for NaN - val expectedNumBuckets = 
discretizer.fit(df).getSplits.length - 1 - val result = discretizer.fit(df).transform(df) - val observedNumBuckets = result.select("result").distinct.count - assert(observedNumBuckets == expectedNumBuckets, - s"Observed number of buckets are not correct." + - s" Expected $expectedNumBuckets but found $observedNumBuckets") + withClue("QuantileDiscretizer with handleInvalid=error should throw exception for NaN values") { + val dataFrame: DataFrame = validData.toSeq.toDF("input") + intercept[SparkException] { + discretizer.fit(dataFrame).transform(dataFrame).collect() + } + } + + List(("keep", expectedKeep), ("skip", expectedSkip)).foreach{ + case(u, v) => + discretizer.setHandleInvalid(u) + val dataFrame: DataFrame = validData.zip(v).toSeq.toDF("input", "expected") + val result = discretizer.fit(dataFrame).transform(dataFrame) + result.select("result", "expected").collect().foreach { + case Row(x: Double, y: Double) => + assert(x === y, + s"The feature value is not correct after bucketing. Expected $y but found $x") + } + } } test("Test transform method on unseen data") { diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 7683360664ebd..94afe82a36472 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1155,11 +1155,6 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadab `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned categorical features. The number of bins can be set using the :py:attr:`numBuckets` parameter. - It is possible that the number of buckets used will be less than this value, for example, if - there are too few distinct values of the input to create enough distinct quantiles. Note also - that NaN values are handled specially and placed into their own bucket. For example, if 4 - buckets are used, then non-NaN data will be put into buckets(0-3), but NaNs will be counted in - a special bucket(4). The bin ranges are chosen using an approximate algorithm (see the documentation for :py:meth:`~.DataFrameStatFunctions.approxQuantile` for a detailed description). The precision of the approximation can be controlled with the diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 73026c749db45..1383208874a19 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -150,6 +150,10 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { assert(math.abs(d1 - 2 * q1 * n) < error_double) assert(math.abs(d2 - 2 * q2 * n) < error_double) } + // test approxQuantile on NaN values + val dfNaN = Seq(Double.NaN, 1.0, Double.NaN, Double.NaN).toDF("input") + val resNaN = dfNaN.stat.approxQuantile("input", Array(q1, q2), epsilons.head) + assert(resNaN.count(_.isNaN) === 0) } test("crosstab") { From 79fd0cc0584e48fb021c4237877b15abbffb319a Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Thu, 27 Oct 2016 12:32:58 -0700 Subject: [PATCH 143/162] [SPARK-16963][SQL] Fix test "StreamExecution metadata garbage collection" ## What changes were proposed in this pull request? A follow up PR for #14553 to fix the flaky test. It's flaky because the file list API doesn't guarantee any order of the return list. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #15661 from zsxwing/fix-StreamingQuerySuite. 
--- .../org/apache/spark/sql/streaming/StreamingQuerySuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index dad410486ed24..464c443beb6e7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -265,7 +265,7 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging { AssertOnQuery("metadata log should contain only two files") { q => val metadataLogDir = new java.io.File(q.offsetLog.metadataPath.toString) val logFileNames = metadataLogDir.listFiles().toSeq.map(_.getName()) - val toTest = logFileNames.filter(! _.endsWith(".crc")) // Workaround for SPARK-17475 + val toTest = logFileNames.filter(! _.endsWith(".crc")).sorted // Workaround for SPARK-17475 assert(toTest.size == 2 && toTest.head == "1") true } From ccb11543048dccd4cc590a8db1df1d9d5847d112 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 27 Oct 2016 14:22:30 -0700 Subject: [PATCH 144/162] [SPARK-17970][SQL] store partition spec in metastore for data source table ## What changes were proposed in this pull request? We should follow hive table and also store partition spec in metastore for data source table. This brings 2 benefits: 1. It's more flexible to manage the table data files, as users can use `ADD PARTITION`, `DROP PARTITION` and `RENAME PARTITION` 2. We don't need to cache all file status for data source table anymore. ## How was this patch tested? existing tests. Author: Eric Liang Author: Michael Allman Author: Eric Liang Author: Wenchen Fan Closes #15515 from cloud-fan/partition. 
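To make the user-facing effect concrete, here is a rough sketch (not part of the patch; the table name and partition values are made up, and it assumes the `spark.sql.hive.manageFilesourcePartitions` flag introduced below is left at its default of `true`) of the partition DDL that becomes available on a partitioned data source table, plus the one-time repair step for tables created by earlier Spark versions:

```scala
// A partitioned data source table now registers its partitions in the Hive metastore.
spark.sql("CREATE TABLE events (id INT, day STRING) USING parquet PARTITIONED BY (day)")

// Partition metadata can be managed directly, instead of being re-inferred by
// listing every file under the table location.
spark.sql("ALTER TABLE events ADD PARTITION (day='2016-10-27')")
spark.sql("ALTER TABLE events PARTITION (day='2016-10-27') RENAME TO PARTITION (day='2016-10-28')")
spark.sql("ALTER TABLE events DROP PARTITION (day='2016-10-28')")

// Tables written before this change keep their old behavior until their partition
// metadata has been imported into the metastore once:
spark.sql("MSCK REPAIR TABLE events")
spark.sql("SHOW PARTITIONS events").show()
```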
--- .../sql/catalyst/catalog/interface.scala | 12 +- .../sql/catalyst/trees/TreeNodeSuite.scala | 1 + .../apache/spark/sql/DataFrameWriter.scala | 13 +- .../command/AnalyzeColumnCommand.scala | 3 +- .../command/AnalyzeTableCommand.scala | 3 +- .../command/createDataSourceTables.scala | 17 +- .../spark/sql/execution/command/ddl.scala | 90 ++++---- .../spark/sql/execution/command/tables.scala | 39 ++-- .../execution/datasources/DataSource.scala | 20 +- .../datasources/DataSourceStrategy.scala | 15 +- .../execution/datasources/FileCatalog.scala | 4 + .../datasources/FileStatusCache.scala | 2 +- .../PartitioningAwareFileCatalog.scala | 12 +- .../datasources/TableFileCatalog.scala | 4 +- .../apache/spark/sql/internal/SQLConf.scala | 16 +- .../apache/spark/sql/SQLQueryTestSuite.scala | 2 +- .../sql/execution/command/DDLSuite.scala | 200 +++++++----------- .../spark/sql/hive/HiveExternalCatalog.scala | 129 +++++++---- .../spark/sql/hive/HiveMetastoreCatalog.scala | 9 +- .../sql/hive/client/HiveClientImpl.scala | 5 +- .../sql/hive/HiveMetadataCacheSuite.scala | 2 +- .../PartitionProviderCompatibilitySuite.scala | 137 ++++++++++++ ...a => PartitionedTablePerfStatsSuite.scala} | 112 +++++++--- .../spark/sql/hive/StatisticsSuite.scala | 65 +++--- .../sql/hive/execution/HiveCommandSuite.scala | 5 +- .../sql/hive/execution/SQLQuerySuite.scala | 8 +- 26 files changed, 596 insertions(+), 329 deletions(-) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala rename sql/hive/src/test/scala/org/apache/spark/sql/hive/{HiveTablePerfStatsSuite.scala => PartitionedTablePerfStatsSuite.scala} (68%) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index a97ed701c4207..7c3bec897956a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -89,9 +89,10 @@ case class CatalogTablePartition( parameters: Map[String, String] = Map.empty) { override def toString: String = { + val specString = spec.map { case (k, v) => s"$k=$v" }.mkString(", ") val output = Seq( - s"Partition Values: [${spec.values.mkString(", ")}]", + s"Partition Values: [$specString]", s"$storage", s"Partition Parameters:{${parameters.map(p => p._1 + "=" + p._2).mkString(", ")}}") @@ -137,6 +138,8 @@ case class BucketSpec( * Can be None if this table is a View, should be "hive" for hive serde tables. * @param unsupportedFeatures is a list of string descriptions of features that are used by the * underlying table but not supported by Spark SQL yet. + * @param partitionProviderIsHive whether this table's partition metadata is stored in the Hive + * metastore. 
*/ case class CatalogTable( identifier: TableIdentifier, @@ -154,7 +157,8 @@ case class CatalogTable( viewOriginalText: Option[String] = None, viewText: Option[String] = None, comment: Option[String] = None, - unsupportedFeatures: Seq[String] = Seq.empty) { + unsupportedFeatures: Seq[String] = Seq.empty, + partitionProviderIsHive: Boolean = false) { /** schema of this table's partition columns */ def partitionSchema: StructType = StructType(schema.filter { @@ -212,11 +216,11 @@ case class CatalogTable( comment.map("Comment: " + _).getOrElse(""), if (properties.nonEmpty) s"Properties: $tableProperties" else "", if (stats.isDefined) s"Statistics: ${stats.get.simpleString}" else "", - s"$storage") + s"$storage", + if (partitionProviderIsHive) "Partition Provider: Hive" else "") output.filter(_.nonEmpty).mkString("CatalogTable(\n\t", "\n\t", ")") } - } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala index cb0426c7a98a1..3eff12f9eed14 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala @@ -489,6 +489,7 @@ class TreeNodeSuite extends SparkFunSuite { "owner" -> "", "createTime" -> 0, "lastAccessTime" -> -1, + "partitionProviderIsHive" -> false, "properties" -> JNull, "unsupportedFeatures" -> List.empty[String])) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 4b5f0246b9a1b..7ff3522f547d3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -25,7 +25,8 @@ import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType} -import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable +import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, Union} +import org.apache.spark.sql.execution.command.AlterTableRecoverPartitionsCommand import org.apache.spark.sql.execution.datasources.{CaseInsensitiveMap, CreateTable, DataSource, HadoopFsRelation} import org.apache.spark.sql.types.StructType @@ -387,7 +388,15 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { partitionColumnNames = partitioningColumns.getOrElse(Nil), bucketSpec = getBucketSpec ) - val cmd = CreateTable(tableDesc, mode, Some(df.logicalPlan)) + val createCmd = CreateTable(tableDesc, mode, Some(df.logicalPlan)) + val cmd = if (tableDesc.partitionColumnNames.nonEmpty && + df.sparkSession.sqlContext.conf.manageFilesourcePartitions) { + // Need to recover partitions into the metastore so our saved data is visible. 
+ val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(tableDesc.identifier) + Union(createCmd, recoverPartitionCmd) + } else { + createCmd + } df.sparkSession.sessionState.executePlan(cmd).toRdd } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index 488138709a12b..f873f34a845ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -50,7 +50,8 @@ case class AnalyzeColumnCommand( AnalyzeTableCommand.calculateTotalSize(sessionState, catalogRel.catalogTable)) case logicalRel: LogicalRelation if logicalRel.catalogTable.isDefined => - updateStats(logicalRel.catalogTable.get, logicalRel.relation.sizeInBytes) + updateStats(logicalRel.catalogTable.get, + AnalyzeTableCommand.calculateTotalSize(sessionState, logicalRel.catalogTable.get)) case otherRelation => throw new AnalysisException("ANALYZE TABLE is not supported for " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala index 7b0e49b665f42..52a8fc88c56cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala @@ -51,7 +51,8 @@ case class AnalyzeTableCommand( // data source tables have been converted into LogicalRelations case logicalRel: LogicalRelation if logicalRel.catalogTable.isDefined => - updateTableStats(logicalRel.catalogTable.get, logicalRel.relation.sizeInBytes) + updateTableStats(logicalRel.catalogTable.get, + AnalyzeTableCommand.calculateTotalSize(sessionState, logicalRel.catalogTable.get)) case otherRelation => throw new AnalysisException("ANALYZE TABLE is not supported for " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index a8c75a7f29cef..2a9743130d4c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -94,10 +94,16 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo val newTable = table.copy( storage = table.storage.copy(properties = optionsWithPath), schema = dataSource.schema, - partitionColumnNames = partitionColumnNames) + partitionColumnNames = partitionColumnNames, + // If metastore partition management for file source tables is enabled, we start off with + // partition provider hive, but no partitions in the metastore. The user has to call + // `msck repair table` to populate the table partitions. + partitionProviderIsHive = partitionColumnNames.nonEmpty && + sparkSession.sessionState.conf.manageFilesourcePartitions) // We will return Nil or throw exception at the beginning if the table already exists, so when // we reach here, the table should not exist and we should set `ignoreIfExists` to false. 
sessionState.catalog.createTable(newTable, ignoreIfExists = false) + Seq.empty[Row] } } @@ -232,6 +238,15 @@ case class CreateDataSourceTableAsSelectCommand( sessionState.catalog.createTable(newTable, ignoreIfExists = false) } + result match { + case fs: HadoopFsRelation if table.partitionColumnNames.nonEmpty && + sparkSession.sqlContext.conf.manageFilesourcePartitions => + // Need to recover partitions into the metastore so our saved data is visible. + sparkSession.sessionState.executePlan( + AlterTableRecoverPartitionsCommand(table.identifier)).toRdd + case _ => + } + // Refresh the cache of the table in the catalog. sessionState.catalog.refreshTable(tableIdentWithDB) Seq.empty[Row] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 15656faa08e4f..61e0550cef5e3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -28,10 +28,11 @@ import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogTable, CatalogTablePartition, CatalogTableType, SessionCatalog} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} -import org.apache.spark.sql.execution.datasources.PartitioningUtils +import org.apache.spark.sql.execution.datasources.{CaseInsensitiveMap, PartitioningUtils} import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration @@ -346,10 +347,7 @@ case class AlterTableAddPartitionCommand( val catalog = sparkSession.sessionState.catalog val table = catalog.getTableMetadata(tableName) DDLUtils.verifyAlterTableType(catalog, table, isView = false) - if (DDLUtils.isDatasourceTable(table)) { - throw new AnalysisException( - "ALTER TABLE ADD PARTITION is not allowed for tables defined using the datasource API") - } + DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "ALTER TABLE ADD PARTITION") val parts = partitionSpecsAndLocs.map { case (spec, location) => val normalizedSpec = PartitioningUtils.normalizePartitionSpec( spec, @@ -382,11 +380,8 @@ case class AlterTableRenamePartitionCommand( override def run(sparkSession: SparkSession): Seq[Row] = { val catalog = sparkSession.sessionState.catalog val table = catalog.getTableMetadata(tableName) - if (DDLUtils.isDatasourceTable(table)) { - throw new AnalysisException( - "ALTER TABLE RENAME PARTITION is not allowed for tables defined using the datasource API") - } DDLUtils.verifyAlterTableType(catalog, table, isView = false) + DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "ALTER TABLE RENAME PARTITION") val normalizedOldPartition = PartitioningUtils.normalizePartitionSpec( oldPartition, @@ -432,10 +427,7 @@ case class AlterTableDropPartitionCommand( val catalog = sparkSession.sessionState.catalog val table = catalog.getTableMetadata(tableName) DDLUtils.verifyAlterTableType(catalog, table, isView = false) - if (DDLUtils.isDatasourceTable(table)) { - throw new AnalysisException( - "ALTER TABLE DROP PARTITIONS is not allowed for tables defined using the datasource API") - } + DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, 
"ALTER TABLE DROP PARTITION") val normalizedSpecs = specs.map { spec => PartitioningUtils.normalizePartitionSpec( @@ -493,33 +485,39 @@ case class AlterTableRecoverPartitionsCommand( } } + private def getBasePath(table: CatalogTable): Option[String] = { + if (table.provider == Some("hive")) { + table.storage.locationUri + } else { + new CaseInsensitiveMap(table.storage.properties).get("path") + } + } + override def run(spark: SparkSession): Seq[Row] = { val catalog = spark.sessionState.catalog val table = catalog.getTableMetadata(tableName) val tableIdentWithDB = table.identifier.quotedString DDLUtils.verifyAlterTableType(catalog, table, isView = false) - if (DDLUtils.isDatasourceTable(table)) { - throw new AnalysisException( - s"Operation not allowed: $cmd on datasource tables: $tableIdentWithDB") - } if (table.partitionColumnNames.isEmpty) { throw new AnalysisException( s"Operation not allowed: $cmd only works on partitioned tables: $tableIdentWithDB") } - if (table.storage.locationUri.isEmpty) { + + val tablePath = getBasePath(table) + if (tablePath.isEmpty) { throw new AnalysisException(s"Operation not allowed: $cmd only works on table with " + s"location provided: $tableIdentWithDB") } - val root = new Path(table.storage.locationUri.get) + val root = new Path(tablePath.get) logInfo(s"Recover all the partitions in $root") val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration) val threshold = spark.conf.get("spark.rdd.parallelListingThreshold", "10").toInt val hadoopConf = spark.sparkContext.hadoopConfiguration val pathFilter = getPathFilter(hadoopConf) - val partitionSpecsAndLocs = scanPartitions( - spark, fs, pathFilter, root, Map(), table.partitionColumnNames.map(_.toLowerCase), threshold) + val partitionSpecsAndLocs = scanPartitions(spark, fs, pathFilter, root, Map(), + table.partitionColumnNames, threshold, spark.sessionState.conf.resolver) val total = partitionSpecsAndLocs.length logInfo(s"Found $total partitions in $root") @@ -531,6 +529,11 @@ case class AlterTableRecoverPartitionsCommand( logInfo(s"Finished to gather the fast stats for all $total partitions.") addPartitions(spark, table, partitionSpecsAndLocs, partitionStats) + // Updates the table to indicate that its partition metadata is stored in the Hive metastore. + // This is always the case for Hive format tables, but is not true for Datasource tables created + // before Spark 2.1 unless they are converted via `msck repair table`. 
+ spark.sessionState.catalog.alterTable(table.copy(partitionProviderIsHive = true)) + catalog.refreshTable(tableName) logInfo(s"Recovered all partitions ($total).") Seq.empty[Row] } @@ -544,7 +547,8 @@ case class AlterTableRecoverPartitionsCommand( path: Path, spec: TablePartitionSpec, partitionNames: Seq[String], - threshold: Int): GenSeq[(TablePartitionSpec, Path)] = { + threshold: Int, + resolver: Resolver): GenSeq[(TablePartitionSpec, Path)] = { if (partitionNames.isEmpty) { return Seq(spec -> path) } @@ -563,15 +567,15 @@ case class AlterTableRecoverPartitionsCommand( val name = st.getPath.getName if (st.isDirectory && name.contains("=")) { val ps = name.split("=", 2) - val columnName = PartitioningUtils.unescapePathName(ps(0)).toLowerCase + val columnName = PartitioningUtils.unescapePathName(ps(0)) // TODO: Validate the value val value = PartitioningUtils.unescapePathName(ps(1)) - // comparing with case-insensitive, but preserve the case - if (columnName == partitionNames.head) { - scanPartitions(spark, fs, filter, st.getPath, spec ++ Map(columnName -> value), - partitionNames.drop(1), threshold) + if (resolver(columnName, partitionNames.head)) { + scanPartitions(spark, fs, filter, st.getPath, spec ++ Map(partitionNames.head -> value), + partitionNames.drop(1), threshold, resolver) } else { - logWarning(s"expect partition column ${partitionNames.head}, but got ${ps(0)}, ignore it") + logWarning( + s"expected partition column ${partitionNames.head}, but got ${ps(0)}, ignoring it") Seq() } } else { @@ -676,16 +680,11 @@ case class AlterTableSetLocationCommand( DDLUtils.verifyAlterTableType(catalog, table, isView = false) partitionSpec match { case Some(spec) => + DDLUtils.verifyPartitionProviderIsHive( + sparkSession, table, "ALTER TABLE ... SET LOCATION") // Partition spec is specified, so we set the location only for this partition val part = catalog.getPartition(table.identifier, spec) - val newPart = - if (DDLUtils.isDatasourceTable(table)) { - throw new AnalysisException( - "ALTER TABLE SET LOCATION for partition is not allowed for tables defined " + - "using the datasource API") - } else { - part.copy(storage = part.storage.copy(locationUri = Some(location))) - } + val newPart = part.copy(storage = part.storage.copy(locationUri = Some(location))) catalog.alterPartitions(table.identifier, Seq(newPart)) case None => // No partition spec is specified, so we set the location for the table itself @@ -709,6 +708,25 @@ object DDLUtils { table.provider.isDefined && table.provider.get != "hive" } + /** + * Throws a standard error for actions that require partitionProvider = hive. + */ + def verifyPartitionProviderIsHive( + spark: SparkSession, table: CatalogTable, action: String): Unit = { + val tableName = table.identifier.table + if (!spark.sqlContext.conf.manageFilesourcePartitions && isDatasourceTable(table)) { + throw new AnalysisException( + s"$action is not allowed on $tableName since filesource partition management is " + + "disabled (spark.sql.hive.manageFilesourcePartitions = false).") + } + if (!table.partitionProviderIsHive && isDatasourceTable(table)) { + throw new AnalysisException( + s"$action is not allowed on $tableName since its partition metadata is not stored in " + + "the Hive metastore. To import this information into the metastore, run " + + s"`msck repair table $tableName`") + } + } + /** * If the command ALTER VIEW is to alter a table or ALTER TABLE is to alter a view, * issue an exception [[AnalysisException]]. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index aec25430b719d..4acfffb628047 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -358,19 +358,16 @@ case class TruncateTableCommand( throw new AnalysisException( s"Operation not allowed: TRUNCATE TABLE on views: $tableIdentwithDB") } - val isDatasourceTable = DDLUtils.isDatasourceTable(table) - if (isDatasourceTable && partitionSpec.isDefined) { - throw new AnalysisException( - s"Operation not allowed: TRUNCATE TABLE ... PARTITION is not supported " + - s"for tables created using the data sources API: $tableIdentwithDB") - } if (table.partitionColumnNames.isEmpty && partitionSpec.isDefined) { throw new AnalysisException( s"Operation not allowed: TRUNCATE TABLE ... PARTITION is not supported " + s"for tables that are not partitioned: $tableIdentwithDB") } + if (partitionSpec.isDefined) { + DDLUtils.verifyPartitionProviderIsHive(spark, table, "TRUNCATE TABLE ... PARTITION") + } val locations = - if (isDatasourceTable) { + if (DDLUtils.isDatasourceTable(table)) { Seq(table.storage.properties.get("path")) } else if (table.partitionColumnNames.isEmpty) { Seq(table.storage.locationUri) @@ -453,7 +450,7 @@ case class DescribeTableCommand( describeFormattedTableInfo(metadata, result) } } else { - describeDetailedPartitionInfo(catalog, metadata, result) + describeDetailedPartitionInfo(sparkSession, catalog, metadata, result) } } @@ -492,6 +489,10 @@ case class DescribeTableCommand( describeStorageInfo(table, buffer) if (table.tableType == CatalogTableType.VIEW) describeViewInfo(table, buffer) + + if (DDLUtils.isDatasourceTable(table) && table.partitionProviderIsHive) { + append(buffer, "Partition Provider:", "Hive", "") + } } private def describeStorageInfo(metadata: CatalogTable, buffer: ArrayBuffer[Row]): Unit = { @@ -528,6 +529,7 @@ case class DescribeTableCommand( } private def describeDetailedPartitionInfo( + spark: SparkSession, catalog: SessionCatalog, metadata: CatalogTable, result: ArrayBuffer[Row]): Unit = { @@ -535,10 +537,7 @@ case class DescribeTableCommand( throw new AnalysisException( s"DESC PARTITION is not allowed on a view: ${table.identifier}") } - if (DDLUtils.isDatasourceTable(metadata)) { - throw new AnalysisException( - s"DESC PARTITION is not allowed on a datasource table: ${table.identifier}") - } + DDLUtils.verifyPartitionProviderIsHive(spark, metadata, "DESC PARTITION") val partition = catalog.getPartition(table, partitionSpec) if (isExtended) { describeExtendedDetailedPartitionInfo(table, metadata, partition, result) @@ -743,10 +742,7 @@ case class ShowPartitionsCommand( s"SHOW PARTITIONS is not allowed on a table that is not partitioned: $tableIdentWithDB") } - if (DDLUtils.isDatasourceTable(table)) { - throw new AnalysisException( - s"SHOW PARTITIONS is not allowed on a datasource table: $tableIdentWithDB") - } + DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "SHOW PARTITIONS") /** * Validate the partitioning spec by making sure all the referenced columns are @@ -894,18 +890,11 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman private def showHiveTableProperties(metadata: CatalogTable, builder: StringBuilder): Unit = { if (metadata.properties.nonEmpty) { - val filteredProps = metadata.properties.filterNot { - // Skips "EXTERNAL" 
property for external tables - case (key, _) => key == "EXTERNAL" && metadata.tableType == EXTERNAL - } - - val props = filteredProps.map { case (key, value) => + val props = metadata.properties.map { case (key, value) => s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'" } - if (props.nonEmpty) { - builder ++= props.mkString("TBLPROPERTIES (\n ", ",\n ", "\n)\n") - } + builder ++= props.mkString("TBLPROPERTIES (\n ", ",\n ", "\n)\n") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 17da606580eea..5b8f05a396241 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -30,7 +30,7 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider @@ -65,6 +65,8 @@ import org.apache.spark.util.Utils * @param partitionColumns A list of column names that the relation is partitioned by. When this * list is empty, the relation is unpartitioned. * @param bucketSpec An optional specification for bucketing (hash-partitioning) of the data. + * @param catalogTable Optional catalog table reference that can be used to push down operations + * over the datasource to the catalog service. */ case class DataSource( sparkSession: SparkSession, @@ -73,7 +75,8 @@ case class DataSource( userSpecifiedSchema: Option[StructType] = None, partitionColumns: Seq[String] = Seq.empty, bucketSpec: Option[BucketSpec] = None, - options: Map[String, String] = Map.empty) extends Logging { + options: Map[String, String] = Map.empty, + catalogTable: Option[CatalogTable] = None) extends Logging { case class SourceInfo(name: String, schema: StructType, partitionColumns: Seq[String]) @@ -412,9 +415,16 @@ case class DataSource( }) } - val fileCatalog = + val fileCatalog = if (sparkSession.sqlContext.conf.manageFilesourcePartitions && + catalogTable.isDefined && catalogTable.get.partitionProviderIsHive) { + new TableFileCatalog( + sparkSession, + catalogTable.get, + catalogTable.get.stats.map(_.sizeInBytes.toLong).getOrElse(0L)) + } else { new ListingFileCatalog( sparkSession, globbedPaths, options, partitionSchema) + } val dataSchema = userSpecifiedSchema.map { schema => val equality = sparkSession.sessionState.conf.resolver @@ -423,7 +433,7 @@ case class DataSource( format.inferSchema( sparkSession, caseInsensitiveOptions, - fileCatalog.allFiles()) + fileCatalog.asInstanceOf[ListingFileCatalog].allFiles()) }.getOrElse { throw new AnalysisException( s"Unable to infer schema for $format at ${allPaths.take(2).mkString(",")}. 
" + @@ -432,7 +442,7 @@ case class DataSource( HadoopFsRelation( fileCatalog, - partitionSchema = fileCatalog.partitionSpec().partitionColumns, + partitionSchema = fileCatalog.partitionSchema, dataSchema = dataSchema.asNullable, bucketSpec = bucketSpec, format, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 7d0abe86a44df..f0bcf94eadc96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -30,11 +30,11 @@ import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Union} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, UnknownPartitioning} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan} -import org.apache.spark.sql.execution.command.{DDLUtils, ExecutedCommandExec} +import org.apache.spark.sql.execution.command.{AlterTableRecoverPartitionsCommand, DDLUtils, ExecutedCommandExec} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -179,7 +179,7 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { "Cannot overwrite a path that is also being read from.") } - InsertIntoHadoopFsRelationCommand( + val insertCmd = InsertIntoHadoopFsRelationCommand( outputPath, query.resolve(t.partitionSchema, t.sparkSession.sessionState.analyzer.resolver), t.bucketSpec, @@ -188,6 +188,15 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { t.options, query, mode) + + if (l.catalogTable.isDefined && l.catalogTable.get.partitionColumnNames.nonEmpty && + l.catalogTable.get.partitionProviderIsHive) { + // TODO(ekl) we should be more efficient here and only recover the newly added partitions + val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(l.catalogTable.get.identifier) + Union(insertCmd, recoverPartitionCmd) + } else { + insertCmd + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala index 2bc66ceeebdb4..dba64624c34b3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala @@ -21,6 +21,7 @@ import org.apache.hadoop.fs._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types.StructType /** * A collection of data files from a partitioned relation, along with the partition values in the @@ -63,4 +64,7 @@ trait FileCatalog { /** Sum of table file sizes, in bytes */ def sizeInBytes: Long + + /** Schema of the partitioning columns, or the empty schema if the table is not partitioned. 
*/ + def partitionSchema: StructType } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala index e0ec748a0b34d..7c2e6fd04d5db 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileStatusCache.scala @@ -64,7 +64,7 @@ object FileStatusCache { */ def newCache(session: SparkSession): FileStatusCache = { synchronized { - if (session.sqlContext.conf.filesourcePartitionPruning && + if (session.sqlContext.conf.manageFilesourcePartitions && session.sqlContext.conf.filesourcePartitionFileCacheSize > 0) { if (sharedCache == null) { sharedCache = new SharedInMemoryCache( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala index 9b1903c47119e..cc4049e925905 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala @@ -38,19 +38,21 @@ import org.apache.spark.util.SerializableConfiguration * It provides the necessary methods to parse partition data based on a set of files. * * @param parameters as set of options to control partition discovery - * @param partitionSchema an optional partition schema that will be use to provide types for the - * discovered partitions -*/ + * @param userPartitionSchema an optional partition schema that will be use to provide types for + * the discovered partitions + */ abstract class PartitioningAwareFileCatalog( sparkSession: SparkSession, parameters: Map[String, String], - partitionSchema: Option[StructType], + userPartitionSchema: Option[StructType], fileStatusCache: FileStatusCache = NoopCache) extends FileCatalog with Logging { import PartitioningAwareFileCatalog.BASE_PATH_PARAM /** Returns the specification of the partitions inferred from the data. 
*/ def partitionSpec(): PartitionSpec + override def partitionSchema: StructType = partitionSpec().partitionColumns + protected val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(parameters) protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus] @@ -122,7 +124,7 @@ abstract class PartitioningAwareFileCatalog( val leafDirs = leafDirToChildrenFiles.filter { case (_, files) => files.exists(f => isDataPath(f.getPath)) }.keys.toSeq - partitionSchema match { + userPartitionSchema match { case Some(userProvidedSchema) if userProvidedSchema.nonEmpty => val spec = PartitioningUtils.parsePartitions( leafDirs, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala index 667379b222c48..b459df5734d43 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala @@ -22,6 +22,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types.StructType /** @@ -45,6 +46,8 @@ class TableFileCatalog( private val baseLocation = table.storage.locationUri + override def partitionSchema: StructType = table.partitionSchema + override def rootPaths: Seq[Path] = baseLocation.map(new Path(_)).toSeq override def listFiles(filters: Seq[Expression]): Seq[PartitionDirectory] = { @@ -63,7 +66,6 @@ class TableFileCatalog( if (table.partitionColumnNames.nonEmpty) { val selectedPartitions = sparkSession.sessionState.catalog.listPartitionsByFilter( table.identifier, filters) - val partitionSchema = table.partitionSchema val partitions = selectedPartitions.map { p => PartitionPath(p.toRow(partitionSchema), p.storage.locationUri.get) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index f47ec7f3963a4..dc31f3bc323f6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -272,18 +272,20 @@ object SQLConf { .booleanConf .createWithDefault(true) - val HIVE_FILESOURCE_PARTITION_PRUNING = - SQLConfigBuilder("spark.sql.hive.filesourcePartitionPruning") - .doc("When true, enable metastore partition pruning for filesource relations as well. " + - "This is currently implemented for converted Hive tables only.") + val HIVE_MANAGE_FILESOURCE_PARTITIONS = + SQLConfigBuilder("spark.sql.hive.manageFilesourcePartitions") + .doc("When true, enable metastore partition management for file source tables as well. " + + "This includes both datasource and converted Hive tables. When partition managment " + + "is enabled, datasource tables store partition in the Hive metastore, and use the " + + "metastore to prune partitions during query planning.") .booleanConf .createWithDefault(true) val HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE = SQLConfigBuilder("spark.sql.hive.filesourcePartitionFileCacheSize") - .doc("When nonzero, enable caching of partition file metadata in memory. All table share " + + .doc("When nonzero, enable caching of partition file metadata in memory. All tables share " + "a cache that can use up to specified num bytes for file metadata. 
This conf only " + - "applies if filesource partition pruning is also enabled.") + "has an effect when hive filesource partition management is enabled.") .longConf .createWithDefault(250 * 1024 * 1024) @@ -679,7 +681,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING) - def filesourcePartitionPruning: Boolean = getConf(HIVE_FILESOURCE_PARTITION_PRUNING) + def manageFilesourcePartitions: Boolean = getConf(HIVE_MANAGE_FILESOURCE_PARTITIONS) def filesourcePartitionFileCacheSize: Long = getConf(HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 6857dd37286dd..2d73d9f1fc802 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -197,7 +197,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { assertResult(expected.schema, s"Schema did not match for query #$i\n${expected.sql}") { output.schema } - assertResult(expected.output, s"Result dit not match for query #$i\n${expected.sql}") { + assertResult(expected.output, s"Result did not match for query #$i\n${expected.sql}") { output.output } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index b989d01ec787a..9fb0f5384d889 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -95,7 +95,8 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { .add("b", "int"), provider = Some("hive"), partitionColumnNames = Seq("a", "b"), - createTime = 0L) + createTime = 0L, + partitionProviderIsHive = true) } private def createTable(catalog: SessionCatalog, name: TableIdentifier): Unit = { @@ -923,68 +924,11 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } test("alter table: rename partition") { - val catalog = spark.sessionState.catalog - val tableIdent = TableIdentifier("tab1", Some("dbx")) - createPartitionedTable(tableIdent, isDatasourceTable = false) - - // basic rename partition - sql("ALTER TABLE dbx.tab1 PARTITION (a='1', b='q') RENAME TO PARTITION (a='100', b='p')") - sql("ALTER TABLE dbx.tab1 PARTITION (a='2', b='c') RENAME TO PARTITION (a='20', b='c')") - assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == - Set(Map("a" -> "100", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p"))) - - // rename without explicitly specifying database - catalog.setCurrentDatabase("dbx") - sql("ALTER TABLE tab1 PARTITION (a='100', b='p') RENAME TO PARTITION (a='10', b='p')") - assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == - Set(Map("a" -> "10", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p"))) - - // table to alter does not exist - intercept[NoSuchTableException] { - sql("ALTER TABLE does_not_exist PARTITION (c='3') RENAME TO PARTITION (c='333')") - } - - // partition to rename does not exist - intercept[NoSuchPartitionException] { - sql("ALTER TABLE tab1 PARTITION (a='not_found', b='1') RENAME TO PARTITION (a='1', b='2')") - } - - // partition spec in RENAME PARTITION should be case insensitive by default - 
sql("ALTER TABLE tab1 PARTITION (A='10', B='p') RENAME TO PARTITION (A='1', B='p')") - assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == - Set(Map("a" -> "1", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p"))) + testRenamePartitions(isDatasourceTable = false) } test("alter table: rename partition (datasource table)") { - createPartitionedTable(TableIdentifier("tab1", Some("dbx")), isDatasourceTable = true) - val e = intercept[AnalysisException] { - sql("ALTER TABLE dbx.tab1 PARTITION (a='1', b='q') RENAME TO PARTITION (a='100', b='p')") - }.getMessage - assert(e.contains( - "ALTER TABLE RENAME PARTITION is not allowed for tables defined using the datasource API")) - // table to alter does not exist - intercept[NoSuchTableException] { - sql("ALTER TABLE does_not_exist PARTITION (c='3') RENAME TO PARTITION (c='333')") - } - } - - private def createPartitionedTable( - tableIdent: TableIdentifier, - isDatasourceTable: Boolean): Unit = { - val catalog = spark.sessionState.catalog - val part1 = Map("a" -> "1", "b" -> "q") - val part2 = Map("a" -> "2", "b" -> "c") - val part3 = Map("a" -> "3", "b" -> "p") - createDatabase(catalog, "dbx") - createTable(catalog, tableIdent) - createTablePartition(catalog, part1, tableIdent) - createTablePartition(catalog, part2, tableIdent) - createTablePartition(catalog, part3, tableIdent) - assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == - Set(part1, part2, part3)) - if (isDatasourceTable) { - convertToDatasourceTable(catalog, tableIdent) - } + testRenamePartitions(isDatasourceTable = true) } test("show tables") { @@ -1199,7 +1143,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { if (isDatasourceTable) { if (spec.isDefined) { assert(storageFormat.properties.isEmpty) - assert(storageFormat.locationUri.isEmpty) + assert(storageFormat.locationUri === Some(expected)) } else { assert(storageFormat.properties.get("path") === Some(expected)) assert(storageFormat.locationUri === Some(expected)) @@ -1212,18 +1156,14 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { sql("ALTER TABLE dbx.tab1 SET LOCATION '/path/to/your/lovely/heart'") verifyLocation("/path/to/your/lovely/heart") // set table partition location - maybeWrapException(isDatasourceTable) { - sql("ALTER TABLE dbx.tab1 PARTITION (a='1', b='2') SET LOCATION '/path/to/part/ways'") - } + sql("ALTER TABLE dbx.tab1 PARTITION (a='1', b='2') SET LOCATION '/path/to/part/ways'") verifyLocation("/path/to/part/ways", Some(partSpec)) // set table location without explicitly specifying database catalog.setCurrentDatabase("dbx") sql("ALTER TABLE tab1 SET LOCATION '/swanky/steak/place'") verifyLocation("/swanky/steak/place") // set table partition location without explicitly specifying database - maybeWrapException(isDatasourceTable) { - sql("ALTER TABLE tab1 PARTITION (a='1', b='2') SET LOCATION 'vienna'") - } + sql("ALTER TABLE tab1 PARTITION (a='1', b='2') SET LOCATION 'vienna'") verifyLocation("vienna", Some(partSpec)) // table to alter does not exist intercept[AnalysisException] { @@ -1354,26 +1294,18 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1)) // basic add partition - maybeWrapException(isDatasourceTable) { - sql("ALTER TABLE dbx.tab1 ADD IF NOT EXISTS " + - "PARTITION (a='2', b='6') LOCATION 'paris' PARTITION (a='3', b='7')") - } - if (!isDatasourceTable) { - 
assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2, part3)) - assert(catalog.getPartition(tableIdent, part1).storage.locationUri.isEmpty) - assert(catalog.getPartition(tableIdent, part2).storage.locationUri == Option("paris")) - assert(catalog.getPartition(tableIdent, part3).storage.locationUri.isEmpty) - } + sql("ALTER TABLE dbx.tab1 ADD IF NOT EXISTS " + + "PARTITION (a='2', b='6') LOCATION 'paris' PARTITION (a='3', b='7')") + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2, part3)) + assert(catalog.getPartition(tableIdent, part1).storage.locationUri.isEmpty) + assert(catalog.getPartition(tableIdent, part2).storage.locationUri == Option("paris")) + assert(catalog.getPartition(tableIdent, part3).storage.locationUri.isEmpty) // add partitions without explicitly specifying database catalog.setCurrentDatabase("dbx") - maybeWrapException(isDatasourceTable) { - sql("ALTER TABLE tab1 ADD IF NOT EXISTS PARTITION (a='4', b='8')") - } - if (!isDatasourceTable) { - assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == - Set(part1, part2, part3, part4)) - } + sql("ALTER TABLE tab1 ADD IF NOT EXISTS PARTITION (a='4', b='8')") + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == + Set(part1, part2, part3, part4)) // table to alter does not exist intercept[AnalysisException] { @@ -1386,22 +1318,14 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } // partition to add already exists when using IF NOT EXISTS - maybeWrapException(isDatasourceTable) { - sql("ALTER TABLE tab1 ADD IF NOT EXISTS PARTITION (a='4', b='8')") - } - if (!isDatasourceTable) { - assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == - Set(part1, part2, part3, part4)) - } + sql("ALTER TABLE tab1 ADD IF NOT EXISTS PARTITION (a='4', b='8')") + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == + Set(part1, part2, part3, part4)) // partition spec in ADD PARTITION should be case insensitive by default - maybeWrapException(isDatasourceTable) { - sql("ALTER TABLE tab1 ADD PARTITION (A='9', B='9')") - } - if (!isDatasourceTable) { - assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == - Set(part1, part2, part3, part4, part5)) - } + sql("ALTER TABLE tab1 ADD PARTITION (A='9', B='9')") + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == + Set(part1, part2, part3, part4, part5)) } private def testDropPartitions(isDatasourceTable: Boolean): Unit = { @@ -1424,21 +1348,13 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } // basic drop partition - maybeWrapException(isDatasourceTable) { - sql("ALTER TABLE dbx.tab1 DROP IF EXISTS PARTITION (a='4', b='8'), PARTITION (a='3', b='7')") - } - if (!isDatasourceTable) { - assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2)) - } + sql("ALTER TABLE dbx.tab1 DROP IF EXISTS PARTITION (a='4', b='8'), PARTITION (a='3', b='7')") + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2)) // drop partitions without explicitly specifying database catalog.setCurrentDatabase("dbx") - maybeWrapException(isDatasourceTable) { - sql("ALTER TABLE tab1 DROP IF EXISTS PARTITION (a='2', b ='6')") - } - if (!isDatasourceTable) { - assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1)) - } + sql("ALTER TABLE tab1 DROP IF EXISTS PARTITION (a='2', b ='6')") + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1)) // table to alter does not 
exist intercept[AnalysisException] { @@ -1451,20 +1367,56 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } // partition to drop does not exist when using IF EXISTS - maybeWrapException(isDatasourceTable) { - sql("ALTER TABLE tab1 DROP IF EXISTS PARTITION (a='300')") - } - if (!isDatasourceTable) { - assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1)) - } + sql("ALTER TABLE tab1 DROP IF EXISTS PARTITION (a='300')") + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1)) // partition spec in DROP PARTITION should be case insensitive by default - maybeWrapException(isDatasourceTable) { - sql("ALTER TABLE tab1 DROP PARTITION (A='1', B='5')") + sql("ALTER TABLE tab1 DROP PARTITION (A='1', B='5')") + assert(catalog.listPartitions(tableIdent).isEmpty) + } + + private def testRenamePartitions(isDatasourceTable: Boolean): Unit = { + val catalog = spark.sessionState.catalog + val tableIdent = TableIdentifier("tab1", Some("dbx")) + val part1 = Map("a" -> "1", "b" -> "q") + val part2 = Map("a" -> "2", "b" -> "c") + val part3 = Map("a" -> "3", "b" -> "p") + createDatabase(catalog, "dbx") + createTable(catalog, tableIdent) + createTablePartition(catalog, part1, tableIdent) + createTablePartition(catalog, part2, tableIdent) + createTablePartition(catalog, part3, tableIdent) + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2, part3)) + if (isDatasourceTable) { + convertToDatasourceTable(catalog, tableIdent) + } + + // basic rename partition + sql("ALTER TABLE dbx.tab1 PARTITION (a='1', b='q') RENAME TO PARTITION (a='100', b='p')") + sql("ALTER TABLE dbx.tab1 PARTITION (a='2', b='c') RENAME TO PARTITION (a='20', b='c')") + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == + Set(Map("a" -> "100", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p"))) + + // rename without explicitly specifying database + catalog.setCurrentDatabase("dbx") + sql("ALTER TABLE tab1 PARTITION (a='100', b='p') RENAME TO PARTITION (a='10', b='p')") + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == + Set(Map("a" -> "10", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p"))) + + // table to alter does not exist + intercept[NoSuchTableException] { + sql("ALTER TABLE does_not_exist PARTITION (c='3') RENAME TO PARTITION (c='333')") } - if (!isDatasourceTable) { - assert(catalog.listPartitions(tableIdent).isEmpty) + + // partition to rename does not exist + intercept[NoSuchPartitionException] { + sql("ALTER TABLE tab1 PARTITION (a='not_found', b='1') RENAME TO PARTITION (a='1', b='2')") } + + // partition spec in RENAME PARTITION should be case insensitive by default + sql("ALTER TABLE tab1 PARTITION (A='10', B='p') RENAME TO PARTITION (A='1', B='p')") + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == + Set(Map("a" -> "1", "b" -> "p"), Map("a" -> "20", "b" -> "c"), Map("a" -> "3", "b" -> "p"))) } test("drop build-in function") { @@ -1683,12 +1635,16 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } } - // truncating partitioned data source tables is not supported withTable("rectangles", "rectangles2") { data.write.saveAsTable("rectangles") data.write.partitionBy("length").saveAsTable("rectangles2") + + // not supported since the table is not partitioned assertUnsupported("TRUNCATE TABLE rectangles PARTITION (width=1)") - assertUnsupported("TRUNCATE TABLE rectangles2 PARTITION (width=1)") + + // supported since 
partitions are stored in the metastore + sql("TRUNCATE TABLE rectangles2 PARTITION (width=1)") + assert(spark.table("rectangles2").collect().isEmpty) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 2003ff42d4f0c..409c316c6802c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.execution.command.{ColumnStatStruct, DDLUtils} import org.apache.spark.sql.execution.datasources.CaseInsensitiveMap import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.internal.HiveSerDe +import org.apache.spark.sql.internal.SQLConf._ import org.apache.spark.sql.internal.StaticSQLConf._ import org.apache.spark.sql.types.{DataType, StructField, StructType} @@ -105,13 +106,11 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat * metastore. */ private def verifyTableProperties(table: CatalogTable): Unit = { - val invalidKeys = table.properties.keys.filter { key => - key.startsWith(DATASOURCE_PREFIX) || key.startsWith(STATISTICS_PREFIX) - } + val invalidKeys = table.properties.keys.filter(_.startsWith(SPARK_SQL_PREFIX)) if (invalidKeys.nonEmpty) { throw new AnalysisException(s"Cannot persistent ${table.qualifiedName} into hive metastore " + - s"as table property keys may not start with '$DATASOURCE_PREFIX' or '$STATISTICS_PREFIX':" + - s" ${invalidKeys.mkString("[", ", ", "]")}") + s"as table property keys may not start with '$SPARK_SQL_PREFIX': " + + invalidKeys.mkString("[", ", ", "]")) } // External users are not allowed to set/switch the table type. In Hive metastore, the table // type can be switched by changing the value of a case-sensitive table property `EXTERNAL`. @@ -190,11 +189,12 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat throw new TableAlreadyExistsException(db = db, table = table) } // Before saving data source table metadata into Hive metastore, we should: - // 1. Put table schema, partition column names and bucket specification in table properties. + // 1. Put table provider, schema, partition column names, bucket specification and partition + // provider in table properties. // 2. Check if this table is hive compatible // 2.1 If it's not hive compatible, set schema, partition columns and bucket spec to empty // and save table metadata to Hive. - // 2.1 If it's hive compatible, set serde information in table metadata and try to save + // 2.2 If it's hive compatible, set serde information in table metadata and try to save // it to Hive. If it fails, treat it as not hive compatible and go back to 2.1 if (DDLUtils.isDatasourceTable(tableDefinition)) { // data source table always have a provider, it's guaranteed by `DDLUtils.isDatasourceTable`. @@ -204,6 +204,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val tableProperties = new scala.collection.mutable.HashMap[String, String] tableProperties.put(DATASOURCE_PROVIDER, provider) + if (tableDefinition.partitionProviderIsHive) { + tableProperties.put(TABLE_PARTITION_PROVIDER, "hive") + } // Serialized JSON schema string may be too long to be stored into a single metastore table // property. 
In this case, we split the JSON string and store each part as a separate table @@ -241,12 +244,12 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat } } - // converts the table metadata to Spark SQL specific format, i.e. set schema, partition column - // names and bucket specification to empty. + // converts the table metadata to Spark SQL specific format, i.e. set data schema, names and + // bucket specification to empty. Note that partition columns are retained, so that we can + // call partition-related Hive API later. def newSparkSQLSpecificMetastoreTable(): CatalogTable = { tableDefinition.copy( - schema = new StructType, - partitionColumnNames = Nil, + schema = tableDefinition.partitionSchema, bucketSpec = None, properties = tableDefinition.properties ++ tableProperties) } @@ -419,12 +422,17 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat // Sets the `schema`, `partitionColumnNames` and `bucketSpec` from the old table definition, // to retain the spark specific format if it is. Also add old data source properties to table // properties, to retain the data source table format. - val oldDataSourceProps = oldDef.properties.filter(_._1.startsWith(DATASOURCE_PREFIX)) + val oldDataSourceProps = oldDef.properties.filter(_._1.startsWith(SPARK_SQL_PREFIX)) + val partitionProviderProp = if (tableDefinition.partitionProviderIsHive) { + TABLE_PARTITION_PROVIDER -> "hive" + } else { + TABLE_PARTITION_PROVIDER -> "builtin" + } val newDef = withStatsProps.copy( schema = oldDef.schema, partitionColumnNames = oldDef.partitionColumnNames, bucketSpec = oldDef.bucketSpec, - properties = oldDataSourceProps ++ withStatsProps.properties) + properties = oldDataSourceProps ++ withStatsProps.properties + partitionProviderProp) client.alterTable(newDef) } else { @@ -448,7 +456,11 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat * properties, and filter out these special entries from table properties. 
*/ private def restoreTableMetadata(table: CatalogTable): CatalogTable = { - val catalogTable = if (table.tableType == VIEW || conf.get(DEBUG_MODE)) { + if (conf.get(DEBUG_MODE)) { + return table + } + + val tableWithSchema = if (table.tableType == VIEW) { table } else { getProviderFromTableProperties(table).map { provider => @@ -473,30 +485,32 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat provider = Some(provider), partitionColumnNames = getPartitionColumnsFromTableProperties(table), bucketSpec = getBucketSpecFromTableProperties(table), - properties = getOriginalTableProperties(table)) + partitionProviderIsHive = table.properties.get(TABLE_PARTITION_PROVIDER) == Some("hive")) } getOrElse { - table.copy(provider = Some("hive")) + table.copy(provider = Some("hive"), partitionProviderIsHive = true) } } + // construct Spark's statistics from information in Hive metastore - val statsProps = catalogTable.properties.filterKeys(_.startsWith(STATISTICS_PREFIX)) - if (statsProps.nonEmpty) { + val statsProps = tableWithSchema.properties.filterKeys(_.startsWith(STATISTICS_PREFIX)) + val tableWithStats = if (statsProps.nonEmpty) { val colStatsProps = statsProps.filterKeys(_.startsWith(STATISTICS_COL_STATS_PREFIX)) .map { case (k, v) => (k.drop(STATISTICS_COL_STATS_PREFIX.length), v) } - val colStats: Map[String, ColumnStat] = catalogTable.schema.collect { + val colStats: Map[String, ColumnStat] = tableWithSchema.schema.collect { case f if colStatsProps.contains(f.name) => val numFields = ColumnStatStruct.numStatFields(f.dataType) (f.name, ColumnStat(numFields, colStatsProps(f.name))) }.toMap - catalogTable.copy( - properties = removeStatsProperties(catalogTable), + tableWithSchema.copy( stats = Some(Statistics( - sizeInBytes = BigInt(catalogTable.properties(STATISTICS_TOTAL_SIZE)), - rowCount = catalogTable.properties.get(STATISTICS_NUM_ROWS).map(BigInt(_)), + sizeInBytes = BigInt(tableWithSchema.properties(STATISTICS_TOTAL_SIZE)), + rowCount = tableWithSchema.properties.get(STATISTICS_NUM_ROWS).map(BigInt(_)), colStats = colStats))) } else { - catalogTable + tableWithSchema } + + tableWithStats.copy(properties = getOriginalTableProperties(table)) } override def tableExists(db: String, table: String): Boolean = withClient { @@ -581,13 +595,30 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat // Partitions // -------------------------------------------------------------------------- + // Hive metastore is not case preserving and the partition columns are always lower cased. We need + // to lower case the column names in partition specification before calling partition related Hive + // APIs, to match this behaviour. + private def lowerCasePartitionSpec(spec: TablePartitionSpec): TablePartitionSpec = { + spec.map { case (k, v) => k.toLowerCase -> v } + } + + // Hive metastore is not case preserving and the column names of the partition specification we + // get from the metastore are always lower cased. We should restore them w.r.t. the actual table + // partition columns. 
+ private def restorePartitionSpec( + spec: TablePartitionSpec, + partCols: Seq[String]): TablePartitionSpec = { + spec.map { case (k, v) => partCols.find(_.equalsIgnoreCase(k)).get -> v } + } + override def createPartitions( db: String, table: String, parts: Seq[CatalogTablePartition], ignoreIfExists: Boolean): Unit = withClient { requireTableExists(db, table) - client.createPartitions(db, table, parts, ignoreIfExists) + val lowerCasedParts = parts.map(p => p.copy(spec = lowerCasePartitionSpec(p.spec))) + client.createPartitions(db, table, lowerCasedParts, ignoreIfExists) } override def dropPartitions( @@ -597,7 +628,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat ignoreIfNotExists: Boolean, purge: Boolean): Unit = withClient { requireTableExists(db, table) - client.dropPartitions(db, table, parts, ignoreIfNotExists, purge) + client.dropPartitions(db, table, parts.map(lowerCasePartitionSpec), ignoreIfNotExists, purge) } override def renamePartitions( @@ -605,21 +636,24 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat table: String, specs: Seq[TablePartitionSpec], newSpecs: Seq[TablePartitionSpec]): Unit = withClient { - client.renamePartitions(db, table, specs, newSpecs) + client.renamePartitions( + db, table, specs.map(lowerCasePartitionSpec), newSpecs.map(lowerCasePartitionSpec)) } override def alterPartitions( db: String, table: String, newParts: Seq[CatalogTablePartition]): Unit = withClient { - client.alterPartitions(db, table, newParts) + val lowerCasedParts = newParts.map(p => p.copy(spec = lowerCasePartitionSpec(p.spec))) + client.alterPartitions(db, table, lowerCasedParts) } override def getPartition( db: String, table: String, spec: TablePartitionSpec): CatalogTablePartition = withClient { - client.getPartition(db, table, spec) + val part = client.getPartition(db, table, lowerCasePartitionSpec(spec)) + part.copy(spec = restorePartitionSpec(part.spec, getTable(db, table).partitionColumnNames)) } /** @@ -629,7 +663,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat db: String, table: String, spec: TablePartitionSpec): Option[CatalogTablePartition] = withClient { - client.getPartitionOption(db, table, spec) + client.getPartitionOption(db, table, lowerCasePartitionSpec(spec)).map { part => + part.copy(spec = restorePartitionSpec(part.spec, getTable(db, table).partitionColumnNames)) + } } /** @@ -639,14 +675,17 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat db: String, table: String, partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] = withClient { - client.getPartitions(db, table, partialSpec) + client.getPartitions(db, table, partialSpec.map(lowerCasePartitionSpec)).map { part => + part.copy(spec = restorePartitionSpec(part.spec, getTable(db, table).partitionColumnNames)) + } } override def listPartitionsByFilter( db: String, table: String, predicates: Seq[Expression]): Seq[CatalogTablePartition] = withClient { - val catalogTable = client.getTable(db, table) + val rawTable = client.getTable(db, table) + val catalogTable = restoreTableMetadata(rawTable) val partitionColumnNames = catalogTable.partitionColumnNames.toSet val nonPartitionPruningPredicates = predicates.filterNot { _.references.map(_.name).toSet.subsetOf(partitionColumnNames) @@ -660,19 +699,20 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val partitionSchema = catalogTable.partitionSchema if (predicates.nonEmpty) { 
- val clientPrunedPartitions = - client.getPartitionsByFilter(catalogTable, predicates) + val clientPrunedPartitions = client.getPartitionsByFilter(rawTable, predicates).map { part => + part.copy(spec = restorePartitionSpec(part.spec, catalogTable.partitionColumnNames)) + } val boundPredicate = InterpretedPredicate.create(predicates.reduce(And).transform { case att: AttributeReference => val index = partitionSchema.indexWhere(_.name == att.name) BoundReference(index, partitionSchema(index).dataType, nullable = true) }) - clientPrunedPartitions.filter { case p: CatalogTablePartition => - boundPredicate(p.toRow(partitionSchema)) - } + clientPrunedPartitions.filter { p => boundPredicate(p.toRow(partitionSchema)) } } else { - client.getPartitions(catalogTable) + client.getPartitions(catalogTable).map { part => + part.copy(spec = restorePartitionSpec(part.spec, catalogTable.partitionColumnNames)) + } } } @@ -722,7 +762,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat } object HiveExternalCatalog { - val DATASOURCE_PREFIX = "spark.sql.sources." + val SPARK_SQL_PREFIX = "spark.sql." + + val DATASOURCE_PREFIX = SPARK_SQL_PREFIX + "sources." val DATASOURCE_PROVIDER = DATASOURCE_PREFIX + "provider" val DATASOURCE_SCHEMA = DATASOURCE_PREFIX + "schema" val DATASOURCE_SCHEMA_PREFIX = DATASOURCE_SCHEMA + "." @@ -736,21 +778,20 @@ object HiveExternalCatalog { val DATASOURCE_SCHEMA_BUCKETCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "bucketCol." val DATASOURCE_SCHEMA_SORTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "sortCol." - val STATISTICS_PREFIX = "spark.sql.statistics." + val STATISTICS_PREFIX = SPARK_SQL_PREFIX + "statistics." val STATISTICS_TOTAL_SIZE = STATISTICS_PREFIX + "totalSize" val STATISTICS_NUM_ROWS = STATISTICS_PREFIX + "numRows" val STATISTICS_COL_STATS_PREFIX = STATISTICS_PREFIX + "colStats." - def removeStatsProperties(metadata: CatalogTable): Map[String, String] = { - metadata.properties.filterNot { case (key, _) => key.startsWith(STATISTICS_PREFIX) } - } + val TABLE_PARTITION_PROVIDER = SPARK_SQL_PREFIX + "partitionProvider" + def getProviderFromTableProperties(metadata: CatalogTable): Option[String] = { metadata.properties.get(DATASOURCE_PROVIDER) } def getOriginalTableProperties(metadata: CatalogTable): Map[String, String] = { - metadata.properties.filterNot { case (key, _) => key.startsWith(DATASOURCE_PREFIX) } + metadata.properties.filterNot { case (key, _) => key.startsWith(SPARK_SQL_PREFIX) } } // A persisted data source table always store its schema in the catalog. 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 6c1585d5f5617..d1de863ce3623 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -76,11 +76,10 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log partitionColumns = table.partitionColumnNames, bucketSpec = table.bucketSpec, className = table.provider.get, - options = table.storage.properties) + options = table.storage.properties, + catalogTable = Some(table)) - LogicalRelation( - dataSource.resolveRelation(), - catalogTable = Some(table)) + LogicalRelation(dataSource.resolveRelation(), catalogTable = Some(table)) } } @@ -194,7 +193,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName) val bucketSpec = None // We don't support hive bucketed tables, only ones we write out. - val lazyPruningEnabled = sparkSession.sqlContext.conf.filesourcePartitionPruning + val lazyPruningEnabled = sparkSession.sqlContext.conf.manageFilesourcePartitions val result = if (metastoreRelation.hiveQlTable.isPartitioned) { val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 8835b266b22a4..84873bbbb81ce 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -777,7 +777,7 @@ private[hive] class HiveClientImpl( val (partCols, schema) = table.schema.map(toHiveColumn).partition { c => table.partitionColumnNames.contains(c.getName) } - if (table.schema.isEmpty) { + if (schema.isEmpty) { // This is a hack to preserve existing behavior. Before Spark 2.0, we do not // set a default serde here (this was done in Hive), and so if the user provides // an empty schema Hive would automatically populate the schema with a single @@ -831,9 +831,6 @@ private[hive] class HiveClientImpl( new HivePartition(ht, tpart) } - // TODO (cloud-fan): the column names in partition specification are always lower cased because - // Hive metastore is not case preserving. We should normalize them to the actual column names of - // the table, once we store partition spec of data source tables. 
private def fromHivePartition(hp: HivePartition): CatalogTablePartition = { val apiPartition = hp.getTPartition CatalogTablePartition( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala index d290fe9962db2..6e887d95c0f09 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala @@ -63,7 +63,7 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi def testCaching(pruningEnabled: Boolean): Unit = { test(s"partitioned table is cached when partition pruning is $pruningEnabled") { - withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_PRUNING.key -> pruningEnabled.toString) { + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> pruningEnabled.toString) { withTable("test") { withTempDir { dir => spark.range(5).selectExpr("id", "id as f1", "id as f2").write diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala new file mode 100644 index 0000000000000..5f16960fb1496 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive + +import java.io.File + +import org.apache.spark.metrics.source.HiveCatalogMetrics +import org.apache.spark.sql.{AnalysisException, QueryTest} +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestUtils + +class PartitionProviderCompatibilitySuite + extends QueryTest with TestHiveSingleton with SQLTestUtils { + + private def setupPartitionedDatasourceTable(tableName: String, dir: File): Unit = { + spark.range(5).selectExpr("id as fieldOne", "id as partCol").write + .partitionBy("partCol") + .mode("overwrite") + .parquet(dir.getAbsolutePath) + + spark.sql(s""" + |create table $tableName (fieldOne long, partCol int) + |using parquet + |options (path "${dir.getAbsolutePath}") + |partitioned by (partCol)""".stripMargin) + } + + private def verifyIsLegacyTable(tableName: String): Unit = { + val unsupportedCommands = Seq( + s"ALTER TABLE $tableName ADD PARTITION (partCol=1) LOCATION '/foo'", + s"ALTER TABLE $tableName PARTITION (partCol=1) RENAME TO PARTITION (partCol=2)", + s"ALTER TABLE $tableName PARTITION (partCol=1) SET LOCATION '/foo'", + s"ALTER TABLE $tableName DROP PARTITION (partCol=1)", + s"DESCRIBE $tableName PARTITION (partCol=1)", + s"SHOW PARTITIONS $tableName") + + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { + for (cmd <- unsupportedCommands) { + val e = intercept[AnalysisException] { + spark.sql(cmd) + } + assert(e.getMessage.contains("partition metadata is not stored in the Hive metastore"), e) + } + } + } + + test("convert partition provider to hive with repair table") { + withTable("test") { + withTempDir { dir => + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") { + setupPartitionedDatasourceTable("test", dir) + assert(spark.sql("select * from test").count() == 5) + } + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { + verifyIsLegacyTable("test") + spark.sql("msck repair table test") + spark.sql("show partitions test").count() // check we are a new table + + // sanity check table performance + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test where partCol < 2").count() == 2) + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 2) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 2) + } + } + } + } + + test("when partition management is enabled, new tables have partition provider hive") { + withTable("test") { + withTempDir { dir => + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { + setupPartitionedDatasourceTable("test", dir) + spark.sql("show partitions test").count() // check we are a new table + assert(spark.sql("select * from test").count() == 0) // needs repair + spark.sql("msck repair table test") + assert(spark.sql("select * from test").count() == 5) + } + } + } + } + + test("when partition management is disabled, new tables have no partition provider") { + withTable("test") { + withTempDir { dir => + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") { + setupPartitionedDatasourceTable("test", dir) + verifyIsLegacyTable("test") + assert(spark.sql("select * from test").count() == 5) + } + } + } + } + + test("when partition management is disabled, we preserve the old behavior even for new tables") { + withTable("test") { + withTempDir { dir => + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { + setupPartitionedDatasourceTable("test", dir) + spark.sql("show 
partitions test").count() // check we are a new table + spark.sql("refresh table test") + assert(spark.sql("select * from test").count() == 0) + } + // disabled + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") { + val e = intercept[AnalysisException] { + spark.sql(s"show partitions test") + } + assert(e.getMessage.contains("filesource partition management is disabled")) + spark.sql("refresh table test") + assert(spark.sql("select * from test").count() == 5) + } + // then enabled again + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { + spark.sql("refresh table test") + assert(spark.sql("select * from test").count() == 0) + } + } + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala similarity index 68% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala rename to sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala index 82ee813c6a95f..476383a5b33a5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveTablePerfStatsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils -class HiveTablePerfStatsSuite +class PartitionedTablePerfStatsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils with BeforeAndAfterEach { override def beforeEach(): Unit = { @@ -41,25 +41,54 @@ class HiveTablePerfStatsSuite FileStatusCache.resetForTesting() } - private def setupPartitionedTable(tableName: String, dir: File): Unit = { - spark.range(5).selectExpr("id", "id as partCol1", "id as partCol2").write + private case class TestSpec(setupTable: (String, File) => Unit, isDatasourceTable: Boolean) + + /** + * Runs a test against both converted hive and native datasource tables. The test can use the + * passed TestSpec object for setup and inspecting test parameters. 
+ */ + private def genericTest(testName: String)(fn: TestSpec => Unit): Unit = { + test("hive table: " + testName) { + fn(TestSpec(setupPartitionedHiveTable, false)) + } + test("datasource table: " + testName) { + fn(TestSpec(setupPartitionedDatasourceTable, true)) + } + } + + private def setupPartitionedHiveTable(tableName: String, dir: File): Unit = { + spark.range(5).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write .partitionBy("partCol1", "partCol2") .mode("overwrite") .parquet(dir.getAbsolutePath) spark.sql(s""" - |create external table $tableName (id long) + |create external table $tableName (fieldOne long) |partitioned by (partCol1 int, partCol2 int) |stored as parquet |location "${dir.getAbsolutePath}"""".stripMargin) spark.sql(s"msck repair table $tableName") } - test("partitioned pruned table reports only selected files") { + private def setupPartitionedDatasourceTable(tableName: String, dir: File): Unit = { + spark.range(5).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write + .partitionBy("partCol1", "partCol2") + .mode("overwrite") + .parquet(dir.getAbsolutePath) + + spark.sql(s""" + |create table $tableName (fieldOne long, partCol1 int, partCol2 int) + |using parquet + |options (path "${dir.getAbsolutePath}") + |partitioned by (partCol1, partCol2)""".stripMargin) + spark.sql(s"msck repair table $tableName") + } + + genericTest("partitioned pruned table reports only selected files") { spec => assert(spark.sqlContext.getConf(HiveUtils.CONVERT_METASTORE_PARQUET.key) == "true") withTable("test") { withTempDir { dir => - setupPartitionedTable("test", dir) + spec.setupTable("test", dir) val df = spark.sql("select * from test") assert(df.count() == 5) assert(df.inputFiles.length == 5) // unpruned @@ -75,17 +104,24 @@ class HiveTablePerfStatsSuite val df4 = spark.sql("select * from test where partCol1 = 999") assert(df4.count() == 0) assert(df4.inputFiles.length == 0) + + // TODO(ekl) enable for hive tables as well once SPARK-17983 is fixed + if (spec.isDatasourceTable) { + val df5 = spark.sql("select * from test where fieldOne = 4") + assert(df5.count() == 1) + assert(df5.inputFiles.length == 5) + } } } } - test("lazy partition pruning reads only necessary partition data") { + genericTest("lazy partition pruning reads only necessary partition data") { spec => withSQLConf( - SQLConf.HIVE_FILESOURCE_PARTITION_PRUNING.key -> "true", + SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true", SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "0") { withTable("test") { withTempDir { dir => - setupPartitionedTable("test", dir) + spec.setupTable("test", dir) HiveCatalogMetrics.reset() spark.sql("select * from test where partCol1 = 999").count() assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0) @@ -120,13 +156,13 @@ class HiveTablePerfStatsSuite } } - test("lazy partition pruning with file status caching enabled") { + genericTest("lazy partition pruning with file status caching enabled") { spec => withSQLConf( - "spark.sql.hive.filesourcePartitionPruning" -> "true", - "spark.sql.hive.filesourcePartitionFileCacheSize" -> "9999999") { + SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true", + SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "9999999") { withTable("test") { withTempDir { dir => - setupPartitionedTable("test", dir) + spec.setupTable("test", dir) HiveCatalogMetrics.reset() assert(spark.sql("select * from test where partCol1 = 999").count() == 0) 
assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0) @@ -161,13 +197,13 @@ class HiveTablePerfStatsSuite } } - test("file status caching respects refresh table and refreshByPath") { + genericTest("file status caching respects refresh table and refreshByPath") { spec => withSQLConf( - "spark.sql.hive.filesourcePartitionPruning" -> "true", - "spark.sql.hive.filesourcePartitionFileCacheSize" -> "9999999") { + SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true", + SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "9999999") { withTable("test") { withTempDir { dir => - setupPartitionedTable("test", dir) + spec.setupTable("test", dir) HiveCatalogMetrics.reset() assert(spark.sql("select * from test").count() == 5) assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) @@ -190,13 +226,13 @@ class HiveTablePerfStatsSuite } } - test("file status cache respects size limit") { + genericTest("file status cache respects size limit") { spec => withSQLConf( - "spark.sql.hive.filesourcePartitionPruning" -> "true", - "spark.sql.hive.filesourcePartitionFileCacheSize" -> "1" /* 1 byte */) { + SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true", + SQLConf.HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE.key -> "1" /* 1 byte */) { withTable("test") { withTempDir { dir => - setupPartitionedTable("test", dir) + spec.setupTable("test", dir) HiveCatalogMetrics.reset() assert(spark.sql("select * from test").count() == 5) assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) @@ -209,11 +245,11 @@ class HiveTablePerfStatsSuite } } - test("all partitions read and cached when filesource partition pruning is off") { - withSQLConf(SQLConf.HIVE_FILESOURCE_PARTITION_PRUNING.key -> "false") { + test("hive table: files read and cached when filesource partition management is off") { + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") { withTable("test") { withTempDir { dir => - setupPartitionedTable("test", dir) + setupPartitionedHiveTable("test", dir) // We actually query the partitions from hive each time the table is resolved in this // mode. 
This is kind of terrible, but is needed to preserve the legacy behavior @@ -237,4 +273,32 @@ class HiveTablePerfStatsSuite } } } + + test("datasource table: all partition data cached in memory when partition management is off") { + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") { + withTable("test") { + withTempDir { dir => + setupPartitionedDatasourceTable("test", dir) + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test where partCol1 = 999").count() == 0) + + // not using metastore + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0) + + // reads and caches all the files initially + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) + + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test where partCol1 < 2").count() == 2) + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0) + + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test").count() == 5) + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 0) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0) + } + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index c351063a63ff8..4f5ebc3d838b9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -310,39 +310,50 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils } } - test("test table-level statistics for data source table created in HiveExternalCatalog") { - val parquetTable = "parquetTable" - withTable(parquetTable) { - sql(s"CREATE TABLE $parquetTable (key STRING, value STRING) USING PARQUET") - val catalogTable = spark.sessionState.catalog.getTableMetadata(TableIdentifier(parquetTable)) - assert(DDLUtils.isDatasourceTable(catalogTable)) + private def testUpdatingTableStats(tableDescription: String, createTableCmd: String): Unit = { + test("test table-level statistics for " + tableDescription) { + val parquetTable = "parquetTable" + withTable(parquetTable) { + sql(createTableCmd) + val catalogTable = spark.sessionState.catalog.getTableMetadata( + TableIdentifier(parquetTable)) + assert(DDLUtils.isDatasourceTable(catalogTable)) + + sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src") + checkTableStats( + parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None) - sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src") - checkTableStats( - parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None) + // noscan won't count the number of rows + sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan") + val fetchedStats1 = checkTableStats( + parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None) - // noscan won't count the number of rows - sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan") - val fetchedStats1 = checkTableStats( - parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None) + sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src") + sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan") + val fetchedStats2 = checkTableStats( + parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None) + 
assert(fetchedStats2.get.sizeInBytes > fetchedStats1.get.sizeInBytes) - sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src") - sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan") - val fetchedStats2 = checkTableStats( - parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None) - assert(fetchedStats2.get.sizeInBytes > fetchedStats1.get.sizeInBytes) - - // without noscan, we count the number of rows - sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS") - val fetchedStats3 = checkTableStats( - parquetTable, - isDataSourceTable = true, - hasSizeInBytes = true, - expectedRowCounts = Some(1000)) - assert(fetchedStats3.get.sizeInBytes == fetchedStats2.get.sizeInBytes) + // without noscan, we count the number of rows + sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS") + val fetchedStats3 = checkTableStats( + parquetTable, + isDataSourceTable = true, + hasSizeInBytes = true, + expectedRowCounts = Some(1000)) + assert(fetchedStats3.get.sizeInBytes == fetchedStats2.get.sizeInBytes) + } } } + testUpdatingTableStats( + "data source table created in HiveExternalCatalog", + "CREATE TABLE parquetTable (key STRING, value STRING) USING PARQUET") + + testUpdatingTableStats( + "partitioned data source table", + "CREATE TABLE parquetTable (key STRING, value STRING) USING PARQUET PARTITIONED BY (key)") + test("statistics collection of a table with zero column") { val table_no_cols = "table_no_cols" withTable(table_no_cols) { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala index ad1e9b17a9f71..46ed18c70fb56 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala @@ -415,10 +415,7 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto .mode(SaveMode.Overwrite) .saveAsTable("part_datasrc") - val message1 = intercept[AnalysisException] { - sql("SHOW PARTITIONS part_datasrc") - }.getMessage - assert(message1.contains("is not allowed on a datasource table")) + assert(sql("SHOW PARTITIONS part_datasrc").count() == 3) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 495b4f874a1d6..01fa827220c51 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -358,7 +358,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { "# Partition Information", "# col_name", "Detailed Partition Information CatalogPartition(", - "Partition Values: [Us, 1]", + "Partition Values: [c=Us, d=1]", "Storage(Location:", "Partition Parameters") @@ -399,10 +399,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { .range(1).select('id as 'a, 'id as 'b, 'id as 'c, 'id as 'd).write .partitionBy("d") .saveAsTable("datasource_table") - val m4 = intercept[AnalysisException] { - sql("DESC datasource_table PARTITION (d=2)") - }.getMessage() - assert(m4.contains("DESC PARTITION is not allowed on a datasource table")) + + sql("DESC datasource_table PARTITION (d=0)") val m5 = intercept[AnalysisException] { spark.range(10).select('id as 'a, 'id as 'b).createTempView("view1") From 
ab5f938bc7c3c9b137d63e479fced2b7e9c9d75b Mon Sep 17 00:00:00 2001 From: Sunitha Kambhampati Date: Fri, 28 Oct 2016 08:39:02 +0800 Subject: [PATCH 145/162] [SPARK-18121][SQL] Unable to query global temp views when hive support is enabled ## What changes were proposed in this pull request? Issue: Querying on a global temp view throws Table or view not found exception. Fix: Update the lookupRelation in HiveSessionCatalog to check for global temp views similar to the SessionCatalog.lookupRelation. Before fix: Querying on a global temp view ( for. e.g.: select * from global_temp.v1) throws Table or view not found exception After fix: Query succeeds and returns the right result. ## How was this patch tested? - Two unit tests are added to check for global temp view for the code path when hive support is enabled. - Regression unit tests were run successfully. ( build/sbt -Phive hive/test, build/sbt sql/test, build/sbt catalyst/test) Author: Sunitha Kambhampati Closes #15649 from skambha/lookuprelationChanges. --- .../spark/sql/hive/HiveSessionCatalog.scala | 10 ++++++++-- .../spark/sql/hive/execution/SQLQuerySuite.scala | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index 85ecf0ce70756..4f2910abfd216 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -27,7 +27,7 @@ import org.apache.hadoop.hive.ql.udf.generic.{AbstractGenericUDAFResolver, Gener import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.FunctionRegistry +import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, NoSuchTableException} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.catalog.{FunctionResourceLoader, GlobalTempViewManager, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, ExpressionInfo} @@ -57,7 +57,13 @@ private[sql] class HiveSessionCatalog( override def lookupRelation(name: TableIdentifier, alias: Option[String]): LogicalPlan = { val table = formatTableName(name.table) - if (name.database.isDefined || !tempTables.contains(table)) { + val db = formatDatabaseName(name.database.getOrElse(currentDb)) + if (db == globalTempViewManager.database) { + val relationAlias = alias.getOrElse(table) + globalTempViewManager.get(table).map { viewDef => + SubqueryAlias(relationAlias, viewDef, Some(name)) + }.getOrElse(throw new NoSuchTableException(db, table)) + } else if (name.database.isDefined || !tempTables.contains(table)) { val database = name.database.map(formatDatabaseName) val newName = name.copy(database = database, table = table) metastoreCatalog.lookupRelation(newName, alias) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 01fa827220c51..2735d3a5267e3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -68,6 +68,22 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import hiveContext._ import 
spark.implicits._ + test("query global temp view") { + val df = Seq(1).toDF("i1") + df.createGlobalTempView("tbl1") + val global_temp_db = spark.conf.get("spark.sql.globalTempDatabase") + checkAnswer(spark.sql(s"select * from ${global_temp_db}.tbl1"), Row(1)) + spark.sql(s"drop view ${global_temp_db}.tbl1") + } + + test("non-existent global temp view") { + val global_temp_db = spark.conf.get("spark.sql.globalTempDatabase") + val message = intercept[AnalysisException] { + spark.sql(s"select * from ${global_temp_db}.nonexistentview") + }.getMessage + assert(message.contains("Table or view not found")) + } + test("script") { val scriptFilePath = getTestResourcePath("test_script.sh") if (testCommandAvailable("bash") && testCommandAvailable("echo | sed")) { From 569788a55e4c6b218fb697e1e54c6138ffe657a6 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 28 Oct 2016 00:40:06 -0700 Subject: [PATCH 146/162] [SPARK-18109][ML] Add instrumentation to GMM ## What changes were proposed in this pull request? Add instrumentation to GMM ## How was this patch tested? Test in spark-shell Author: Zheng RuiFeng Closes #15636 from zhengruifeng/gmm_instr. --- .../org/apache/spark/ml/clustering/GaussianMixture.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index e3cb92f4f144d..8fac63fefbb55 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -323,6 +323,9 @@ class GaussianMixture @Since("2.0.0") ( case Row(point: Vector) => OldVectors.fromML(point) } + val instr = Instrumentation.create(this, rdd) + instr.logParams(featuresCol, predictionCol, probabilityCol, k, maxIter, seed, tol) + val algo = new MLlibGM() .setK($(k)) .setMaxIterations($(maxIter)) @@ -337,6 +340,9 @@ class GaussianMixture @Since("2.0.0") ( val summary = new GaussianMixtureSummary(model.transform(dataset), $(predictionCol), $(probabilityCol), $(featuresCol), $(k)) model.setSummary(summary) + instr.logNumFeatures(model.gaussians.head.mean.size) + instr.logSuccess(model) + model } @Since("2.0.0") From e9746f87d0b553b8115948acb79f7e32c23dfd86 Mon Sep 17 00:00:00 2001 From: Jagadeesan Date: Fri, 28 Oct 2016 02:26:55 -0700 Subject: [PATCH 147/162] =?UTF-8?q?[SPARK-18133][EXAMPLES][ML]=20Python=20?= =?UTF-8?q?ML=20Pipeline=20Example=20has=20syntax=20e=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? In Python 3, there is only one integer type (i.e., int), which mostly behaves like the long type in Python 2. Since Python 3 won't accept "L", so removed "L" in all examples. ## How was this patch tested? Unit tests. …rrors] Author: Jagadeesan Closes #15660 from jagadeesanas2/SPARK-18133. 
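For illustration only (not part of the patch): a minimal PySpark snippet showing the literal form that works on both Python 2 and Python 3, assuming a local `SparkSession`. The `4L` style of literal removed by this patch is a `SyntaxError` on Python 3.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Python 2 accepted an explicit long literal such as 4L; Python 3 rejects the
# "L" suffix because it has a single arbitrary-precision int type.
# Plain integer literals behave the same on both versions:
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n")
], ["id", "text"])
test.show()
```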
--- examples/src/main/python/ml/cross_validator.py | 8 ++++---- .../main/python/ml/gaussian_mixture_example.py | 2 +- examples/src/main/python/ml/pipeline_example.py | 16 ++++++++-------- .../binary_classification_metrics_example.py | 2 +- .../python/mllib/multi_class_metrics_example.py | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/examples/src/main/python/ml/cross_validator.py b/examples/src/main/python/ml/cross_validator.py index 907eec67a0eb5..db7054307c2e3 100644 --- a/examples/src/main/python/ml/cross_validator.py +++ b/examples/src/main/python/ml/cross_validator.py @@ -84,10 +84,10 @@ # Prepare test documents, which are unlabeled. test = spark.createDataFrame([ - (4L, "spark i j k"), - (5L, "l m n"), - (6L, "mapreduce spark"), - (7L, "apache hadoop") + (4, "spark i j k"), + (5, "l m n"), + (6, "mapreduce spark"), + (7, "apache hadoop") ], ["id", "text"]) # Make predictions on test documents. cvModel uses the best model found (lrModel). diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py index 8ad450b669fc9..e4a0d314e9d91 100644 --- a/examples/src/main/python/ml/gaussian_mixture_example.py +++ b/examples/src/main/python/ml/gaussian_mixture_example.py @@ -38,7 +38,7 @@ # loads data dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt") - gmm = GaussianMixture().setK(2).setSeed(538009335L) + gmm = GaussianMixture().setK(2).setSeed(538009335) model = gmm.fit(dataset) print("Gaussians shown as a DataFrame: ") diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py index f63e4db434222..e1fab7cbe6d80 100644 --- a/examples/src/main/python/ml/pipeline_example.py +++ b/examples/src/main/python/ml/pipeline_example.py @@ -35,10 +35,10 @@ # $example on$ # Prepare training documents from a list of (id, text, label) tuples. training = spark.createDataFrame([ - (0L, "a b c d e spark", 1.0), - (1L, "b d", 0.0), - (2L, "spark f g h", 1.0), - (3L, "hadoop mapreduce", 0.0) + (0, "a b c d e spark", 1.0), + (1, "b d", 0.0), + (2, "spark f g h", 1.0), + (3, "hadoop mapreduce", 0.0) ], ["id", "text", "label"]) # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. @@ -52,10 +52,10 @@ # Prepare test documents, which are unlabeled (id, text) tuples. test = spark.createDataFrame([ - (4L, "spark i j k"), - (5L, "l m n"), - (6L, "spark hadoop spark"), - (7L, "apache hadoop") + (4, "spark i j k"), + (5, "l m n"), + (6, "spark hadoop spark"), + (7, "apache hadoop") ], ["id", "text"]) # Make predictions on test documents and print columns of interest. 
diff --git a/examples/src/main/python/mllib/binary_classification_metrics_example.py b/examples/src/main/python/mllib/binary_classification_metrics_example.py index daf000e38dcd0..91f8378f29c0c 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics_example.py +++ b/examples/src/main/python/mllib/binary_classification_metrics_example.py @@ -39,7 +39,7 @@ .rdd.map(lambda row: LabeledPoint(row[0], row[1])) # Split data into training (60%) and test (40%) - training, test = data.randomSplit([0.6, 0.4], seed=11L) + training, test = data.randomSplit([0.6, 0.4], seed=11) training.cache() # Run training algorithm to build the model diff --git a/examples/src/main/python/mllib/multi_class_metrics_example.py b/examples/src/main/python/mllib/multi_class_metrics_example.py index cd56b3c97c778..7dc5fb4f9127f 100644 --- a/examples/src/main/python/mllib/multi_class_metrics_example.py +++ b/examples/src/main/python/mllib/multi_class_metrics_example.py @@ -32,7 +32,7 @@ data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") # Split data into training (60%) and test (40%) - training, test = data.randomSplit([0.6, 0.4], seed=11L) + training, test = data.randomSplit([0.6, 0.4], seed=11) training.cache() # Run training algorithm to build the model From ac26e9cf27862fbfb97ae18d591606ecf2cd41cf Mon Sep 17 00:00:00 2001 From: Yunni Date: Fri, 28 Oct 2016 14:57:52 -0700 Subject: [PATCH 148/162] [SPARK-5992][ML] Locality Sensitive Hashing ## What changes were proposed in this pull request? Implement Locality Sensitive Hashing along with approximate nearest neighbors and approximate similarity join based on the [design doc](https://docs.google.com/document/d/1D15DTDMF_UWTTyWqXfG7y76iZalky4QmifUYQ6lH5GM/edit). Detailed changes are as follows: (1) Implement abstract LSH, LSHModel classes as Estimator-Model (2) Implement approxNearestNeighbors and approxSimilarityJoin in the abstract LSHModel (3) Implement Random Projection as LSH subclass for Euclidean distance, Min Hash for Jaccard Distance (4) Implement unit test utility methods including checkLshProperty, checkNearestNeighbor and checkSimilarityJoin Things that will be implemented in a follow-up PR: - Bit Sampling for Hamming Distance, SignRandomProjection for Cosine Distance - PySpark Integration for the scala classes and methods. ## How was this patch tested? Unit test is implemented for all the implemented classes and algorithms. A scalability test on Uber's dataset was performed internally. Tested the methods on [WEX dataset](https://aws.amazon.com/items/2345) from AWS, with the steps and results [here](https://docs.google.com/document/d/19BXg-67U83NVB3M0I84HVBVg3baAVaESD_mrg_-vLro/edit). ## References Gionis, Aristides, Piotr Indyk, and Rajeev Motwani. "Similarity search in high dimensions via hashing." VLDB 7 Sep. 1999: 518-529. Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint arXiv:1408.2927 (2014). Author: Yunni Author: Yun Ni Closes #15148 from Yunni/SPARK-5992-yunn-lsh. 
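For reviewers, a rough usage sketch of the new API (illustrative only, not part of the patch). It exercises only the calls visible in the diff below (`setInputCol`/`setOutputCol`/`setOutputDim`, `fit`, `approxNearestNeighbors`, `approxSimilarityJoin`); the `MinHash` input format (sparse 0/1 vectors standing in for sets) and its remaining default params are assumptions, and a `SparkSession` named `spark` (e.g. from spark-shell) is presumed in scope.

```scala
import org.apache.spark.ml.feature.MinHash
import org.apache.spark.ml.linalg.Vectors

// Sparse 0/1 vectors stand in for sets; MinHash approximates Jaccard distance.
val dfA = spark.createDataFrame(Seq(
  (0, Vectors.sparse(6, Seq((0, 1.0), (1, 1.0), (2, 1.0)))),
  (1, Vectors.sparse(6, Seq((2, 1.0), (3, 1.0), (4, 1.0)))),
  (2, Vectors.sparse(6, Seq((0, 1.0), (2, 1.0), (4, 1.0))))
)).toDF("id", "keys")

val mh = new MinHash()
  .setInputCol("keys")
  .setOutputCol("hashes")
  .setOutputDim(5) // OR-amplification: more hash functions => lower false negative rate

val model = mh.fit(dfA)

// Approximate nearest neighbours of a query key (single probing by default).
val key = Vectors.sparse(6, Seq((1, 1.0), (3, 1.0)))
model.approxNearestNeighbors(dfA, key, 2).show()

// Approximate similarity self-join under a Jaccard distance threshold.
model.approxSimilarityJoin(dfA, dfA, 0.6).show()
```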
--- .../org/apache/spark/ml/feature/LSH.scala | 313 ++++++++++++++++++ .../org/apache/spark/ml/feature/MinHash.scala | 194 +++++++++++ .../spark/ml/feature/RandomProjection.scala | 225 +++++++++++++ .../org/apache/spark/ml/feature/LSHTest.scala | 153 +++++++++ .../spark/ml/feature/MinHashSuite.scala | 126 +++++++ .../ml/feature/RandomProjectionSuite.scala | 197 +++++++++++ 6 files changed, 1208 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala new file mode 100644 index 0000000000000..333a8c364a884 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import scala.util.Random + +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.linalg.{Vector, VectorUDT} +import org.apache.spark.ml.param.{IntParam, ParamValidators} +import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.util._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +/** + * Params for [[LSH]]. + */ +private[ml] trait LSHParams extends HasInputCol with HasOutputCol { + /** + * Param for the dimension of LSH OR-amplification. + * + * In this implementation, we use LSH OR-amplification to reduce the false negative rate. The + * higher the dimension is, the lower the false negative rate. + * @group param + */ + final val outputDim: IntParam = new IntParam(this, "outputDim", "output dimension, where" + + "increasing dimensionality lowers the false negative rate, and decreasing dimensionality" + + " improves the running performance", ParamValidators.gt(0)) + + /** @group getParam */ + final def getOutputDim: Int = $(outputDim) + + setDefault(outputDim -> 1) + + /** + * Transform the Schema for LSH + * @param schema The schema of the input dataset without [[outputCol]] + * @return A derived schema with [[outputCol]] added + */ + protected[this] final def validateAndTransformSchema(schema: StructType): StructType = { + SchemaUtils.appendColumn(schema, $(outputCol), new VectorUDT) + } +} + +/** + * Model produced by [[LSH]]. 
+ */ +private[ml] abstract class LSHModel[T <: LSHModel[T]] + extends Model[T] with LSHParams with MLWritable { + self: T => + + /** + * The hash function of LSH, mapping a predefined KeyType to a Vector + * @return The mapping of LSH function. + */ + protected[ml] val hashFunction: Vector => Vector + + /** + * Calculate the distance between two different keys using the distance metric corresponding + * to the hashFunction + * @param x One input vector in the metric space + * @param y One input vector in the metric space + * @return The distance between x and y + */ + protected[ml] def keyDistance(x: Vector, y: Vector): Double + + /** + * Calculate the distance between two different hash Vectors. + * + * @param x One of the hash vector + * @param y Another hash vector + * @return The distance between hash vectors x and y + */ + protected[ml] def hashDistance(x: Vector, y: Vector): Double + + override def transform(dataset: Dataset[_]): DataFrame = { + transformSchema(dataset.schema, logging = true) + val transformUDF = udf(hashFunction, new VectorUDT) + dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) + } + + override def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema) + } + + /** + * Given a large dataset and an item, approximately find at most k items which have the closest + * distance to the item. If the [[outputCol]] is missing, the method will transform the data; if + * the [[outputCol]] exists, it will use the [[outputCol]]. This allows caching of the + * transformed data when necessary. + * + * This method implements two ways of fetching k nearest neighbors: + * - Single Probing: Fast, return at most k elements (Probing only one buckets) + * - Multiple Probing: Slow, return exact k elements (Probing multiple buckets close to the key) + * + * @param dataset the dataset to search for nearest neighbors of the key + * @param key Feature vector representing the item to search for + * @param numNearestNeighbors The maximum number of nearest neighbors + * @param singleProbing True for using Single Probing; false for multiple probing + * @param distCol Output column for storing the distance between each result row and the key + * @return A dataset containing at most k items closest to the key. A distCol is added to show + * the distance between each row and the key. + */ + def approxNearestNeighbors( + dataset: Dataset[_], + key: Vector, + numNearestNeighbors: Int, + singleProbing: Boolean, + distCol: String): Dataset[_] = { + require(numNearestNeighbors > 0, "The number of nearest neighbors cannot be less than 1") + // Get Hash Value of the key + val keyHash = hashFunction(key) + val modelDataset: DataFrame = if (!dataset.columns.contains($(outputCol))) { + transform(dataset) + } else { + dataset.toDF() + } + + // In the origin dataset, find the hash value that is closest to the key + val hashDistUDF = udf((x: Vector) => hashDistance(x, keyHash), DataTypes.DoubleType) + val hashDistCol = hashDistUDF(col($(outputCol))) + + val modelSubset = if (singleProbing) { + modelDataset.filter(hashDistCol === 0.0) + } else { + // Compute threshold to get exact k elements. + val modelDatasetSortedByHash = modelDataset.sort(hashDistCol).limit(numNearestNeighbors) + val thresholdDataset = modelDatasetSortedByHash.select(max(hashDistCol)) + val hashThreshold = thresholdDataset.take(1).head.getDouble(0) + + // Filter the dataset where the hash value is less than the threshold. 
+ modelDataset.filter(hashDistCol <= hashThreshold) + } + + // Get the top k nearest neighbor by their distance to the key + val keyDistUDF = udf((x: Vector) => keyDistance(x, key), DataTypes.DoubleType) + val modelSubsetWithDistCol = modelSubset.withColumn(distCol, keyDistUDF(col($(inputCol)))) + modelSubsetWithDistCol.sort(distCol).limit(numNearestNeighbors) + } + + /** + * Overloaded method for approxNearestNeighbors. Use Single Probing as default way to search + * nearest neighbors and "distCol" as default distCol. + */ + def approxNearestNeighbors( + dataset: Dataset[_], + key: Vector, + numNearestNeighbors: Int): Dataset[_] = { + approxNearestNeighbors(dataset, key, numNearestNeighbors, true, "distCol") + } + + /** + * Preprocess step for approximate similarity join. Transform and explode the [[outputCol]] to + * two explodeCols: entry and value. "entry" is the index in hash vector, and "value" is the + * value of corresponding value of the index in the vector. + * + * @param dataset The dataset to transform and explode. + * @param explodeCols The alias for the exploded columns, must be a seq of two strings. + * @return A dataset containing idCol, inputCol and explodeCols + */ + private[this] def processDataset( + dataset: Dataset[_], + inputName: String, + explodeCols: Seq[String]): Dataset[_] = { + require(explodeCols.size == 2, "explodeCols must be two strings.") + val vectorToMap = udf((x: Vector) => x.asBreeze.iterator.toMap, + MapType(DataTypes.IntegerType, DataTypes.DoubleType)) + val modelDataset: DataFrame = if (!dataset.columns.contains($(outputCol))) { + transform(dataset) + } else { + dataset.toDF() + } + modelDataset.select( + struct(col("*")).as(inputName), + explode(vectorToMap(col($(outputCol)))).as(explodeCols)) + } + + /** + * Recreate a column using the same column name but different attribute id. Used in approximate + * similarity join. + * @param dataset The dataset where a column need to recreate + * @param colName The name of the column to recreate + * @param tmpColName A temporary column name which does not conflict with existing columns + * @return + */ + private[this] def recreateCol( + dataset: Dataset[_], + colName: String, + tmpColName: String): Dataset[_] = { + dataset + .withColumnRenamed(colName, tmpColName) + .withColumn(colName, col(tmpColName)) + .drop(tmpColName) + } + + /** + * Join two dataset to approximately find all pairs of rows whose distance are smaller than + * the threshold. If the [[outputCol]] is missing, the method will transform the data; if the + * [[outputCol]] exists, it will use the [[outputCol]]. This allows caching of the transformed + * data when necessary. + * + * @param datasetA One of the datasets to join + * @param datasetB Another dataset to join + * @param threshold The threshold for the distance of row pairs + * @param distCol Output column for storing the distance between each result row and the key + * @return A joined dataset containing pairs of rows. The original rows are in columns + * "datasetA" and "datasetB", and a distCol is added to show the distance of each pair + */ + def approxSimilarityJoin( + datasetA: Dataset[_], + datasetB: Dataset[_], + threshold: Double, + distCol: String): Dataset[_] = { + + val leftColName = "datasetA" + val rightColName = "datasetB" + val explodeCols = Seq("entry", "hashValue") + val explodedA = processDataset(datasetA, leftColName, explodeCols) + + // If this is a self join, we need to recreate the inputCol of datasetB to avoid ambiguity. 
+ // TODO: Remove recreateCol logic once SPARK-17154 is resolved. + val explodedB = if (datasetA != datasetB) { + processDataset(datasetB, rightColName, explodeCols) + } else { + val recreatedB = recreateCol(datasetB, $(inputCol), s"${$(inputCol)}#${Random.nextString(5)}") + processDataset(recreatedB, rightColName, explodeCols) + } + + // Do a hash join on where the exploded hash values are equal. + val joinedDataset = explodedA.join(explodedB, explodeCols) + .drop(explodeCols: _*).distinct() + + // Add a new column to store the distance of the two rows. + val distUDF = udf((x: Vector, y: Vector) => keyDistance(x, y), DataTypes.DoubleType) + val joinedDatasetWithDist = joinedDataset.select(col("*"), + distUDF(col(s"$leftColName.${$(inputCol)}"), col(s"$rightColName.${$(inputCol)}")).as(distCol) + ) + + // Filter the joined datasets where the distance are smaller than the threshold. + joinedDatasetWithDist.filter(col(distCol) < threshold) + } + + /** + * Overloaded method for approxSimilarityJoin. Use "distCol" as default distCol. + */ + def approxSimilarityJoin( + datasetA: Dataset[_], + datasetB: Dataset[_], + threshold: Double): Dataset[_] = { + approxSimilarityJoin(datasetA, datasetB, threshold, "distCol") + } +} + +/** + * Locality Sensitive Hashing for different metrics space. Support basic transformation with a new + * hash column, approximate nearest neighbor search with a dataset and a key, and approximate + * similarity join of two datasets. + * + * This LSH class implements OR-amplification: more than 1 hash functions can be chosen, and each + * input vector are hashed by all hash functions. Two input vectors are defined to be in the same + * bucket as long as ANY one of the hash value matches. + * + * References: + * (1) Gionis, Aristides, Piotr Indyk, and Rajeev Motwani. "Similarity search in high dimensions + * via hashing." VLDB 7 Sep. 1999: 518-529. + * (2) Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint + * arXiv:1408.2927 (2014). + */ +private[ml] abstract class LSH[T <: LSHModel[T]] + extends Estimator[T] with LSHParams with DefaultParamsWritable { + self: Estimator[T] => + + /** @group setParam */ + def setInputCol(value: String): this.type = set(inputCol, value) + + /** @group setParam */ + def setOutputCol(value: String): this.type = set(outputCol, value) + + /** @group setParam */ + def setOutputDim(value: Int): this.type = set(outputDim, value) + + /** + * Validate and create a new instance of concrete LSHModel. Because different LSHModel may have + * different initial setting, developer needs to define how their LSHModel is created instead of + * using reflection in this abstract class. + * @param inputDim The dimension of the input dataset + * @return A new LSHModel instance without any params + */ + protected[this] def createRawLSHModel(inputDim: Int): T + + override def fit(dataset: Dataset[_]): T = { + transformSchema(dataset.schema, logging = true) + val inputDim = dataset.select(col($(inputCol))).head().get(0).asInstanceOf[Vector].size + val model = createRawLSHModel(inputDim).setParent(this) + copyValues(model) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala new file mode 100644 index 0000000000000..d9d0f32254e24 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import scala.util.Random + +import org.apache.hadoop.fs.Path + +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.HasSeed +import org.apache.spark.ml.util._ +import org.apache.spark.sql.types.StructType + +/** + * :: Experimental :: + * + * Model produced by [[MinHash]], where multiple hash functions are stored. Each hash function is + * a perfect hash function: + * `h_i(x) = (x * k_i mod prime) mod numEntries` + * where `k_i` is the i-th coefficient, and both `x` and `k_i` are from `Z_prime^*` + * + * Reference: + * [[https://en.wikipedia.org/wiki/Perfect_hash_function Wikipedia on Perfect Hash Function]] + * + * @param numEntries The number of entries of the hash functions. + * @param randCoefficients An array of random coefficients, each used by one hash function. + */ +@Experimental +@Since("2.1.0") +class MinHashModel private[ml] ( + override val uid: String, + @Since("2.1.0") val numEntries: Int, + @Since("2.1.0") val randCoefficients: Array[Int]) + extends LSHModel[MinHashModel] { + + @Since("2.1.0") + override protected[ml] val hashFunction: Vector => Vector = { + elems: Vector => + require(elems.numNonzeros > 0, "Must have at least 1 non zero entry.") + val elemsList = elems.toSparse.indices.toList + val hashValues = randCoefficients.map({ randCoefficient: Int => + elemsList.map({elem: Int => + (1 + elem) * randCoefficient.toLong % MinHash.prime % numEntries + }).min.toDouble + }) + Vectors.dense(hashValues) + } + + @Since("2.1.0") + override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { + val xSet = x.toSparse.indices.toSet + val ySet = y.toSparse.indices.toSet + val intersectionSize = xSet.intersect(ySet).size.toDouble + val unionSize = xSet.size + ySet.size - intersectionSize + assert(unionSize > 0, "The union of two input sets must have at least 1 elements") + 1 - intersectionSize / unionSize + } + + @Since("2.1.0") + override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { + // Since it's generated by hashing, it will be a pair of dense vectors. + x.toDense.values.zip(y.toDense.values).map(pair => math.abs(pair._1 - pair._2)).min + } + + @Since("2.1.0") + override def copy(extra: ParamMap): this.type = defaultCopy(extra) + + @Since("2.1.0") + override def write: MLWriter = new MinHashModel.MinHashModelWriter(this) +} + +/** + * :: Experimental :: + * + * LSH class for Jaccard distance. + * + * The input can be dense or sparse vectors, but it is more efficient if it is sparse. For example, + * `Vectors.sparse(10, Array[(2, 1.0), (3, 1.0), (5, 1.0)])` + * means there are 10 elements in the space. This set contains elem 2, elem 3 and elem 5. 
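 + * + * For instance (an illustrative aside, not part of the original patch): viewing + * `Vectors.sparse(10, Seq((2, 1.0), (3, 1.0), (5, 1.0)))` and + * `Vectors.sparse(10, Seq((3, 1.0), (5, 1.0), (7, 1.0)))` as the sets {2, 3, 5} and {3, 5, 7}, + * their Jaccard distance is 1 - |intersection| / |union| = 1 - 2 / 4 = 0.5, which is exactly + * what `keyDistance` above computes.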
+ * Also, any input vector must have at least 1 non-zero indices, and all non-zero values are treated + * as binary "1" values. + * + * References: + * [[https://en.wikipedia.org/wiki/MinHash Wikipedia on MinHash]] + */ +@Experimental +@Since("2.1.0") +class MinHash(override val uid: String) extends LSH[MinHashModel] with HasSeed { + + + @Since("2.1.0") + override def setInputCol(value: String): this.type = super.setInputCol(value) + + @Since("2.1.0") + override def setOutputCol(value: String): this.type = super.setOutputCol(value) + + @Since("2.1.0") + override def setOutputDim(value: Int): this.type = super.setOutputDim(value) + + @Since("2.1.0") + def this() = { + this(Identifiable.randomUID("min hash")) + } + + /** @group setParam */ + @Since("2.1.0") + def setSeed(value: Long): this.type = set(seed, value) + + @Since("2.1.0") + override protected[ml] def createRawLSHModel(inputDim: Int): MinHashModel = { + require(inputDim <= MinHash.prime / 2, + s"The input vector dimension $inputDim exceeds the threshold ${MinHash.prime / 2}.") + val rand = new Random($(seed)) + val numEntry = inputDim * 2 + val randCoofs: Array[Int] = Array.fill($(outputDim))(1 + rand.nextInt(MinHash.prime - 1)) + new MinHashModel(uid, numEntry, randCoofs) + } + + @Since("2.1.0") + override def transformSchema(schema: StructType): StructType = { + SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) + validateAndTransformSchema(schema) + } + + @Since("2.1.0") + override def copy(extra: ParamMap): this.type = defaultCopy(extra) +} + +@Since("2.1.0") +object MinHash extends DefaultParamsReadable[MinHash] { + // A large prime smaller than sqrt(2^63 − 1) + private[ml] val prime = 2038074743 + + @Since("2.1.0") + override def load(path: String): MinHash = super.load(path) +} + +@Since("2.1.0") +object MinHashModel extends MLReadable[MinHashModel] { + + @Since("2.1.0") + override def read: MLReader[MinHashModel] = new MinHashModelReader + + @Since("2.1.0") + override def load(path: String): MinHashModel = super.load(path) + + private[MinHashModel] class MinHashModelWriter(instance: MinHashModel) extends MLWriter { + + private case class Data(numEntries: Int, randCoefficients: Array[Int]) + + override protected def saveImpl(path: String): Unit = { + DefaultParamsWriter.saveMetadata(instance, path, sc) + val data = Data(instance.numEntries, instance.randCoefficients) + val dataPath = new Path(path, "data").toString + sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) + } + } + + private class MinHashModelReader extends MLReader[MinHashModel] { + + /** Checked against metadata when loading model */ + private val className = classOf[MinHashModel].getName + + override def load(path: String): MinHashModel = { + val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + + val dataPath = new Path(path, "data").toString + val data = sparkSession.read.parquet(dataPath).select("numEntries", "randCoefficients").head() + val numEntries = data.getAs[Int](0) + val randCoefficients = data.getAs[Seq[Int]](1).toArray + val model = new MinHashModel(metadata.uid, numEntries, randCoefficients) + + DefaultParamsReader.getAndSetParams(model, metadata) + model + } + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala new file mode 100644 index 0000000000000..1b524c6710b42 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -0,0 +1,225 @@ +/* 
+ * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import scala.util.Random + +import breeze.linalg.normalize +import org.apache.hadoop.fs.Path + +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.ml.linalg._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared.HasSeed +import org.apache.spark.ml.util._ +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.StructType + +/** + * :: Experimental :: + * + * Params for [[RandomProjection]]. + */ +private[ml] trait RandomProjectionParams extends Params { + + /** + * The length of each hash bucket, a larger bucket lowers the false negative rate. The number of + * buckets will be `(max L2 norm of input vectors) / bucketLength`. + * + * + * If input vectors are normalized, 1-10 times of pow(numRecords, -1/inputDim) would be a + * reasonable value + * @group param + */ + val bucketLength: DoubleParam = new DoubleParam(this, "bucketLength", + "the length of each hash bucket, a larger bucket lowers the false negative rate.", + ParamValidators.gt(0)) + + /** @group getParam */ + final def getBucketLength: Double = $(bucketLength) +} + +/** + * :: Experimental :: + * + * Model produced by [[RandomProjection]], where multiple random vectors are stored. The vectors + * are normalized to be unit vectors and each vector is used in a hash function: + * `h_i(x) = floor(r_i.dot(x) / bucketLength)` + * where `r_i` is the i-th random unit vector. The number of buckets will be `(max L2 norm of input + * vectors) / bucketLength`. + * + * @param randUnitVectors An array of random unit vectors. Each vector represents a hash function. + */ +@Experimental +@Since("2.1.0") +class RandomProjectionModel private[ml] ( + override val uid: String, + @Since("2.1.0") val randUnitVectors: Array[Vector]) + extends LSHModel[RandomProjectionModel] with RandomProjectionParams { + + @Since("2.1.0") + override protected[ml] val hashFunction: (Vector) => Vector = { + key: Vector => { + val hashValues: Array[Double] = randUnitVectors.map({ + randUnitVector => Math.floor(BLAS.dot(key, randUnitVector) / $(bucketLength)) + }) + Vectors.dense(hashValues) + } + } + + @Since("2.1.0") + override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { + Math.sqrt(Vectors.sqdist(x, y)) + } + + @Since("2.1.0") + override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { + // Since it's generated by hashing, it will be a pair of dense vectors. 
+ x.toDense.values.zip(y.toDense.values).map(pair => math.abs(pair._1 - pair._2)).min + } + + @Since("2.1.0") + override def copy(extra: ParamMap): this.type = defaultCopy(extra) + + @Since("2.1.0") + override def write: MLWriter = new RandomProjectionModel.RandomProjectionModelWriter(this) +} + +/** + * :: Experimental :: + * + * This [[RandomProjection]] implements Locality Sensitive Hashing functions for Euclidean + * distance metrics. + * + * The input is dense or sparse vectors, each of which represents a point in the Euclidean + * distance space. The output will be vectors of configurable dimension. Hash value in the same + * dimension is calculated by the same hash function. + * + * References: + * + * 1. [[https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions + * Wikipedia on Stable Distributions]] + * + * 2. Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint + * arXiv:1408.2927 (2014). + */ +@Experimental +@Since("2.1.0") +class RandomProjection(override val uid: String) extends LSH[RandomProjectionModel] + with RandomProjectionParams with HasSeed { + + @Since("2.1.0") + override def setInputCol(value: String): this.type = super.setInputCol(value) + + @Since("2.1.0") + override def setOutputCol(value: String): this.type = super.setOutputCol(value) + + @Since("2.1.0") + override def setOutputDim(value: Int): this.type = super.setOutputDim(value) + + @Since("2.1.0") + def this() = { + this(Identifiable.randomUID("random projection")) + } + + /** @group setParam */ + @Since("2.1.0") + def setBucketLength(value: Double): this.type = set(bucketLength, value) + + /** @group setParam */ + @Since("2.1.0") + def setSeed(value: Long): this.type = set(seed, value) + + @Since("2.1.0") + override protected[this] def createRawLSHModel(inputDim: Int): RandomProjectionModel = { + val rand = new Random($(seed)) + val randUnitVectors: Array[Vector] = { + Array.fill($(outputDim)) { + val randArray = Array.fill(inputDim)(rand.nextGaussian()) + Vectors.fromBreeze(normalize(breeze.linalg.Vector(randArray))) + } + } + new RandomProjectionModel(uid, randUnitVectors) + } + + @Since("2.1.0") + override def transformSchema(schema: StructType): StructType = { + SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) + validateAndTransformSchema(schema) + } + + @Since("2.1.0") + override def copy(extra: ParamMap): this.type = defaultCopy(extra) +} + +@Since("2.1.0") +object RandomProjection extends DefaultParamsReadable[RandomProjection] { + + @Since("2.1.0") + override def load(path: String): RandomProjection = super.load(path) +} + +@Since("2.1.0") +object RandomProjectionModel extends MLReadable[RandomProjectionModel] { + + @Since("2.1.0") + override def read: MLReader[RandomProjectionModel] = new RandomProjectionModelReader + + @Since("2.1.0") + override def load(path: String): RandomProjectionModel = super.load(path) + + private[RandomProjectionModel] class RandomProjectionModelWriter(instance: RandomProjectionModel) + extends MLWriter { + + // TODO: Save using the existing format of Array[Vector] once SPARK-12878 is resolved. 
+ private case class Data(randUnitVectors: Matrix) + + override protected def saveImpl(path: String): Unit = { + DefaultParamsWriter.saveMetadata(instance, path, sc) + val numRows = instance.randUnitVectors.length + require(numRows > 0) + val numCols = instance.randUnitVectors.head.size + val values = instance.randUnitVectors.map(_.toArray).reduce(Array.concat(_, _)) + val randMatrix = Matrices.dense(numRows, numCols, values) + val data = Data(randMatrix) + val dataPath = new Path(path, "data").toString + sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) + } + } + + private class RandomProjectionModelReader extends MLReader[RandomProjectionModel] { + + /** Checked against metadata when loading model */ + private val className = classOf[RandomProjectionModel].getName + + override def load(path: String): RandomProjectionModel = { + val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + + val dataPath = new Path(path, "data").toString + val data = sparkSession.read.parquet(dataPath) + val Row(randUnitVectors: Matrix) = MLUtils.convertMatrixColumnsToML(data, "randUnitVectors") + .select("randUnitVectors") + .head() + val model = new RandomProjectionModel(metadata.uid, randUnitVectors.rowIter.toArray) + + DefaultParamsReader.getAndSetParams(model, metadata) + model + } + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala new file mode 100644 index 0000000000000..5c025546f332b --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import org.apache.spark.ml.linalg.{Vector, VectorUDT} +import org.apache.spark.ml.util.SchemaUtils +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.DataTypes + +private[ml] object LSHTest { + /** + * For any locality sensitive function h in a metric space, we meed to verify whether + * the following property is satisfied. + * + * There exist dist1, dist2, p1, p2, so that for any two elements e1 and e2, + * If dist(e1, e2) <= dist1, then Pr{h(x) == h(y)} >= p1 + * If dist(e1, e2) >= dist2, then Pr{h(x) == h(y)} <= p2 + * + * This is called locality sensitive property. This method checks the property on an + * existing dataset and calculate the probabilities. 
+ * (https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Definition) + * + * This method hashes each elements to hash buckets using LSH, and calculate the false positive + * and false negative: + * False positive: Of all (e1, e2) sharing any bucket, the probability of dist(e1, e2) > distFP + * False negative: Of all (e1, e2) not sharing buckets, the probability of dist(e1, e2) < distFN + * + * @param dataset The dataset to verify the locality sensitive hashing property. + * @param lsh The lsh instance to perform the hashing + * @param distFP Distance threshold for false positive + * @param distFN Distance threshold for false negative + * @tparam T The class type of lsh + * @return A tuple of two doubles, representing the false positive and false negative rate + */ + def calculateLSHProperty[T <: LSHModel[T]]( + dataset: Dataset[_], + lsh: LSH[T], + distFP: Double, + distFN: Double): (Double, Double) = { + val model = lsh.fit(dataset) + val inputCol = model.getInputCol + val outputCol = model.getOutputCol + val transformedData = model.transform(dataset) + + SchemaUtils.checkColumnType(transformedData.schema, model.getOutputCol, new VectorUDT) + + // Perform a cross join and label each pair of same_bucket and distance + val pairs = transformedData.as("a").crossJoin(transformedData.as("b")) + val distUDF = udf((x: Vector, y: Vector) => model.keyDistance(x, y), DataTypes.DoubleType) + val sameBucket = udf((x: Vector, y: Vector) => model.hashDistance(x, y) == 0.0, + DataTypes.BooleanType) + val result = pairs + .withColumn("same_bucket", sameBucket(col(s"a.$outputCol"), col(s"b.$outputCol"))) + .withColumn("distance", distUDF(col(s"a.$inputCol"), col(s"b.$inputCol"))) + + // Compute the probabilities based on the join result + val positive = result.filter(col("same_bucket")) + val negative = result.filter(!col("same_bucket")) + val falsePositiveCount = positive.filter(col("distance") > distFP).count().toDouble + val falseNegativeCount = negative.filter(col("distance") < distFN).count().toDouble + (falsePositiveCount / positive.count(), falseNegativeCount / negative.count()) + } + + /** + * Compute the precision and recall of approximate nearest neighbors + * @param lsh The lsh instance + * @param dataset the dataset to look for the key + * @param key The key to hash for the item + * @param k The maximum number of items closest to the key + * @tparam T The class type of lsh + * @return A tuple of two doubles, representing precision and recall rate + */ + def calculateApproxNearestNeighbors[T <: LSHModel[T]]( + lsh: LSH[T], + dataset: Dataset[_], + key: Vector, + k: Int, + singleProbing: Boolean): (Double, Double) = { + val model = lsh.fit(dataset) + + // Compute expected + val distUDF = udf((x: Vector) => model.keyDistance(x, key), DataTypes.DoubleType) + val expected = dataset.sort(distUDF(col(model.getInputCol))).limit(k) + + // Compute actual + val actual = model.approxNearestNeighbors(dataset, key, k, singleProbing, "distCol") + + assert(actual.schema.sameType(model + .transformSchema(dataset.schema) + .add("distCol", DataTypes.DoubleType)) + ) + + if (!singleProbing) { + assert(actual.count() == k) + } + + // Compute precision and recall + val correctCount = expected.join(actual, model.getInputCol).count().toDouble + (correctCount / actual.count(), correctCount / expected.count()) + } + + /** + * Compute the precision and recall of approximate similarity join + * @param lsh The lsh instance + * @param datasetA One of the datasets to join + * @param datasetB Another dataset to join + * 
@param threshold The threshold for the distance of record pairs + * @tparam T The class type of lsh + * @return A tuple of two doubles, representing precision and recall rate + */ + def calculateApproxSimilarityJoin[T <: LSHModel[T]]( + lsh: LSH[T], + datasetA: Dataset[_], + datasetB: Dataset[_], + threshold: Double): (Double, Double) = { + val model = lsh.fit(datasetA) + val inputCol = model.getInputCol + + // Compute expected + val distUDF = udf((x: Vector, y: Vector) => model.keyDistance(x, y), DataTypes.DoubleType) + val expected = datasetA.as("a").crossJoin(datasetB.as("b")) + .filter(distUDF(col(s"a.$inputCol"), col(s"b.$inputCol")) < threshold) + + // Compute actual + val actual = model.approxSimilarityJoin(datasetA, datasetB, threshold) + + SchemaUtils.checkColumnType(actual.schema, "distCol", DataTypes.DoubleType) + assert(actual.schema.apply("datasetA").dataType + .sameType(model.transformSchema(datasetA.schema))) + assert(actual.schema.apply("datasetB").dataType + .sameType(model.transformSchema(datasetB.schema))) + + // Compute precision and recall + val correctCount = actual.filter(col("distCol") < threshold).count().toDouble + (correctCount / actual.count(), correctCount / expected.count()) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala new file mode 100644 index 0000000000000..c32ca7d69cf84 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.feature + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.Dataset + +class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + @transient var dataset: Dataset[_] = _ + + override def beforeAll(): Unit = { + super.beforeAll() + + val data = { + for (i <- 0 to 95) yield Vectors.sparse(100, (i until i + 5).map((_, 1.0))) + } + dataset = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + } + + test("params") { + ParamsSuite.checkParams(new MinHash) + val model = new MinHashModel("mh", numEntries = 2, randCoefficients = Array(1)) + ParamsSuite.checkParams(model) + } + + test("MinHash: default params") { + val rp = new MinHash + assert(rp.getOutputDim === 1.0) + } + + test("read/write") { + def checkModelData(model: MinHashModel, model2: MinHashModel): Unit = { + assert(model.numEntries === model2.numEntries) + assertResult(model.randCoefficients)(model2.randCoefficients) + } + val mh = new MinHash() + val settings = Map("inputCol" -> "keys", "outputCol" -> "values") + testEstimatorAndModelReadWrite(mh, dataset, settings, checkModelData) + } + + test("hashFunction") { + val model = new MinHashModel("mh", numEntries = 20, randCoefficients = Array(0, 1, 3)) + val res = model.hashFunction(Vectors.sparse(10, Seq((2, 1.0), (3, 1.0), (5, 1.0), (7, 1.0)))) + assert(res.equals(Vectors.dense(0.0, 3.0, 4.0))) + } + + test("keyDistance and hashDistance") { + val model = new MinHashModel("mh", numEntries = 20, randCoefficients = Array(1)) + val v1 = Vectors.sparse(10, Seq((2, 1.0), (3, 1.0), (5, 1.0), (7, 1.0))) + val v2 = Vectors.sparse(10, Seq((1, 1.0), (3, 1.0), (5, 1.0), (7, 1.0), (9, 1.0))) + val keyDist = model.keyDistance(v1, v2) + val hashDist = model.hashDistance(Vectors.dense(-5, 5), Vectors.dense(1, 2)) + assert(keyDist === 0.5) + assert(hashDist === 3) + } + + test("MinHash: test of LSH property") { + val mh = new MinHash() + .setOutputDim(1) + .setInputCol("keys") + .setOutputCol("values") + .setSeed(12344) + + val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(dataset, mh, 0.75, 0.5) + assert(falsePositive < 0.3) + assert(falseNegative < 0.3) + } + + test("approxNearestNeighbors for min hash") { + val mh = new MinHash() + .setOutputDim(20) + .setInputCol("keys") + .setOutputCol("values") + .setSeed(12345) + + val key: Vector = Vectors.sparse(100, + (0 until 100).filter(_.toString.contains("1")).map((_, 1.0))) + + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(mh, dataset, key, 20, + singleProbing = true) + assert(precision >= 0.7) + assert(recall >= 0.7) + } + + test("approxSimilarityJoin for minhash on different dataset") { + val data1 = { + for (i <- 0 until 20) yield Vectors.sparse(100, (5 * i until 5 * i + 5).map((_, 1.0))) + } + val df1 = spark.createDataFrame(data1.map(Tuple1.apply)).toDF("keys") + + val data2 = { + for (i <- 0 until 30) yield Vectors.sparse(100, (3 * i until 3 * i + 3).map((_, 1.0))) + } + val df2 = spark.createDataFrame(data2.map(Tuple1.apply)).toDF("keys") + + val mh = new MinHash() + .setOutputDim(20) + .setInputCol("keys") + .setOutputCol("values") + .setSeed(12345) + + val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(mh, df1, df2, 0.5) + assert(precision == 1.0) + assert(recall >= 0.7) + } +} diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala new file mode 100644 index 0000000000000..cd82ee2117a07 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import breeze.numerics.{cos, sin} +import breeze.numerics.constants.Pi + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.Dataset + +class RandomProjectionSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + @transient var dataset: Dataset[_] = _ + + override def beforeAll(): Unit = { + super.beforeAll() + + val data = { + for (i <- -10 until 10; j <- -10 until 10) yield Vectors.dense(i.toDouble, j.toDouble) + } + dataset = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + } + + test("params") { + ParamsSuite.checkParams(new RandomProjection) + val model = new RandomProjectionModel("rp", randUnitVectors = Array(Vectors.dense(1.0, 0.0))) + ParamsSuite.checkParams(model) + } + + test("RandomProjection: default params") { + val rp = new RandomProjection + assert(rp.getOutputDim === 1.0) + } + + test("read/write") { + def checkModelData(model: RandomProjectionModel, model2: RandomProjectionModel): Unit = { + model.randUnitVectors.zip(model2.randUnitVectors) + .foreach(pair => assert(pair._1 === pair._2)) + } + val mh = new RandomProjection() + val settings = Map("inputCol" -> "keys", "outputCol" -> "values", "bucketLength" -> 1.0) + testEstimatorAndModelReadWrite(mh, dataset, settings, checkModelData) + } + + test("hashFunction") { + val randUnitVectors = Array(Vectors.dense(0.0, 1.0), Vectors.dense(1.0, 0.0)) + val model = new RandomProjectionModel("rp", randUnitVectors) + model.set(model.bucketLength, 0.5) + val res = model.hashFunction(Vectors.dense(1.23, 4.56)) + assert(res.equals(Vectors.dense(9.0, 2.0))) + } + + test("keyDistance and hashDistance") { + val model = new RandomProjectionModel("rp", Array(Vectors.dense(0.0, 1.0))) + val keyDist = model.keyDistance(Vectors.dense(1, 2), Vectors.dense(-2, -2)) + val hashDist = model.hashDistance(Vectors.dense(-5, 5), Vectors.dense(1, 2)) + assert(keyDist === 5) + assert(hashDist === 3) + } + + test("RandomProjection: randUnitVectors") { + val rp = new RandomProjection() + .setOutputDim(20) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(1.0) + 
.setSeed(12345) + val unitVectors = rp.fit(dataset).randUnitVectors + unitVectors.foreach { v: Vector => + assert(Vectors.norm(v, 2.0) ~== 1.0 absTol 1e-14) + } + } + + test("RandomProjection: test of LSH property") { + // Project from 2 dimensional Euclidean Space to 1 dimensions + val rp = new RandomProjection() + .setOutputDim(1) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(1.0) + .setSeed(12345) + + val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(dataset, rp, 8.0, 2.0) + assert(falsePositive < 0.4) + assert(falseNegative < 0.4) + } + + test("RandomProjection with high dimension data: test of LSH property") { + val numDim = 100 + val data = { + for (i <- 0 until numDim; j <- Seq(-2, -1, 1, 2)) + yield Vectors.sparse(numDim, Seq((i, j.toDouble))) + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + + // Project from 100 dimensional Euclidean Space to 10 dimensions + val rp = new RandomProjection() + .setOutputDim(10) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(2.5) + .setSeed(12345) + + val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, rp, 3.0, 2.0) + assert(falsePositive < 0.3) + assert(falseNegative < 0.3) + } + + test("approxNearestNeighbors for random projection") { + val key = Vectors.dense(1.2, 3.4) + + val rp = new RandomProjection() + .setOutputDim(2) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(4.0) + .setSeed(12345) + + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, dataset, key, 100, + singleProbing = true) + assert(precision >= 0.6) + assert(recall >= 0.6) + } + + test("approxNearestNeighbors with multiple probing") { + val key = Vectors.dense(1.2, 3.4) + + val rp = new RandomProjection() + .setOutputDim(20) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(1.0) + .setSeed(12345) + + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, dataset, key, 100, + singleProbing = false) + assert(precision >= 0.7) + assert(recall >= 0.7) + } + + test("approxSimilarityJoin for random projection on different dataset") { + val data2 = { + for (i <- 0 until 24) yield Vectors.dense(10 * sin(Pi / 12 * i), 10 * cos(Pi / 12 * i)) + } + val dataset2 = spark.createDataFrame(data2.map(Tuple1.apply)).toDF("keys") + + val rp = new RandomProjection() + .setOutputDim(2) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(4.0) + .setSeed(12345) + + val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, dataset, dataset2, 1.0) + assert(precision == 1.0) + assert(recall >= 0.7) + } + + test("approxSimilarityJoin for self join") { + val data = { + for (i <- 0 until 24) yield Vectors.dense(10 * sin(Pi / 12 * i), 10 * cos(Pi / 12 * i)) + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + + val rp = new RandomProjection() + .setOutputDim(2) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(4.0) + .setSeed(12345) + + val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, df, df, 3.0) + assert(precision == 1.0) + assert(recall >= 0.7) + } +} From 59cccbda489f25add3e10997e950de7e88704aa7 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Fri, 28 Oct 2016 20:14:38 -0700 Subject: [PATCH 149/162] [SPARK-18164][SQL] ForeachSink should fail the Spark job if `process` throws exception ## What changes were proposed in this pull request? Fixed the issue that ForeachSink didn't rethrow the exception. ## How was this patch tested? The fixed unit test. 
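For illustration, here is a minimal sketch (not code from this patch) of the behaviour the fix below guarantees: a `ForeachWriter` whose `process` throws should now fail the streaming query instead of having the error swallowed. In this sketch, `input` stands for any streaming source such as a `MemoryStream[Int]` and `checkpointDir` for a temporary directory, and `intercept` is the ScalaTest helper used by the suite in the diff.

    // Sketch only: mirrors the pattern exercised by the updated ForeachSinkSuite.
    val failingWriter = new org.apache.spark.sql.ForeachWriter[Int] {
      override def open(partitionId: Long, version: Long): Boolean = true
      override def process(value: Int): Unit = throw new RuntimeException("error") // simulated sink failure
      override def close(errorOrNull: Throwable): Unit = () // still invoked, now with the error
    }
    val query = input.toDS().writeStream
      .option("checkpointLocation", checkpointDir) // a temporary directory, as in the suite
      .foreach(failingWriter)
      .start()
    input.addData(1, 2, 3)
    // With this change the exception propagates out of the batch:
    intercept[org.apache.spark.sql.streaming.StreamingQueryException] {
      query.processAllAvailable()
    }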
Author: Shixiong Zhu Closes #15674 from zsxwing/foreach-sink-error. --- .../sql/execution/streaming/ForeachSink.scala | 7 ++----- .../streaming/ForeachSinkSuite.scala | 19 ++++++++++++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala index 082664aa23f04..24f98b9211f12 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala @@ -68,19 +68,16 @@ class ForeachSink[T : Encoder](writer: ForeachWriter[T]) extends Sink with Seria } datasetWithIncrementalExecution.foreachPartition { iter => if (writer.open(TaskContext.getPartitionId(), batchId)) { - var isFailed = false try { while (iter.hasNext) { writer.process(iter.next()) } } catch { case e: Throwable => - isFailed = true writer.close(e) + throw e } - if (!isFailed) { - writer.close(null) - } + writer.close(null) } else { writer.close(null) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala index 7928b8e8775c2..9e059216110f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala @@ -23,8 +23,9 @@ import scala.collection.mutable import org.scalatest.BeforeAndAfter +import org.apache.spark.SparkException import org.apache.spark.sql.ForeachWriter -import org.apache.spark.sql.streaming.{OutputMode, StreamTest} +import org.apache.spark.sql.streaming.{OutputMode, StreamingQueryException, StreamTest} import org.apache.spark.sql.test.SharedSQLContext class ForeachSinkSuite extends StreamTest with SharedSQLContext with BeforeAndAfter { @@ -136,7 +137,7 @@ class ForeachSinkSuite extends StreamTest with SharedSQLContext with BeforeAndAf } } - test("foreach with error") { + testQuietly("foreach with error") { withTempDir { checkpointDir => val input = MemoryStream[Int] val query = input.toDS().repartition(1).writeStream @@ -148,16 +149,24 @@ class ForeachSinkSuite extends StreamTest with SharedSQLContext with BeforeAndAf } }).start() input.addData(1, 2, 3, 4) - query.processAllAvailable() + + // Error in `process` should fail the Spark job + val e = intercept[StreamingQueryException] { + query.processAllAvailable() + } + assert(e.getCause.isInstanceOf[SparkException]) + assert(e.getCause.getCause.getMessage === "error") + assert(query.isActive === false) val allEvents = ForeachSinkSuite.allEvents() assert(allEvents.size === 1) assert(allEvents(0)(0) === ForeachSinkSuite.Open(partition = 0, version = 0)) - assert(allEvents(0)(1) === ForeachSinkSuite.Process(value = 1)) + assert(allEvents(0)(1) === ForeachSinkSuite.Process(value = 1)) + + // `close` should be called with the error val errorEvent = allEvents(0)(2).asInstanceOf[ForeachSinkSuite.Close] assert(errorEvent.error.get.isInstanceOf[RuntimeException]) assert(errorEvent.error.get.getMessage === "error") - query.stop() } } } From d2d438d1d549628a0183e468ed11d6e85b5d6061 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 29 Oct 2016 06:49:57 +0200 Subject: [PATCH 150/162] [SPARK-18167][SQL] Add debug code for SQLQuerySuite flakiness when metastore partition pruning is enabled ## What changes were proposed in this 
pull request? org.apache.spark.sql.hive.execution.SQLQuerySuite is flaking when hive partition pruning is enabled. Based on the stack traces, it seems to be an old issue where Hive fails to cast a numeric partition column ("Invalid character string format for type DECIMAL"). There are two possibilities here: either we are somehow corrupting the partition table to have non-decimal values in that column, or there is a transient issue with Derby. This PR logs the result of the retry when this exception is encountered, so we can confirm what is going on. ## How was this patch tested? n/a cc yhuai Author: Eric Liang Closes #15676 from ericl/spark-18167. --- .../apache/spark/sql/hive/client/HiveShim.scala | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 32387707612f4..4bbbd66132b75 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -24,6 +24,7 @@ import java.util.{ArrayList => JArrayList, List => JList, Map => JMap, Set => JS import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ +import scala.util.Try import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileSystem, Path} @@ -585,7 +586,19 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]] } else { logDebug(s"Hive metastore filter is '$filter'.") - getPartitionsByFilterMethod.invoke(hive, table, filter).asInstanceOf[JArrayList[Partition]] + try { + getPartitionsByFilterMethod.invoke(hive, table, filter) + .asInstanceOf[JArrayList[Partition]] + } catch { + case e: InvocationTargetException => + // SPARK-18167 retry to investigate the flaky test. This should be reverted before + // the release is cut. + val retry = Try(getPartitionsByFilterMethod.invoke(hive, table, filter)) + val full = Try(getAllPartitionsMethod.invoke(hive, table)) + logError("getPartitionsByFilter failed, retry success = " + retry.isSuccess) + logError("getPartitionsByFilter failed, full fetch success = " + full.isSuccess) + throw e + } } partitions.asScala.toSeq From 505b927cb7ff037adb797b9c3b9ecac3f885b7c8 Mon Sep 17 00:00:00 2001 From: Liwei Lin Date: Sun, 30 Oct 2016 09:32:19 +0000 Subject: [PATCH 151/162] [SPARK-16312][FOLLOW-UP][STREAMING][KAFKA][DOC] Add java code snippet for Kafka 0.10 integration doc ## What changes were proposed in this pull request? added java code snippet for Kafka 0.10 integration doc ## How was this patch tested? SKIP_API=1 jekyll build ## Screenshot ![kafka-doc](https://cloud.githubusercontent.com/assets/15843379/19826272/bf0d8a4c-9db8-11e6-9e40-1396723df4bc.png) Author: Liwei Lin Closes #15679 from lw-lin/kafka-010-examples. 
--- docs/streaming-kafka-0-10-integration.md | 133 +++++++++++++++++++++-- 1 file changed, 122 insertions(+), 11 deletions(-) diff --git a/docs/streaming-kafka-0-10-integration.md b/docs/streaming-kafka-0-10-integration.md index de95ea90137eb..c1ef396907db7 100644 --- a/docs/streaming-kafka-0-10-integration.md +++ b/docs/streaming-kafka-0-10-integration.md @@ -8,9 +8,9 @@ The Spark Streaming integration for Kafka 0.10 is similar in design to the 0.8 [ ### Linking For Scala/Java applications using SBT/Maven project definitions, link your streaming application with the following artifact (see [Linking section](streaming-programming-guide.html#linking) in the main programming guide for further information). - groupId = org.apache.spark - artifactId = spark-streaming-kafka-0-10_{{site.SCALA_BINARY_VERSION}} - version = {{site.SPARK_VERSION_SHORT}} + groupId = org.apache.spark + artifactId = spark-streaming-kafka-0-10_{{site.SCALA_BINARY_VERSION}} + version = {{site.SPARK_VERSION_SHORT}} ### Creating a Direct Stream Note that the namespace for the import includes the version, org.apache.spark.streaming.kafka010 @@ -44,6 +44,42 @@ For Scala/Java applications using SBT/Maven project definitions, link your strea Each item in the stream is a [ConsumerRecord](http://kafka.apache.org/0100/javadoc/org/apache/kafka/clients/consumer/ConsumerRecord.html)

    + import java.util.*; + import org.apache.spark.SparkConf; + import org.apache.spark.TaskContext; + import org.apache.spark.api.java.*; + import org.apache.spark.api.java.function.*; + import org.apache.spark.streaming.api.java.*; + import org.apache.spark.streaming.kafka010.*; + import org.apache.kafka.clients.consumer.ConsumerRecord; + import org.apache.kafka.common.TopicPartition; + import org.apache.kafka.common.serialization.StringDeserializer; + import scala.Tuple2; + + Map<String, Object> kafkaParams = new HashMap<>(); + kafkaParams.put("bootstrap.servers", "localhost:9092,anotherhost:9092"); + kafkaParams.put("key.deserializer", StringDeserializer.class); + kafkaParams.put("value.deserializer", StringDeserializer.class); + kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream"); + kafkaParams.put("auto.offset.reset", "latest"); + kafkaParams.put("enable.auto.commit", false); + + Collection<String> topics = Arrays.asList("topicA", "topicB"); + + final JavaInputDStream<ConsumerRecord<String, String>> stream = + KafkaUtils.createDirectStream( + streamingContext, + LocationStrategies.PreferConsistent(), + ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams) + ); + + stream.mapToPair( + new PairFunction<ConsumerRecord<String, String>, String, String>() { + @Override + public Tuple2<String, String> call(ConsumerRecord<String, String> record) { + return new Tuple2<>(record.key(), record.value()); + } + })
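For comparison, the Scala tab of this section expresses the same key/value extraction as a one-line map over the stream (a sketch; `stream` here is the result of the Scala `createDirectStream[String, String]` call):

    stream.map(record => (record.key, record.value))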
    @@ -85,6 +121,20 @@ If you have a use case that is better suited to batch processing, you can create
    + // Import dependencies and create kafka params as in Create Direct Stream above + + OffsetRange[] offsetRanges = { + // topic, partition, inclusive starting offset, exclusive ending offset + OffsetRange.create("test", 0, 0, 100), + OffsetRange.create("test", 1, 0, 100) + }; + + JavaRDD<ConsumerRecord<String, String>> rdd = KafkaUtils.createRDD( + sparkContext, + kafkaParams, + offsetRanges, + LocationStrategies.PreferConsistent() + );
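The Scala counterpart is a sketch for comparison only, mirroring the Scala tab of this page; it assumes `kafkaParams` is available as a `java.util.Map[String, Object]` (for example the Scala map converted with `asJava`):

    val offsetRanges = Array(
      // topic, partition, inclusive starting offset, exclusive ending offset
      OffsetRange("test", 0, 0, 100),
      OffsetRange("test", 1, 0, 100)
    )
    val rdd = KafkaUtils.createRDD[String, String](
      sparkContext, kafkaParams, offsetRanges, LocationStrategies.PreferConsistent)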
    @@ -103,6 +153,20 @@ Note that you cannot use `PreferBrokers`, because without the stream there is no }
    + stream.foreachRDD(new VoidFunction<JavaRDD<ConsumerRecord<String, String>>>() { + @Override + public void call(JavaRDD<ConsumerRecord<String, String>> rdd) { + final OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); + rdd.foreachPartition(new VoidFunction<Iterator<ConsumerRecord<String, String>>>() { + @Override + public void call(Iterator<ConsumerRecord<String, String>> consumerRecords) { + OffsetRange o = offsetRanges[TaskContext.get().partitionId()]; + System.out.println( + o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset()); + } + }); + } + });
    @@ -120,15 +184,24 @@ Kafka has an offset commit API that stores offsets in a special Kafka topic. By
    stream.foreachRDD { rdd => - val offsets = rdd.asInstanceOf[HasOffsetRanges].offsetRanges + val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges // some time later, after outputs have completed - stream.asInstanceOf[CanCommitOffsets].commitAsync(offsets) + stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges) } As with HasOffsetRanges, the cast to CanCommitOffsets will only succeed if called on the result of createDirectStream, not after transformations. The commitAsync call is threadsafe, but must occur after outputs if you want meaningful semantics.
    + stream.foreachRDD(new VoidFunction<JavaRDD<ConsumerRecord<String, String>>>() { + @Override + public void call(JavaRDD<ConsumerRecord<String, String>> rdd) { + OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); + + // some time later, after outputs have completed + ((CanCommitOffsets) stream.inputDStream()).commitAsync(offsetRanges); + } + });
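If you need to observe whether an asynchronous commit actually succeeded, `CanCommitOffsets` also exposes an overload taking Kafka's `OffsetCommitCallback` (a sketch in Scala; the `println` stands in for whatever error handling or logging you use):

    import org.apache.kafka.clients.consumer.{OffsetAndMetadata, OffsetCommitCallback}

    stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges, new OffsetCommitCallback {
      override def onComplete(
          offsets: java.util.Map[TopicPartition, OffsetAndMetadata],
          exception: Exception): Unit = {
        if (exception != null) println(s"offset commit failed: $exception") // replace with real logging
      }
    })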
    @@ -141,7 +214,7 @@ For data stores that support transactions, saving offsets in the same transactio // begin from the the offsets committed to the database val fromOffsets = selectOffsetsFromYourDatabase.map { resultSet => - new TopicPartition(resultSet.string("topic")), resultSet.int("partition")) -> resultSet.long("offset") + new TopicPartition(resultSet.string("topic"), resultSet.int("partition")) -> resultSet.long("offset") }.toMap val stream = KafkaUtils.createDirectStream[String, String]( @@ -155,16 +228,46 @@ For data stores that support transactions, saving offsets in the same transactio val results = yourCalculation(rdd) - yourTransactionBlock { - // update results + // begin your transaction - // update offsets where the end of existing offsets matches the beginning of this batch of offsets + // update results + // update offsets where the end of existing offsets matches the beginning of this batch of offsets + // assert that offsets were updated correctly - // assert that offsets were updated correctly - } + // end your transaction }
    + // The details depend on your data store, but the general idea looks like this + + // begin from the offsets committed to the database + Map<TopicPartition, Long> fromOffsets = new HashMap<>(); + for (resultSet : selectOffsetsFromYourDatabase) { + fromOffsets.put(new TopicPartition(resultSet.string("topic"), resultSet.int("partition")), resultSet.long("offset")); + } + + JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream( + streamingContext, + LocationStrategies.PreferConsistent(), + ConsumerStrategies.<String, String>Assign(fromOffsets.keySet(), kafkaParams, fromOffsets) + ); + + stream.foreachRDD(new VoidFunction<JavaRDD<ConsumerRecord<String, String>>>() { + @Override + public void call(JavaRDD<ConsumerRecord<String, String>> rdd) { + OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); + + Object results = yourCalculation(rdd); + + // begin your transaction + + // update results + // update offsets where the end of existing offsets matches the beginning of this batch of offsets + // assert that offsets were updated correctly + + // end your transaction + } + });
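One concrete way to fill in the transaction placeholders, shown as a Scala/JDBC sketch run inside the `foreachRDD` body above. It assumes a relational store with an offsets table keyed by topic and partition; `jdbcUrl`, `saveResults`, the table and column names are illustrative only, while `results` and `offsetRanges` come from the snippet above.

    // Illustrative sketch only; connection pooling and retries omitted.
    val conn = java.sql.DriverManager.getConnection(jdbcUrl)
    conn.setAutoCommit(false) // begin your transaction
    try {
      saveResults(conn, results) // update results: your own write of this batch's output
      val stmt = conn.prepareStatement(
        "UPDATE kafka_offsets SET until_offset = ? WHERE topic = ? AND kafka_partition = ? AND until_offset = ?")
      for (o <- offsetRanges) {
        stmt.setLong(1, o.untilOffset)
        stmt.setString(2, o.topic)
        stmt.setInt(3, o.partition)
        stmt.setLong(4, o.fromOffset)
        // assert that offsets were updated correctly: the stored end must match this batch's start
        assert(stmt.executeUpdate() == 1, s"offsets for $o were not in the expected state")
      }
      conn.commit() // end your transaction
    } catch {
      case e: Exception => conn.rollback(); throw e
    } finally {
      conn.close()
    }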
    @@ -185,6 +288,14 @@ The new Kafka consumer [supports SSL](http://kafka.apache.org/documentation.html )
    + Map<String, Object> kafkaParams = new HashMap<String, Object>(); + // the usual params, make sure to change the port in bootstrap.servers if 9092 is not TLS + kafkaParams.put("security.protocol", "SSL"); + kafkaParams.put("ssl.truststore.location", "/some-directory/kafka.client.truststore.jks"); + kafkaParams.put("ssl.truststore.password", "test1234"); + kafkaParams.put("ssl.keystore.location", "/some-directory/kafka.client.keystore.jks"); + kafkaParams.put("ssl.keystore.password", "test1234"); + kafkaParams.put("ssl.key.password", "test1234");
    From a489567e36e671cee290f8d69188837a8b1a75b3 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sun, 30 Oct 2016 09:36:23 +0000 Subject: [PATCH 152/162] [SPARK-3261][MLLIB] KMeans clusterer can return duplicate cluster centers ## What changes were proposed in this pull request? Return potentially fewer than k cluster centers in cases where k distinct centroids aren't available or aren't selected. ## How was this patch tested? Existing tests Author: Sean Owen Closes #15450 from srowen/SPARK-3261. --- .../apache/spark/ml/clustering/KMeans.scala | 4 +- .../spark/mllib/clustering/KMeans.scala | 27 ++-- .../spark/mllib/clustering/KMeansSuite.scala | 119 ++++++++++-------- 3 files changed, 85 insertions(+), 65 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index 05ed3223ae537..85bb8c93b3fa9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -41,7 +41,9 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe with HasSeed with HasPredictionCol with HasTol { /** - * The number of clusters to create (k). Must be > 1. Default: 2. + * The number of clusters to create (k). Must be > 1. Note that it is possible for fewer than + * k clusters to be returned, for example, if there are fewer than k distinct points to cluster. + * Default: 2. * @group param */ @Since("1.5.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 68a7b3b6763af..ed9c064879d01 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -56,13 +56,15 @@ class KMeans private ( def this() = this(2, 20, KMeans.K_MEANS_PARALLEL, 2, 1e-4, Utils.random.nextLong()) /** - * Number of clusters to create (k). + * Number of clusters to create (k). Note that it is possible for fewer than k clusters to + * be returned, for example, if there are fewer than k distinct points to cluster. */ @Since("1.4.0") def getK: Int = k /** - * Set the number of clusters to create (k). Default: 2. + * Set the number of clusters to create (k). Note that it is possible for fewer than k clusters to + * be returned, for example, if there are fewer than k distinct points to cluster. Default: 2. */ @Since("0.8.0") def setK(k: Int): this.type = { @@ -323,7 +325,10 @@ class KMeans private ( * Initialize a set of cluster centers at random. */ private def initRandom(data: RDD[VectorWithNorm]): Array[VectorWithNorm] = { - data.takeSample(true, k, new XORShiftRandom(this.seed).nextInt()).map(_.toDense) + // Select without replacement; may still produce duplicates if the data has < k distinct + // points, so deduplicate the centroids to match the behavior of k-means|| in the same situation + data.takeSample(false, k, new XORShiftRandom(this.seed).nextInt()) + .map(_.vector).distinct.map(new VectorWithNorm(_)) } /** @@ -335,7 +340,7 @@ class KMeans private ( * * The original paper can be found at http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf. */ - private def initKMeansParallel(data: RDD[VectorWithNorm]): Array[VectorWithNorm] = { + private[clustering] def initKMeansParallel(data: RDD[VectorWithNorm]): Array[VectorWithNorm] = { // Initialize empty centers and point costs. 
var costs = data.map(_ => Double.PositiveInfinity) @@ -378,19 +383,21 @@ class KMeans private ( costs.unpersist(blocking = false) bcNewCentersList.foreach(_.destroy(false)) - if (centers.size == k) { - centers.toArray + val distinctCenters = centers.map(_.vector).distinct.map(new VectorWithNorm(_)) + + if (distinctCenters.size <= k) { + distinctCenters.toArray } else { - // Finally, we might have a set of more or less than k candidate centers; weight each + // Finally, we might have a set of more than k distinct candidate centers; weight each // candidate by the number of points in the dataset mapping to it and run a local k-means++ // on the weighted centers to pick k of them - val bcCenters = data.context.broadcast(centers) + val bcCenters = data.context.broadcast(distinctCenters) val countMap = data.map(KMeans.findClosest(bcCenters.value, _)._1).countByValue() bcCenters.destroy(blocking = false) - val myWeights = centers.indices.map(countMap.getOrElse(_, 0L).toDouble).toArray - LocalKMeans.kMeansPlusPlus(0, centers.toArray, myWeights, k, 30) + val myWeights = distinctCenters.indices.map(countMap.getOrElse(_, 0L).toDouble).toArray + LocalKMeans.kMeansPlusPlus(0, distinctCenters.toArray, myWeights, k, 30) } } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala index 2d35b312083c0..48bd41dc3e3bf 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala @@ -29,6 +29,8 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { import org.apache.spark.mllib.clustering.KMeans.{K_MEANS_PARALLEL, RANDOM} + private val seed = 42 + test("single cluster") { val data = sc.parallelize(Array( Vectors.dense(1.0, 2.0, 6.0), @@ -38,7 +40,7 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val center = Vectors.dense(1.0, 3.0, 4.0) - // No matter how many runs or iterations we use, we should get one cluster, + // No matter how many iterations we use, we should get one cluster, // centered at the mean of the points var model = KMeans.train(data, k = 1, maxIterations = 1) @@ -50,44 +52,72 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { model = KMeans.train(data, k = 1, maxIterations = 5) assert(model.clusterCenters.head ~== center absTol 1E-5) - model = KMeans.train(data, k = 1, maxIterations = 1, runs = 5) - assert(model.clusterCenters.head ~== center absTol 1E-5) - - model = KMeans.train(data, k = 1, maxIterations = 1, runs = 5) - assert(model.clusterCenters.head ~== center absTol 1E-5) - - model = KMeans.train(data, k = 1, maxIterations = 1, runs = 1, initializationMode = RANDOM) + model = KMeans.train(data, k = 1, maxIterations = 1, initializationMode = RANDOM) assert(model.clusterCenters.head ~== center absTol 1E-5) model = KMeans.train( - data, k = 1, maxIterations = 1, runs = 1, initializationMode = K_MEANS_PARALLEL) + data, k = 1, maxIterations = 1, initializationMode = K_MEANS_PARALLEL) assert(model.clusterCenters.head ~== center absTol 1E-5) } - test("no distinct points") { + test("fewer distinct points than clusters") { val data = sc.parallelize( Array( Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(1.0, 2.0, 3.0)), 2) - val center = Vectors.dense(1.0, 2.0, 3.0) - // Make sure code runs. 
- var model = KMeans.train(data, k = 2, maxIterations = 1) - assert(model.clusterCenters.size === 2) - } + var model = KMeans.train(data, k = 2, maxIterations = 1, initializationMode = "random") + assert(model.clusterCenters.length === 1) - test("more clusters than points") { - val data = sc.parallelize( - Array( - Vectors.dense(1.0, 2.0, 3.0), - Vectors.dense(1.0, 3.0, 4.0)), - 2) + model = KMeans.train(data, k = 2, maxIterations = 1, initializationMode = "k-means||") + assert(model.clusterCenters.length === 1) + } - // Make sure code runs. - var model = KMeans.train(data, k = 3, maxIterations = 1) - assert(model.clusterCenters.size === 3) + test("unique cluster centers") { + val rng = new Random(seed) + val numDistinctPoints = 10 + val points = (0 until numDistinctPoints).map(i => Vectors.dense(Array.fill(3)(rng.nextDouble))) + val data = sc.parallelize(points.flatMap(Array.fill(1 + rng.nextInt(3))(_)), 2) + val normedData = data.map(new VectorWithNorm(_)) + + // less centers than k + val km = new KMeans().setK(50) + .setMaxIterations(5) + .setInitializationMode("k-means||") + .setInitializationSteps(10) + .setSeed(seed) + val initialCenters = km.initKMeansParallel(normedData).map(_.vector) + assert(initialCenters.length === initialCenters.distinct.length) + assert(initialCenters.length <= numDistinctPoints) + + val model = km.run(data) + val finalCenters = model.clusterCenters + assert(finalCenters.length === finalCenters.distinct.length) + + // run local k-means + val k = 10 + val km2 = new KMeans().setK(k) + .setMaxIterations(5) + .setInitializationMode("k-means||") + .setInitializationSteps(10) + .setSeed(seed) + val initialCenters2 = km2.initKMeansParallel(normedData).map(_.vector) + assert(initialCenters2.length === initialCenters2.distinct.length) + assert(initialCenters2.length === k) + + val model2 = km2.run(data) + val finalCenters2 = model2.clusterCenters + assert(finalCenters2.length === finalCenters2.distinct.length) + + val km3 = new KMeans().setK(k) + .setMaxIterations(5) + .setInitializationMode("random") + .setSeed(seed) + val model3 = km3.run(data) + val finalCenters3 = model3.clusterCenters + assert(finalCenters3.length === finalCenters3.distinct.length) } test("deterministic initialization") { @@ -97,12 +127,12 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { for (initMode <- Seq(RANDOM, K_MEANS_PARALLEL)) { // Create three deterministic models and compare cluster means - val model1 = KMeans.train(rdd, k = 10, maxIterations = 2, runs = 1, - initializationMode = initMode, seed = 42) + val model1 = KMeans.train(rdd, k = 10, maxIterations = 2, + initializationMode = initMode, seed = seed) val centers1 = model1.clusterCenters - val model2 = KMeans.train(rdd, k = 10, maxIterations = 2, runs = 1, - initializationMode = initMode, seed = 42) + val model2 = KMeans.train(rdd, k = 10, maxIterations = 2, + initializationMode = initMode, seed = seed) val centers2 = model2.clusterCenters centers1.zip(centers2).foreach { case (c1, c2) => @@ -119,7 +149,7 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { ) val data = sc.parallelize((1 to 100).flatMap(_ => smallData), 4) - // No matter how many runs or iterations we use, we should get one cluster, + // No matter how many iterations we use, we should get one cluster, // centered at the mean of the points val center = Vectors.dense(1.0, 3.0, 4.0) @@ -134,17 +164,10 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { model = KMeans.train(data, k = 1, maxIterations = 5) 
assert(model.clusterCenters.head ~== center absTol 1E-5) - model = KMeans.train(data, k = 1, maxIterations = 1, runs = 5) + model = KMeans.train(data, k = 1, maxIterations = 1, initializationMode = RANDOM) assert(model.clusterCenters.head ~== center absTol 1E-5) - model = KMeans.train(data, k = 1, maxIterations = 1, runs = 5) - assert(model.clusterCenters.head ~== center absTol 1E-5) - - model = KMeans.train(data, k = 1, maxIterations = 1, runs = 1, initializationMode = RANDOM) - assert(model.clusterCenters.head ~== center absTol 1E-5) - - model = KMeans.train(data, k = 1, maxIterations = 1, runs = 1, - initializationMode = K_MEANS_PARALLEL) + model = KMeans.train(data, k = 1, maxIterations = 1, initializationMode = K_MEANS_PARALLEL) assert(model.clusterCenters.head ~== center absTol 1E-5) } @@ -165,7 +188,7 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { data.persist() - // No matter how many runs or iterations we use, we should get one cluster, + // No matter how many iterations we use, we should get one cluster, // centered at the mean of the points val center = Vectors.sparse(n, Seq((0, 1.0), (1, 3.0), (2, 4.0))) @@ -179,17 +202,10 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { model = KMeans.train(data, k = 1, maxIterations = 5) assert(model.clusterCenters.head ~== center absTol 1E-5) - model = KMeans.train(data, k = 1, maxIterations = 1, runs = 5) - assert(model.clusterCenters.head ~== center absTol 1E-5) - - model = KMeans.train(data, k = 1, maxIterations = 1, runs = 5) + model = KMeans.train(data, k = 1, maxIterations = 1, initializationMode = RANDOM) assert(model.clusterCenters.head ~== center absTol 1E-5) - model = KMeans.train(data, k = 1, maxIterations = 1, runs = 1, initializationMode = RANDOM) - assert(model.clusterCenters.head ~== center absTol 1E-5) - - model = KMeans.train(data, k = 1, maxIterations = 1, runs = 1, - initializationMode = K_MEANS_PARALLEL) + model = KMeans.train(data, k = 1, maxIterations = 1, initializationMode = K_MEANS_PARALLEL) assert(model.clusterCenters.head ~== center absTol 1E-5) data.unpersist() @@ -230,11 +246,6 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { model = KMeans.train(rdd, k = 5, maxIterations = 10) assert(model.clusterCenters.sortBy(VectorWithCompare(_)) .zip(points.sortBy(VectorWithCompare(_))).forall(x => x._1 ~== (x._2) absTol 1E-5)) - - // Neither should more runs - model = KMeans.train(rdd, k = 5, maxIterations = 10, runs = 5) - assert(model.clusterCenters.sortBy(VectorWithCompare(_)) - .zip(points.sortBy(VectorWithCompare(_))).forall(x => x._1 ~== (x._2) absTol 1E-5)) } test("two clusters") { @@ -250,7 +261,7 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { for (initMode <- Seq(RANDOM, K_MEANS_PARALLEL)) { // Two iterations are sufficient no matter where the initial centers are. - val model = KMeans.train(rdd, k = 2, maxIterations = 2, runs = 1, initMode) + val model = KMeans.train(rdd, k = 2, maxIterations = 2, initMode) val predicts = model.predict(rdd).collect() From 3ad99f166494950665c137fd5dea636afa0feb10 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 30 Oct 2016 20:27:38 +0800 Subject: [PATCH 153/162] [SPARK-18146][SQL] Avoid using Union to chain together create table and repair partition commands ## What changes were proposed in this pull request? The behavior of union is not well defined here. It is safer to explicitly execute these commands in order. 
The other use of `Union` in this way will be removed by https://github.com/apache/spark/pull/15633 ## How was this patch tested? Existing tests. cc yhuai cloud-fan Author: Eric Liang Author: Eric Liang Closes #15665 from ericl/spark-18146. --- .../scala/org/apache/spark/sql/DataFrameWriter.scala | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 7ff3522f547d3..11dd1df909938 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -388,16 +388,14 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { partitionColumnNames = partitioningColumns.getOrElse(Nil), bucketSpec = getBucketSpec ) - val createCmd = CreateTable(tableDesc, mode, Some(df.logicalPlan)) - val cmd = if (tableDesc.partitionColumnNames.nonEmpty && + df.sparkSession.sessionState.executePlan( + CreateTable(tableDesc, mode, Some(df.logicalPlan))).toRdd + if (tableDesc.partitionColumnNames.nonEmpty && df.sparkSession.sqlContext.conf.manageFilesourcePartitions) { // Need to recover partitions into the metastore so our saved data is visible. - val recoverPartitionCmd = AlterTableRecoverPartitionsCommand(tableDesc.identifier) - Union(createCmd, recoverPartitionCmd) - } else { - createCmd + df.sparkSession.sessionState.executePlan( + AlterTableRecoverPartitionsCommand(tableDesc.identifier)).toRdd } - df.sparkSession.sessionState.executePlan(cmd).toRdd } } From 90d3b91f4cb59d84fea7105d54ef8c87a7d5c6a2 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 30 Oct 2016 13:14:45 -0700 Subject: [PATCH 154/162] [SPARK-18103][SQL] Rename *FileCatalog to *FileIndex ## What changes were proposed in this pull request? To reduce the number of components in SQL named *Catalog, rename *FileCatalog to *FileIndex. A FileIndex is responsible for returning the list of partitions / files to scan given a filtering expression. ``` TableFileCatalog => CatalogFileIndex FileCatalog => FileIndex ListingFileCatalog => InMemoryFileIndex MetadataLogFileCatalog => MetadataLogFileIndex PrunedTableFileCatalog => PrunedInMemoryFileIndex ``` cc yhuai marmbrus ## How was this patch tested? N/A Author: Eric Liang Author: Eric Liang Closes #15634 from ericl/rename-file-provider. 
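To make the rename concrete, a hedged sketch of a call site before and after this patch. These classes live in sql/core's execution.datasources package and are internal APIs; the path, session and variable names are placeholders, and the constructor and `allFiles()` usage simply mirror what appears in the diff below.

```scala
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.InMemoryFileIndex

val spark = SparkSession.builder().appName("file-index-sketch").getOrCreate()
val rootPaths = Seq(new Path("/tmp/some-table"))

// Before this patch: new ListingFileCatalog(spark, rootPaths, Map.empty, None)
// After this patch the same call site reads:
val fileIndex = new InMemoryFileIndex(spark, rootPaths, Map.empty[String, String], None)

// The FileIndex contract: enumerate the files (and partitions) a relation should scan.
fileIndex.allFiles().foreach(status => println(status.getPath))
```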
--- .../spark/metrics/source/StaticSources.scala | 2 +- .../spark/sql/execution/CacheManager.scala | 2 +- ...leCatalog.scala => CatalogFileIndex.scala} | 24 ++++++------- .../execution/datasources/DataSource.scala | 10 +++--- .../{FileCatalog.scala => FileIndex.scala} | 2 +- .../datasources/HadoopFsRelation.scala | 4 +-- ...eCatalog.scala => InMemoryFileIndex.scala} | 8 ++--- ...scala => PartitioningAwareFileIndex.scala} | 16 ++++----- .../PruneFileSourcePartitions.scala | 6 ++-- .../streaming/CompactibleFileStreamLog.scala | 4 +-- .../streaming/FileStreamSource.scala | 4 +-- .../streaming/MetadataLogFileCatalog.scala | 6 ++-- .../datasources/FileCatalogSuite.scala | 36 +++++++++---------- .../datasources/FileSourceStrategySuite.scala | 2 +- .../ParquetPartitionDiscoverySuite.scala | 2 +- .../sql/streaming/FileStreamSinkSuite.scala | 6 ++-- .../sql/streaming/FileStreamSourceSuite.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 4 +-- .../spark/sql/hive/CachedTableSuite.scala | 10 +++--- .../hive/PartitionedTablePerfStatsSuite.scala | 2 +- .../PruneFileSourcePartitionsSuite.scala | 6 ++-- 21 files changed, 79 insertions(+), 79 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/{TableFileCatalog.scala => CatalogFileIndex.scala} (83%) rename sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/{FileCatalog.scala => FileIndex.scala} (99%) rename sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/{ListingFileCatalog.scala => InMemoryFileIndex.scala} (92%) rename sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/{PartitioningAwareFileCatalog.scala => PartitioningAwareFileIndex.scala} (96%) diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala index b54885b7ff8b0..3f7cfd9d2c11f 100644 --- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala +++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala @@ -76,7 +76,7 @@ object HiveCatalogMetrics extends Source { val METRIC_PARTITIONS_FETCHED = metricRegistry.counter(MetricRegistry.name("partitionsFetched")) /** - * Tracks the total number of files discovered off of the filesystem by ListingFileCatalog. + * Tracks the total number of files discovered off of the filesystem by InMemoryFileIndex. */ val METRIC_FILES_DISCOVERED = metricRegistry.counter(MetricRegistry.name("filesDiscovered")) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index fb72c679e3628..526623a36d2a1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -177,7 +177,7 @@ class CacheManager extends Logging { /** * Traverses a given `plan` and searches for the occurrences of `qualifiedPath` in the - * [[org.apache.spark.sql.execution.datasources.FileCatalog]] of any [[HadoopFsRelation]] nodes + * [[org.apache.spark.sql.execution.datasources.FileIndex]] of any [[HadoopFsRelation]] nodes * in the plan. If found, we refresh the metadata and return true. Otherwise, this method returns * false. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala similarity index 83% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala index b459df5734d43..092aabc89a36c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/TableFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala @@ -26,23 +26,23 @@ import org.apache.spark.sql.types.StructType /** - * A [[FileCatalog]] for a metastore catalog table. + * A [[FileIndex]] for a metastore catalog table. * * @param sparkSession a [[SparkSession]] * @param table the metadata of the table * @param sizeInBytes the table's data size in bytes */ -class TableFileCatalog( +class CatalogFileIndex( sparkSession: SparkSession, val table: CatalogTable, - override val sizeInBytes: Long) extends FileCatalog { + override val sizeInBytes: Long) extends FileIndex { protected val hadoopConf = sparkSession.sessionState.newHadoopConf private val fileStatusCache = FileStatusCache.newCache(sparkSession) assert(table.identifier.database.isDefined, - "The table identifier must be qualified in TableFileCatalog") + "The table identifier must be qualified in CatalogFileIndex") private val baseLocation = table.storage.locationUri @@ -57,12 +57,12 @@ class TableFileCatalog( override def refresh(): Unit = fileStatusCache.invalidateAll() /** - * Returns a [[ListingFileCatalog]] for this table restricted to the subset of partitions + * Returns a [[InMemoryFileIndex]] for this table restricted to the subset of partitions * specified by the given partition-pruning filters. * * @param filters partition-pruning filters */ - def filterPartitions(filters: Seq[Expression]): ListingFileCatalog = { + def filterPartitions(filters: Seq[Expression]): InMemoryFileIndex = { if (table.partitionColumnNames.nonEmpty) { val selectedPartitions = sparkSession.sessionState.catalog.listPartitionsByFilter( table.identifier, filters) @@ -70,20 +70,20 @@ class TableFileCatalog( PartitionPath(p.toRow(partitionSchema), p.storage.locationUri.get) } val partitionSpec = PartitionSpec(partitionSchema, partitions) - new PrunedTableFileCatalog( + new PrunedInMemoryFileIndex( sparkSession, new Path(baseLocation.get), fileStatusCache, partitionSpec) } else { - new ListingFileCatalog(sparkSession, rootPaths, table.storage.properties, None) + new InMemoryFileIndex(sparkSession, rootPaths, table.storage.properties, None) } } override def inputFiles: Array[String] = filterPartitions(Nil).inputFiles - // `TableFileCatalog` may be a member of `HadoopFsRelation`, `HadoopFsRelation` may be a member + // `CatalogFileIndex` may be a member of `HadoopFsRelation`, `HadoopFsRelation` may be a member // of `LogicalRelation`, and `LogicalRelation` may be used as the cache key. So we need to // implement `equals` and `hashCode` here, to make it work with cache lookup. 
override def equals(o: Any): Boolean = o match { - case other: TableFileCatalog => this.table.identifier == other.table.identifier + case other: CatalogFileIndex => this.table.identifier == other.table.identifier case _ => false } @@ -97,12 +97,12 @@ class TableFileCatalog( * @param tableBasePath The default base path of the Hive metastore table * @param partitionSpec The partition specifications from Hive metastore */ -private class PrunedTableFileCatalog( +private class PrunedInMemoryFileIndex( sparkSession: SparkSession, tableBasePath: Path, fileStatusCache: FileStatusCache, override val partitionSpec: PartitionSpec) - extends ListingFileCatalog( + extends InMemoryFileIndex( sparkSession, partitionSpec.partitions.map(_.path), Map.empty, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 5b8f05a396241..996109865fdc7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -202,7 +202,7 @@ case class DataSource( val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) SparkHadoopUtil.get.globPathIfNecessary(qualified) }.toArray - val fileCatalog = new ListingFileCatalog(sparkSession, globbedPaths, options, None) + val fileCatalog = new InMemoryFileIndex(sparkSession, globbedPaths, options, None) val partitionSchema = fileCatalog.partitionSpec().partitionColumns val inferred = format.inferSchema( sparkSession, @@ -364,7 +364,7 @@ case class DataSource( case (format: FileFormat, _) if hasMetadata(caseInsensitiveOptions.get("path").toSeq ++ paths) => val basePath = new Path((caseInsensitiveOptions.get("path").toSeq ++ paths).head) - val fileCatalog = new MetadataLogFileCatalog(sparkSession, basePath) + val fileCatalog = new MetadataLogFileIndex(sparkSession, basePath) val dataSchema = userSpecifiedSchema.orElse { format.inferSchema( sparkSession, @@ -417,12 +417,12 @@ case class DataSource( val fileCatalog = if (sparkSession.sqlContext.conf.manageFilesourcePartitions && catalogTable.isDefined && catalogTable.get.partitionProviderIsHive) { - new TableFileCatalog( + new CatalogFileIndex( sparkSession, catalogTable.get, catalogTable.get.stats.map(_.sizeInBytes.toLong).getOrElse(0L)) } else { - new ListingFileCatalog( + new InMemoryFileIndex( sparkSession, globbedPaths, options, partitionSchema) } @@ -433,7 +433,7 @@ case class DataSource( format.inferSchema( sparkSession, caseInsensitiveOptions, - fileCatalog.asInstanceOf[ListingFileCatalog].allFiles()) + fileCatalog.asInstanceOf[InMemoryFileIndex].allFiles()) }.getOrElse { throw new AnalysisException( s"Unable to infer schema for $format at ${allPaths.take(2).mkString(",")}. 
" + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileIndex.scala similarity index 99% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileIndex.scala index dba64624c34b3..277223d52ec52 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileIndex.scala @@ -33,7 +33,7 @@ case class PartitionDirectory(values: InternalRow, files: Seq[FileStatus]) * An interface for objects capable of enumerating the root paths of a relation as well as the * partitions of a relation subject to some pruning expressions. */ -trait FileCatalog { +trait FileIndex { /** * Returns the list of root input paths from which the catalog will get files. There may be a diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala index afad8898089bd..014abd454f5c0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.types.StructType * Acts as a container for all of the metadata required to read from a datasource. All discovery, * resolution and merging logic for schemas and partitions has been removed. * - * @param location A [[FileCatalog]] that can enumerate the locations of all the files that + * @param location A [[FileIndex]] that can enumerate the locations of all the files that * comprise this relation. * @param partitionSchema The schema of the columns (if any) that are used to partition the relation * @param dataSchema The schema of any remaining columns. Note that if any partition columns are @@ -38,7 +38,7 @@ import org.apache.spark.sql.types.StructType * @param options Configuration used when reading / writing data. */ case class HadoopFsRelation( - location: FileCatalog, + location: FileIndex, partitionSchema: StructType, dataSchema: StructType, bucketSpec: Option[BucketSpec], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala similarity index 92% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index d9d588388aaf1..7531f0ae02e75 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.types.StructType /** - * A [[FileCatalog]] that generates the list of files to process by recursively listing all the + * A [[FileIndex]] that generates the list of files to process by recursively listing all the * files present in `paths`. 
* * @param rootPaths the list of root table paths to scan @@ -34,13 +34,13 @@ import org.apache.spark.sql.types.StructType * @param partitionSchema an optional partition schema that will be use to provide types for the * discovered partitions */ -class ListingFileCatalog( +class InMemoryFileIndex( sparkSession: SparkSession, override val rootPaths: Seq[Path], parameters: Map[String, String], partitionSchema: Option[StructType], fileStatusCache: FileStatusCache = NoopCache) - extends PartitioningAwareFileCatalog( + extends PartitioningAwareFileIndex( sparkSession, parameters, partitionSchema, fileStatusCache) { @volatile private var cachedLeafFiles: mutable.LinkedHashMap[Path, FileStatus] = _ @@ -79,7 +79,7 @@ class ListingFileCatalog( } override def equals(other: Any): Boolean = other match { - case hdfs: ListingFileCatalog => rootPaths.toSet == hdfs.rootPaths.toSet + case hdfs: InMemoryFileIndex => rootPaths.toSet == hdfs.rootPaths.toSet case _ => false } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala similarity index 96% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala index cc4049e925905..a8a722dd3c620 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala @@ -34,19 +34,19 @@ import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.util.SerializableConfiguration /** - * An abstract class that represents [[FileCatalog]]s that are aware of partitioned tables. + * An abstract class that represents [[FileIndex]]s that are aware of partitioned tables. * It provides the necessary methods to parse partition data based on a set of files. * * @param parameters as set of options to control partition discovery * @param userPartitionSchema an optional partition schema that will be use to provide types for * the discovered partitions */ -abstract class PartitioningAwareFileCatalog( +abstract class PartitioningAwareFileIndex( sparkSession: SparkSession, parameters: Map[String, String], userPartitionSchema: Option[StructType], - fileStatusCache: FileStatusCache = NoopCache) extends FileCatalog with Logging { - import PartitioningAwareFileCatalog.BASE_PATH_PARAM + fileStatusCache: FileStatusCache = NoopCache) extends FileIndex with Logging { + import PartitioningAwareFileIndex.BASE_PATH_PARAM /** Returns the specification of the partitions inferred from the data. 
*/ def partitionSpec(): PartitionSpec @@ -253,9 +253,9 @@ abstract class PartitioningAwareFileCatalog( } val discovered = if (pathsToFetch.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { - PartitioningAwareFileCatalog.listLeafFilesInParallel(pathsToFetch, hadoopConf, sparkSession) + PartitioningAwareFileIndex.listLeafFilesInParallel(pathsToFetch, hadoopConf, sparkSession) } else { - PartitioningAwareFileCatalog.listLeafFilesInSerial(pathsToFetch, hadoopConf) + PartitioningAwareFileIndex.listLeafFilesInSerial(pathsToFetch, hadoopConf) } discovered.foreach { case (path, leafFiles) => HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size) @@ -266,7 +266,7 @@ abstract class PartitioningAwareFileCatalog( } } -object PartitioningAwareFileCatalog extends Logging { +object PartitioningAwareFileIndex extends Logging { val BASE_PATH_PARAM = "basePath" /** A serializable variant of HDFS's BlockLocation. */ @@ -383,7 +383,7 @@ object PartitioningAwareFileCatalog extends Logging { if (shouldFilterOut(name)) { Seq.empty[FileStatus] } else { - // [SPARK-17599] Prevent ListingFileCatalog from failing if path doesn't exist + // [SPARK-17599] Prevent InMemoryFileIndex from failing if path doesn't exist // Note that statuses only include FileStatus for the files and dirs directly under path, // and does not include anything else recursively. val statuses = try fs.listStatus(path) catch { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala index 8689017c3ed75..8566a8061034b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala @@ -28,7 +28,7 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] { logicalRelation @ LogicalRelation(fsRelation @ HadoopFsRelation( - tableFileCatalog: TableFileCatalog, + catalogFileIndex: CatalogFileIndex, partitionSchema, _, _, @@ -56,9 +56,9 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] { ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionSet))) if (partitionKeyFilters.nonEmpty) { - val prunedFileCatalog = tableFileCatalog.filterPartitions(partitionKeyFilters.toSeq) + val prunedFileIndex = catalogFileIndex.filterPartitions(partitionKeyFilters.toSeq) val prunedFsRelation = - fsRelation.copy(location = prunedFileCatalog)(sparkSession) + fsRelation.copy(location = prunedFileIndex)(sparkSession) val prunedLogicalRelation = logicalRelation.copy( relation = prunedFsRelation, expectedOutputAttributes = Some(logicalRelation.output)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala index c14feea91ed7d..b26edeeb04009 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala @@ -146,7 +146,7 @@ abstract class CompactibleFileStreamLog[T: ClassTag]( */ def allFiles(): Array[T] = { var latestId = getLatest().map(_._1).getOrElse(-1L) - // There is a race condition when `FileStreamSink` is deleting old files and `StreamFileCatalog` + // There is a race condition when 
`FileStreamSink` is deleting old files and `StreamFileIndex` // is calling this method. This loop will retry the reading to deal with the // race condition. while (true) { @@ -158,7 +158,7 @@ abstract class CompactibleFileStreamLog[T: ClassTag]( } catch { case e: IOException => // Another process using `CompactibleFileStreamLog` may delete the batch files when - // `StreamFileCatalog` are reading. However, it only happens when a compaction is + // `StreamFileIndex` are reading. However, it only happens when a compaction is // deleting old files. If so, let's try the next compaction batch and we should find it. // Otherwise, this is a real IO issue and we should throw it. latestId = nextCompactionBatchId(latestId, compactInterval) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala index a392b82999021..680df01acc1a6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala @@ -24,7 +24,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} -import org.apache.spark.sql.execution.datasources.{DataSource, ListingFileCatalog, LogicalRelation} +import org.apache.spark.sql.execution.datasources.{DataSource, InMemoryFileIndex, LogicalRelation} import org.apache.spark.sql.types.StructType /** @@ -156,7 +156,7 @@ class FileStreamSource( private def fetchAllFiles(): Seq[(String, Long)] = { val startTime = System.nanoTime val globbedPaths = SparkHadoopUtil.get.globPathIfNecessary(qualifiedBasePath) - val catalog = new ListingFileCatalog(sparkSession, globbedPaths, options, Some(new StructType)) + val catalog = new InMemoryFileIndex(sparkSession, globbedPaths, options, Some(new StructType)) val files = catalog.allFiles().sortBy(_.getModificationTime).map { status => (status.getPath.toUri.toString, status.getModificationTime) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala index 82b67cb1ca6ee..aeaa134736937 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLogFileCatalog.scala @@ -26,11 +26,11 @@ import org.apache.spark.sql.execution.datasources._ /** - * A [[FileCatalog]] that generates the list of files to processing by reading them from the + * A [[FileIndex]] that generates the list of files to processing by reading them from the * metadata log files generated by the [[FileStreamSink]]. 
*/ -class MetadataLogFileCatalog(sparkSession: SparkSession, path: Path) - extends PartitioningAwareFileCatalog(sparkSession, Map.empty, None) { +class MetadataLogFileIndex(sparkSession: SparkSession, path: Path) + extends PartitioningAwareFileIndex(sparkSession, Map.empty, None) { private val metadataDirectory = new Path(path, FileStreamSink.metadataDir) logInfo(s"Reading streaming file log from $metadataDirectory") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala index 9c43169cbf898..56df1face6364 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala @@ -28,15 +28,15 @@ import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem} import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.test.SharedSQLContext -class FileCatalogSuite extends SharedSQLContext { +class FileIndexSuite extends SharedSQLContext { - test("ListingFileCatalog: leaf files are qualified paths") { + test("InMemoryFileIndex: leaf files are qualified paths") { withTempDir { dir => val file = new File(dir, "text.txt") stringToFile(file, "text") val path = new Path(file.getCanonicalPath) - val catalog = new ListingFileCatalog(spark, Seq(path), Map.empty, None) { + val catalog = new InMemoryFileIndex(spark, Seq(path), Map.empty, None) { def leafFilePaths: Seq[Path] = leafFiles.keys.toSeq def leafDirPaths: Seq[Path] = leafDirToChildrenFiles.keys.toSeq } @@ -45,7 +45,7 @@ class FileCatalogSuite extends SharedSQLContext { } } - test("ListingFileCatalog: input paths are converted to qualified paths") { + test("InMemoryFileIndex: input paths are converted to qualified paths") { withTempDir { dir => val file = new File(dir, "text.txt") stringToFile(file, "text") @@ -59,42 +59,42 @@ class FileCatalogSuite extends SharedSQLContext { val qualifiedFilePath = fs.makeQualified(new Path(file.getCanonicalPath)) require(qualifiedFilePath.toString.startsWith("file:")) - val catalog1 = new ListingFileCatalog( + val catalog1 = new InMemoryFileIndex( spark, Seq(unqualifiedDirPath), Map.empty, None) assert(catalog1.allFiles.map(_.getPath) === Seq(qualifiedFilePath)) - val catalog2 = new ListingFileCatalog( + val catalog2 = new InMemoryFileIndex( spark, Seq(unqualifiedFilePath), Map.empty, None) assert(catalog2.allFiles.map(_.getPath) === Seq(qualifiedFilePath)) } } - test("ListingFileCatalog: folders that don't exist don't throw exceptions") { + test("InMemoryFileIndex: folders that don't exist don't throw exceptions") { withTempDir { dir => val deletedFolder = new File(dir, "deleted") assert(!deletedFolder.exists()) - val catalog1 = new ListingFileCatalog( + val catalog1 = new InMemoryFileIndex( spark, Seq(new Path(deletedFolder.getCanonicalPath)), Map.empty, None) // doesn't throw an exception assert(catalog1.listLeafFiles(catalog1.rootPaths).isEmpty) } } - test("PartitioningAwareFileCatalog - file filtering") { - assert(!PartitioningAwareFileCatalog.shouldFilterOut("abcd")) - assert(PartitioningAwareFileCatalog.shouldFilterOut(".ab")) - assert(PartitioningAwareFileCatalog.shouldFilterOut("_cd")) - assert(!PartitioningAwareFileCatalog.shouldFilterOut("_metadata")) - assert(!PartitioningAwareFileCatalog.shouldFilterOut("_common_metadata")) - assert(PartitioningAwareFileCatalog.shouldFilterOut("_ab_metadata")) - 
assert(PartitioningAwareFileCatalog.shouldFilterOut("_cd_common_metadata")) + test("PartitioningAwareFileIndex - file filtering") { + assert(!PartitioningAwareFileIndex.shouldFilterOut("abcd")) + assert(PartitioningAwareFileIndex.shouldFilterOut(".ab")) + assert(PartitioningAwareFileIndex.shouldFilterOut("_cd")) + assert(!PartitioningAwareFileIndex.shouldFilterOut("_metadata")) + assert(!PartitioningAwareFileIndex.shouldFilterOut("_common_metadata")) + assert(PartitioningAwareFileIndex.shouldFilterOut("_ab_metadata")) + assert(PartitioningAwareFileIndex.shouldFilterOut("_cd_common_metadata")) } - test("SPARK-17613 - PartitioningAwareFileCatalog: base path w/o '/' at end") { + test("SPARK-17613 - PartitioningAwareFileIndex: base path w/o '/' at end") { class MockCatalog( override val rootPaths: Seq[Path]) - extends PartitioningAwareFileCatalog(spark, Map.empty, None) { + extends PartitioningAwareFileIndex(spark, Map.empty, None) { override def refresh(): Unit = {} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala index c32254d9dfde2..d900ce7bb2370 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala @@ -393,7 +393,7 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi util.stringToFile(file, fileName) } - val fileCatalog = new ListingFileCatalog( + val fileCatalog = new InMemoryFileIndex( sparkSession = spark, rootPaths = Seq(new Path(tempDir)), parameters = Map.empty[String, String], diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index f2a209e91962d..120a3a2ef33aa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -634,7 +634,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha val queryExecution = spark.read.parquet(dir.getCanonicalPath).queryExecution queryExecution.analyzed.collectFirst { case LogicalRelation( - HadoopFsRelation(location: PartitioningAwareFileCatalog, _, _, _, _, _), _, _) => + HadoopFsRelation(location: PartitioningAwareFileIndex, _, _, _, _, _), _, _) => assert(location.partitionSpec() === PartitionSpec.emptySpec) }.getOrElse { fail(s"Expecting a matching HadoopFsRelation, but got:\n$queryExecution") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index 19c89f5c4100c..18b42a81a098c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.execution.DataSourceScanExec import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.execution.streaming.{FileStreamSinkWriter, MemoryStream, MetadataLogFileCatalog} 
+import org.apache.spark.sql.execution.streaming.{FileStreamSinkWriter, MemoryStream, MetadataLogFileIndex} import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{IntegerType, StructField, StructType} @@ -179,14 +179,14 @@ class FileStreamSinkSuite extends StreamTest { .add(StructField("id", IntegerType)) assert(outputDf.schema === expectedSchema) - // Verify that MetadataLogFileCatalog is being used and the correct partitioning schema has + // Verify that MetadataLogFileIndex is being used and the correct partitioning schema has // been inferred val hadoopdFsRelations = outputDf.queryExecution.analyzed.collect { case LogicalRelation(baseRelation, _, _) if baseRelation.isInstanceOf[HadoopFsRelation] => baseRelation.asInstanceOf[HadoopFsRelation] } assert(hadoopdFsRelations.size === 1) - assert(hadoopdFsRelations.head.location.isInstanceOf[MetadataLogFileCatalog]) + assert(hadoopdFsRelations.head.location.isInstanceOf[MetadataLogFileIndex]) assert(hadoopdFsRelations.head.partitionSchema.exists(_.name == "id")) assert(hadoopdFsRelations.head.dataSchema.exists(_.name == "value")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index b9e9da9a1ec53..47018b3a3c495 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -879,7 +879,7 @@ class FileStreamSourceSuite extends FileStreamSourceTest { val numFiles = 10000 // This is to avoid running a spark job to list of files in parallel - // by the ListingFileCatalog. + // by the InMemoryFileIndex. spark.sessionState.conf.setConf(SQLConf.PARALLEL_PARTITION_DISCOVERY_THRESHOLD, numFiles * 2) withTempDirs { case (root, tmp) => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index d1de863ce3623..624ab747e442f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -200,7 +200,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log val rootPaths: Seq[Path] = if (lazyPruningEnabled) { Seq(metastoreRelation.hiveQlTable.getDataLocation) } else { - // By convention (for example, see TableFileCatalog), the definition of a + // By convention (for example, see CatalogFileIndex), the definition of a // partitioned table's paths depends on whether that table has any actual partitions. // Partitioned tables without partitions use the location of the table's base path. 
// Partitioned tables with partitions use the locations of those partitions' data @@ -227,7 +227,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log val logicalRelation = cached.getOrElse { val sizeInBytes = metastoreRelation.statistics.sizeInBytes.toLong val fileCatalog = { - val catalog = new TableFileCatalog( + val catalog = new CatalogFileIndex( sparkSession, metastoreRelation.catalogTable, sizeInBytes) if (lazyPruningEnabled) { catalog diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index ecdf4f14b3985..fc35304c80ecc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.{AnalysisException, Dataset, QueryTest, SaveMode} import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, TableFileCatalog} +import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils @@ -321,17 +321,17 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto sql("DROP TABLE cachedTable") } - test("cache a table using TableFileCatalog") { + test("cache a table using CatalogFileIndex") { withTable("test") { sql("CREATE TABLE test(i int) PARTITIONED BY (p int) STORED AS parquet") val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test") - val tableFileCatalog = new TableFileCatalog(spark, tableMeta, 0) + val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0) val dataSchema = StructType(tableMeta.schema.filterNot { f => tableMeta.partitionColumnNames.contains(f.name) }) val relation = HadoopFsRelation( - location = tableFileCatalog, + location = catalogFileIndex, partitionSchema = tableMeta.partitionSchema, dataSchema = dataSchema, bucketSpec = None, @@ -343,7 +343,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto assert(spark.sharedState.cacheManager.lookupCachedData(plan).isDefined) - val sameCatalog = new TableFileCatalog(spark, tableMeta, 0) + val sameCatalog = new CatalogFileIndex(spark, tableMeta, 0) val sameRelation = HadoopFsRelation( location = sameCatalog, partitionSchema = tableMeta.partitionSchema, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala index 476383a5b33a5..d8e31c4e39a5c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala @@ -256,7 +256,7 @@ class PartitionedTablePerfStatsSuite // of doing plan cache validation based on the entire partition set. 
HiveCatalogMetrics.reset() assert(spark.sql("select * from test where partCol1 = 999").count() == 0) - // 5 from table resolution, another 5 from ListingFileCatalog + // 5 from table resolution, another 5 from InMemoryFileIndex assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount() == 10) assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 5) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala index 59639aacf3a3f..cdbc26cd5c576 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project} import org.apache.spark.sql.catalyst.rules.RuleExecutor -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions, TableFileCatalog} +import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions} import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils @@ -45,13 +45,13 @@ class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with Te |LOCATION '${dir.getAbsolutePath}'""".stripMargin) val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test") - val tableFileCatalog = new TableFileCatalog(spark, tableMeta, 0) + val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0) val dataSchema = StructType(tableMeta.schema.filterNot { f => tableMeta.partitionColumnNames.contains(f.name) }) val relation = HadoopFsRelation( - location = tableFileCatalog, + location = catalogFileIndex, partitionSchema = tableMeta.partitionSchema, dataSchema = dataSchema, bucketSpec = None, From 8ae2da0b2551011e2f6cf02907a1e20c138a4b2f Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sun, 30 Oct 2016 23:24:30 +0100 Subject: [PATCH 155/162] [SPARK-18106][SQL] ANALYZE TABLE should raise a ParseException for invalid option ## What changes were proposed in this pull request? Currently, `ANALYZE TABLE` command accepts `identifier` for option `NOSCAN`. This PR raises a ParseException for unknown option. **Before** ```scala scala> sql("create table test(a int)") res0: org.apache.spark.sql.DataFrame = [] scala> sql("analyze table test compute statistics blah") res1: org.apache.spark.sql.DataFrame = [] ``` **After** ```scala scala> sql("create table test(a int)") res0: org.apache.spark.sql.DataFrame = [] scala> sql("analyze table test compute statistics blah") org.apache.spark.sql.catalyst.parser.ParseException: Expected `NOSCAN` instead of `blah`(line 1, pos 0) ``` ## How was this patch tested? Pass the Jenkins test with a new test case. Author: Dongjoon Hyun Closes #15640 from dongjoon-hyun/SPARK-18106. 
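A hedged end-to-end sketch of the new parser behavior (it assumes a Hive-enabled session like the shell transcript above; the table name is a placeholder):

```scala
import org.apache.spark.sql.catalyst.parser.ParseException

// Both statements parse: a full-scan ANALYZE and the metadata-only NOSCAN variant.
spark.sql("CREATE TABLE sketch_t(a INT)")
spark.sql("ANALYZE TABLE sketch_t COMPUTE STATISTICS")
spark.sql("ANALYZE TABLE sketch_t COMPUTE STATISTICS NOSCAN")

// Any other trailing identifier is now rejected at parse time instead of being
// silently accepted as a full-scan ANALYZE.
try {
  spark.sql("ANALYZE TABLE sketch_t COMPUTE STATISTICS blah")
} catch {
  case e: ParseException => println(e.getMessage)  // Expected `NOSCAN` instead of `blah`
}
```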
--- .../spark/sql/execution/SparkSqlParser.scala | 10 +++++++--- .../sql/execution/SparkSqlParserSuite.scala | 18 ++++++++++++++++-- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 1cc166d5a7a9d..fe183d0097d03 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -98,9 +98,13 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { * }}} */ override def visitAnalyze(ctx: AnalyzeContext): LogicalPlan = withOrigin(ctx) { - if (ctx.partitionSpec == null && - ctx.identifier != null && - ctx.identifier.getText.toLowerCase == "noscan") { + if (ctx.partitionSpec != null) { + logWarning(s"Partition specification is ignored: ${ctx.partitionSpec.getText}") + } + if (ctx.identifier != null) { + if (ctx.identifier.getText.toLowerCase != "noscan") { + throw new ParseException(s"Expected `NOSCAN` instead of `${ctx.identifier.getText}`", ctx) + } AnalyzeTableCommand(visitTableIdentifier(ctx.tableIdentifier)) } else if (ctx.identifierSeq() == null) { AnalyzeTableCommand(visitTableIdentifier(ctx.tableIdentifier), noscan = false) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala index 679150e9ae4c0..797fe9ffa8be1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala @@ -23,8 +23,8 @@ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.command.{DescribeFunctionCommand, DescribeTableCommand, - ShowFunctionsCommand} +import org.apache.spark.sql.execution.command.{AnalyzeTableCommand, DescribeFunctionCommand, + DescribeTableCommand, ShowFunctionsCommand} import org.apache.spark.sql.execution.datasources.{CreateTable, CreateTempViewUsing} import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} @@ -220,4 +220,18 @@ class SparkSqlParserSuite extends PlanTest { intercept("explain describe tables x", "Unsupported SQL statement") } + + test("SPARK-18106 analyze table") { + assertEqual("analyze table t compute statistics", + AnalyzeTableCommand(TableIdentifier("t"), noscan = false)) + assertEqual("analyze table t compute statistics noscan", + AnalyzeTableCommand(TableIdentifier("t"), noscan = true)) + assertEqual("analyze table t partition (a) compute statistics noscan", + AnalyzeTableCommand(TableIdentifier("t"), noscan = true)) + + intercept("analyze table t compute statistics xxxx", + "Expected `NOSCAN` instead of `xxxx`") + intercept("analyze table t partition (a) compute statistics xxxx", + "Expected `NOSCAN` instead of `xxxx`") + } } From 2881a2d1d1a650a91df2c6a01275eba14a43b42a Mon Sep 17 00:00:00 2001 From: Hossein Date: Sun, 30 Oct 2016 16:17:23 -0700 Subject: [PATCH 156/162] [SPARK-17919] Make timeout to RBackend configurable in SparkR ## What changes were proposed in this pull request? 
This patch makes RBackend connection timeout configurable by user. ## How was this patch tested? N/A Author: Hossein Closes #15471 from falaki/SPARK-17919. --- R/pkg/R/backend.R | 20 ++++++++-- R/pkg/R/client.R | 2 +- R/pkg/R/sparkR.R | 8 +++- R/pkg/inst/worker/daemon.R | 4 +- R/pkg/inst/worker/worker.R | 7 +++- .../org/apache/spark/api/r/RBackend.scala | 15 ++++++- .../apache/spark/api/r/RBackendHandler.scala | 39 +++++++++++++++++-- .../org/apache/spark/api/r/RRunner.scala | 3 ++ .../apache/spark/api/r/SparkRDefaults.scala | 30 ++++++++++++++ .../org/apache/spark/deploy/RRunner.scala | 7 +++- docs/configuration.md | 15 +++++++ 11 files changed, 134 insertions(+), 16 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/api/r/SparkRDefaults.scala diff --git a/R/pkg/R/backend.R b/R/pkg/R/backend.R index 03e70bb2cb82e..0a789e6c379d6 100644 --- a/R/pkg/R/backend.R +++ b/R/pkg/R/backend.R @@ -108,13 +108,27 @@ invokeJava <- function(isStatic, objId, methodName, ...) { conn <- get(".sparkRCon", .sparkREnv) writeBin(requestMessage, conn) - # TODO: check the status code to output error information returnStatus <- readInt(conn) + handleErrors(returnStatus, conn) + + # Backend will send +1 as keep alive value to prevent various connection timeouts + # on very long running jobs. See spark.r.heartBeatInterval + while (returnStatus == 1) { + returnStatus <- readInt(conn) + handleErrors(returnStatus, conn) + } + + readObject(conn) +} + +# Helper function to check for returned errors and print appropriate error message to user +handleErrors <- function(returnStatus, conn) { if (length(returnStatus) == 0) { stop("No status is returned. Java SparkR backend might have failed.") } - if (returnStatus != 0) { + + # 0 is success and +1 is reserved for heartbeats. Other negative values indicate errors. 
+ if (returnStatus < 0) { stop(readString(conn)) } - readObject(conn) } diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 2d341d836c133..9d82814211bc5 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -19,7 +19,7 @@ # Creates a SparkR client connection object # if one doesn't already exist -connectBackend <- function(hostname, port, timeout = 6000) { +connectBackend <- function(hostname, port, timeout) { if (exists(".sparkRcon", envir = .sparkREnv)) { if (isOpen(.sparkREnv[[".sparkRCon"]])) { cat("SparkRBackend client connection already exists\n") diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index cc6d591bb2f4c..6b4a2f2fdc85c 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -154,6 +154,7 @@ sparkR.sparkContext <- function( packages <- processSparkPackages(sparkPackages) existingPort <- Sys.getenv("EXISTING_SPARKR_BACKEND_PORT", "") + connectionTimeout <- as.numeric(Sys.getenv("SPARKR_BACKEND_CONNECTION_TIMEOUT", "6000")) if (existingPort != "") { if (length(packages) != 0) { warning(paste("sparkPackages has no effect when using spark-submit or sparkR shell", @@ -187,6 +188,7 @@ sparkR.sparkContext <- function( backendPort <- readInt(f) monitorPort <- readInt(f) rLibPath <- readString(f) + connectionTimeout <- readInt(f) close(f) file.remove(path) if (length(backendPort) == 0 || backendPort == 0 || @@ -194,7 +196,9 @@ sparkR.sparkContext <- function( length(rLibPath) != 1) { stop("JVM failed to launch") } - assign(".monitorConn", socketConnection(port = monitorPort), envir = .sparkREnv) + assign(".monitorConn", + socketConnection(port = monitorPort, timeout = connectionTimeout), + envir = .sparkREnv) assign(".backendLaunched", 1, envir = .sparkREnv) if (rLibPath != "") { assign(".libPath", rLibPath, envir = .sparkREnv) @@ -204,7 +208,7 @@ sparkR.sparkContext <- function( .sparkREnv$backendPort <- backendPort tryCatch({ - connectBackend("localhost", backendPort) + connectBackend("localhost", backendPort, timeout = connectionTimeout) }, error = function(err) { stop("Failed to connect JVM\n") diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index b92e6be995ca9..3a318b71ea06d 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -18,6 +18,7 @@ # Worker daemon rLibDir <- Sys.getenv("SPARKR_RLIBDIR") +connectionTimeout <- as.integer(Sys.getenv("SPARKR_BACKEND_CONNECTION_TIMEOUT", "6000")) dirs <- strsplit(rLibDir, ",")[[1]] script <- file.path(dirs[[1]], "SparkR", "worker", "worker.R") @@ -26,7 +27,8 @@ script <- file.path(dirs[[1]], "SparkR", "worker", "worker.R") suppressPackageStartupMessages(library(SparkR)) port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT")) -inputCon <- socketConnection(port = port, open = "rb", blocking = TRUE, timeout = 3600) +inputCon <- socketConnection( + port = port, open = "rb", blocking = TRUE, timeout = connectionTimeout) while (TRUE) { ready <- socketSelect(list(inputCon)) diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R index cfe41ded200c2..03e7450147865 100644 --- a/R/pkg/inst/worker/worker.R +++ b/R/pkg/inst/worker/worker.R @@ -90,6 +90,7 @@ bootTime <- currentTimeSecs() bootElap <- elapsedSecs() rLibDir <- Sys.getenv("SPARKR_RLIBDIR") +connectionTimeout <- as.integer(Sys.getenv("SPARKR_BACKEND_CONNECTION_TIMEOUT", "6000")) dirs <- strsplit(rLibDir, ",")[[1]] # Set libPaths to include SparkR package as loadNamespace needs this # TODO: Figure out if we can avoid this by not loading any objects that require @@ -98,8 +99,10 @@ dirs <- strsplit(rLibDir, ",")[[1]] 
suppressPackageStartupMessages(library(SparkR)) port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT")) -inputCon <- socketConnection(port = port, blocking = TRUE, open = "rb") -outputCon <- socketConnection(port = port, blocking = TRUE, open = "wb") +inputCon <- socketConnection( + port = port, blocking = TRUE, open = "rb", timeout = connectionTimeout) +outputCon <- socketConnection( + port = port, blocking = TRUE, open = "wb", timeout = connectionTimeout) # read the index of the current partition inside the RDD partition <- SparkR:::readInt(inputCon) diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackend.scala b/core/src/main/scala/org/apache/spark/api/r/RBackend.scala index 41d0a85ee3ad4..550746c552d02 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RBackend.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RBackend.scala @@ -22,12 +22,13 @@ import java.net.{InetAddress, InetSocketAddress, ServerSocket} import java.util.concurrent.TimeUnit import io.netty.bootstrap.ServerBootstrap -import io.netty.channel.{ChannelFuture, ChannelInitializer, EventLoopGroup} +import io.netty.channel.{ChannelFuture, ChannelInitializer, ChannelOption, EventLoopGroup} import io.netty.channel.nio.NioEventLoopGroup import io.netty.channel.socket.SocketChannel import io.netty.channel.socket.nio.NioServerSocketChannel import io.netty.handler.codec.LengthFieldBasedFrameDecoder import io.netty.handler.codec.bytes.{ByteArrayDecoder, ByteArrayEncoder} +import io.netty.handler.timeout.ReadTimeoutHandler import org.apache.spark.SparkConf import org.apache.spark.internal.Logging @@ -43,7 +44,10 @@ private[spark] class RBackend { def init(): Int = { val conf = new SparkConf() - bossGroup = new NioEventLoopGroup(conf.getInt("spark.r.numRBackendThreads", 2)) + val backendConnectionTimeout = conf.getInt( + "spark.r.backendConnectionTimeout", SparkRDefaults.DEFAULT_CONNECTION_TIMEOUT) + bossGroup = new NioEventLoopGroup( + conf.getInt("spark.r.numRBackendThreads", SparkRDefaults.DEFAULT_NUM_RBACKEND_THREADS)) val workerGroup = bossGroup val handler = new RBackendHandler(this) @@ -63,6 +67,7 @@ private[spark] class RBackend { // initialBytesToStrip = 4, i.e. strip out the length field itself new LengthFieldBasedFrameDecoder(Integer.MAX_VALUE, 0, 4, 0, 4)) .addLast("decoder", new ByteArrayDecoder()) + .addLast("readTimeoutHandler", new ReadTimeoutHandler(backendConnectionTimeout)) .addLast("handler", handler) } }) @@ -110,6 +115,11 @@ private[spark] object RBackend extends Logging { val boundPort = sparkRBackend.init() val serverSocket = new ServerSocket(0, 1, InetAddress.getByName("localhost")) val listenPort = serverSocket.getLocalPort() + // Connection timeout is set by socket client. 
To make it configurable we will pass the + // timeout value to client inside the temp file + val conf = new SparkConf() + val backendConnectionTimeout = conf.getInt( + "spark.r.backendConnectionTimeout", SparkRDefaults.DEFAULT_CONNECTION_TIMEOUT) // tell the R process via temporary file val path = args(0) @@ -118,6 +128,7 @@ private[spark] object RBackend extends Logging { dos.writeInt(boundPort) dos.writeInt(listenPort) SerDe.writeString(dos, RUtils.rPackages.getOrElse("")) + dos.writeInt(backendConnectionTimeout) dos.close() f.renameTo(new File(path)) diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala index 1422ef888fd4a..9f5afa29d6d22 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala @@ -18,16 +18,19 @@ package org.apache.spark.api.r import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} +import java.util.concurrent.TimeUnit import scala.collection.mutable.HashMap import scala.language.existentials import io.netty.channel.{ChannelHandlerContext, SimpleChannelInboundHandler} import io.netty.channel.ChannelHandler.Sharable +import io.netty.handler.timeout.ReadTimeoutException import org.apache.spark.api.r.SerDe._ import org.apache.spark.internal.Logging -import org.apache.spark.util.Utils +import org.apache.spark.SparkConf +import org.apache.spark.util.{ThreadUtils, Utils} /** * Handler for RBackend @@ -83,7 +86,29 @@ private[r] class RBackendHandler(server: RBackend) writeString(dos, s"Error: unknown method $methodName") } } else { + // To avoid timeouts when reading results in SparkR driver, we will be regularly sending + // heartbeat responses. We use special code +1 to signal the client that backend is + // alive and it should continue blocking for result. + val execService = ThreadUtils.newDaemonSingleThreadScheduledExecutor("SparkRKeepAliveThread") + val pingRunner = new Runnable { + override def run(): Unit = { + val pingBaos = new ByteArrayOutputStream() + val pingDaos = new DataOutputStream(pingBaos) + writeInt(pingDaos, +1) + ctx.write(pingBaos.toByteArray) + } + } + val conf = new SparkConf() + val heartBeatInterval = conf.getInt( + "spark.r.heartBeatInterval", SparkRDefaults.DEFAULT_HEARTBEAT_INTERVAL) + val backendConnectionTimeout = conf.getInt( + "spark.r.backendConnectionTimeout", SparkRDefaults.DEFAULT_CONNECTION_TIMEOUT) + val interval = Math.min(heartBeatInterval, backendConnectionTimeout - 1) + + execService.scheduleAtFixedRate(pingRunner, interval, interval, TimeUnit.SECONDS) handleMethodCall(isStatic, objId, methodName, numArgs, dis, dos) + execService.shutdown() + execService.awaitTermination(1, TimeUnit.SECONDS) } val reply = bos.toByteArray @@ -95,9 +120,15 @@ private[r] class RBackendHandler(server: RBackend) } override def exceptionCaught(ctx: ChannelHandlerContext, cause: Throwable): Unit = { - // Close the connection when an exception is raised. - cause.printStackTrace() - ctx.close() + cause match { + case timeout: ReadTimeoutException => + // Do nothing. We don't want to timeout on read + logWarning("Ignoring read timeout in RBackendHandler") + case _ => + // Close the connection when an exception is raised. 
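The keep-alive scheduling added above follows a standard single-threaded scheduled-executor lifecycle; the following stand-alone sketch uses plain `java.util.concurrent` (the patch itself goes through Spark's `ThreadUtils` and reads both intervals from `SparkConf`). Pinging every `min(heartBeatInterval, backendConnectionTimeout - 1)` seconds keeps the client's read timeout from ever elapsing between pings.

```scala
import java.util.concurrent.{Executors, TimeUnit}

object KeepAliveSketch {
  def main(args: Array[String]): Unit = {
    val heartBeatInterval = 100          // spark.r.heartBeatInterval, in seconds
    val backendConnectionTimeout = 6000  // spark.r.backendConnectionTimeout, in seconds
    // Ping strictly more often than the client-side read timeout.
    val interval = math.min(heartBeatInterval, backendConnectionTimeout - 1)

    val execService = Executors.newSingleThreadScheduledExecutor()
    val pingRunner = new Runnable {
      // In the real handler this writes the +1 marker to the Netty channel.
      override def run(): Unit = println("ping")
    }
    execService.scheduleAtFixedRate(pingRunner, interval, interval, TimeUnit.SECONDS)

    // The (potentially long) method call would run here on the current thread.
    Thread.sleep(1000)

    // Stop pinging once the real reply has been written.
    execService.shutdown()
    execService.awaitTermination(1, TimeUnit.SECONDS)
  }
}
```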
+ cause.printStackTrace() + ctx.close() + } } def handleMethodCall( diff --git a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala index 496fdf851f7db..7ef64723d9593 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala @@ -333,6 +333,8 @@ private[r] object RRunner { var rCommand = sparkConf.get("spark.sparkr.r.command", "Rscript") rCommand = sparkConf.get("spark.r.command", rCommand) + val rConnectionTimeout = sparkConf.getInt( + "spark.r.backendConnectionTimeout", SparkRDefaults.DEFAULT_CONNECTION_TIMEOUT) val rOptions = "--vanilla" val rLibDir = RUtils.sparkRPackagePath(isDriver = false) val rExecScript = rLibDir(0) + "/SparkR/worker/" + script @@ -344,6 +346,7 @@ private[r] object RRunner { pb.environment().put("R_TESTS", "") pb.environment().put("SPARKR_RLIBDIR", rLibDir.mkString(",")) pb.environment().put("SPARKR_WORKER_PORT", port.toString) + pb.environment().put("SPARKR_BACKEND_CONNECTION_TIMEOUT", rConnectionTimeout.toString) pb.redirectErrorStream(true) // redirect stderr into stdout val proc = pb.start() val errThread = startStdoutThread(proc) diff --git a/core/src/main/scala/org/apache/spark/api/r/SparkRDefaults.scala b/core/src/main/scala/org/apache/spark/api/r/SparkRDefaults.scala new file mode 100644 index 0000000000000..af67cbbce4e51 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/api/r/SparkRDefaults.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.r + +private[spark] object SparkRDefaults { + + // Default value for spark.r.backendConnectionTimeout config + val DEFAULT_CONNECTION_TIMEOUT: Int = 6000 + + // Default value for spark.r.heartBeatInterval config + val DEFAULT_HEARTBEAT_INTERVAL: Int = 100 + + // Default value for spark.r.numRBackendThreads config + val DEFAULT_NUM_RBACKEND_THREADS = 2 +} diff --git a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala index d0466830b2177..6eb53a8252205 100644 --- a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path import org.apache.spark.{SparkException, SparkUserAppException} -import org.apache.spark.api.r.{RBackend, RUtils} +import org.apache.spark.api.r.{RBackend, RUtils, SparkRDefaults} import org.apache.spark.util.RedirectThread /** @@ -51,6 +51,10 @@ object RRunner { cmd } + // Connection timeout set by R process on its connection to RBackend in seconds. 
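Both RRunner variants propagate the timeout to the R child process the same way: read it once from the driver-side configuration, then export it as `SPARKR_BACKEND_CONNECTION_TIMEOUT` so that `daemon.R` and `worker.R` can pick it up with `Sys.getenv()`. A minimal sketch of that hand-off follows; the `Rscript --version` command is only a stand-in for the real worker invocation.

```scala
import org.apache.spark.SparkConf

object TimeoutEnvSketch {
  def main(args: Array[String]): Unit = {
    // Read the timeout with the shared default from SparkRDefaults.
    val conf = new SparkConf()
    val timeout = conf.getInt("spark.r.backendConnectionTimeout", 6000)

    // Export it into the child process environment, as RRunner does for the R worker.
    val pb = new ProcessBuilder("Rscript", "--version") // stand-in command
    pb.environment().put("SPARKR_BACKEND_CONNECTION_TIMEOUT", timeout.toString)
    println(pb.environment().get("SPARKR_BACKEND_CONNECTION_TIMEOUT"))
  }
}
```

The deploy-side RRunner below does the same with `sys.props` before launching the driver R script.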
+ val backendConnectionTimeout = sys.props.getOrElse( + "spark.r.backendConnectionTimeout", SparkRDefaults.DEFAULT_CONNECTION_TIMEOUT.toString) + // Check if the file path exists. // If not, change directory to current working directory for YARN cluster mode val rF = new File(rFile) @@ -81,6 +85,7 @@ object RRunner { val builder = new ProcessBuilder((Seq(rCommand, rFileNormalized) ++ otherArgs).asJava) val env = builder.environment() env.put("EXISTING_SPARKR_BACKEND_PORT", sparkRBackendPort.toString) + env.put("SPARKR_BACKEND_CONNECTION_TIMEOUT", backendConnectionTimeout) val rPackageDir = RUtils.sparkRPackagePath(isDriver = true) // Put the R package directories into an env variable of comma-separated paths env.put("SPARKR_PACKAGE_DIR", rPackageDir.mkString(",")) diff --git a/docs/configuration.md b/docs/configuration.md index 6600cb6c0ac09..780fc94908d38 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1890,6 +1890,21 @@ showDF(properties, numRows = 200, truncate = FALSE) spark.r.shell.command is used for sparkR shell while spark.r.driver.command is used for running R script. + + spark.r.backendConnectionTimeout + 6000 + + Connection timeout set by the R process on its connection to RBackend, in seconds. + + + + spark.r.heartBeatInterval + 100 + + Interval for heartbeats sent from the SparkR backend to the R process to prevent connection timeout. + + + #### Deploy From b6879b8b3518c71c23262554fcb0fdad60287011 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 30 Oct 2016 16:19:19 -0700 Subject: [PATCH 157/162] [SPARK-16137][SPARKR] randomForest for R ## What changes were proposed in this pull request? Random Forest Regression and Classification for R; clean-up and reordering of generics.R ## How was this patch tested? manual tests, unit tests Author: Felix Cheung Closes #15607 from felixcheung/rrandomforest.
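Under the hood, `spark.randomForest` hands the R formula and the tuning parameters to a Scala wrapper that chains an `RFormula` stage and a `RandomForestRegressor` (or `RandomForestClassifier`) into an ML `Pipeline`, as the two wrapper classes added below show. A minimal Scala sketch of that pattern, with toy data and column names that are illustrative only:

```scala
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql.SparkSession

object RandomForestFormulaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("rf-sketch").getOrCreate()
    import spark.implicits._

    // Toy data standing in for the SparkDataFrame passed from R.
    val df = Seq((1.0, 2.0, 3.0), (2.0, 1.0, 5.0), (3.0, 4.0, 7.0), (4.0, 3.0, 9.0))
      .toDF("x1", "x2", "y")

    // spark.randomForest(data, y ~ x1 + x2, "regression") roughly reduces to this pipeline.
    val formula = new RFormula().setFormula("y ~ x1 + x2")
    val rf = new RandomForestRegressor()
      .setNumTrees(20)
      .setMaxDepth(5)
      .setFeaturesCol(formula.getFeaturesCol)

    val model = new Pipeline().setStages(Array(formula, rf)).fit(df)
    model.transform(df).select("y", "prediction").show()

    spark.stop()
  }
}
```

The real wrappers additionally set impurity, subsampling rate, seed, and the other parameters exposed to R, and they drop the assembled features column from the returned predictions.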
--- R/pkg/NAMESPACE | 9 +- R/pkg/R/generics.R | 66 ++--- R/pkg/R/mllib.R | 252 +++++++++++++++++- R/pkg/inst/tests/testthat/test_mllib.R | 68 +++++ .../org/apache/spark/ml/r/RWrappers.scala | 4 + .../r/RandomForestClassificationWrapper.scala | 147 ++++++++++ .../ml/r/RandomForestRegressionWrapper.scala | 144 ++++++++++ 7 files changed, 656 insertions(+), 34 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/r/RandomForestRegressionWrapper.scala diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 7a89c01fee735..9cd6269f9a8f7 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -44,7 +44,8 @@ exportMethods("glm", "spark.gaussianMixture", "spark.als", "spark.kstest", - "spark.logit") + "spark.logit", + "spark.randomForest") # Job group lifecycle management methods export("setJobGroup", @@ -350,7 +351,9 @@ export("as.DataFrame", "uncacheTable", "print.summary.GeneralizedLinearRegressionModel", "read.ml", - "print.summary.KSTest") + "print.summary.KSTest", + "print.summary.RandomForestRegressionModel", + "print.summary.RandomForestClassificationModel") export("structField", "structField.jobj", @@ -375,6 +378,8 @@ S3method(print, structField) S3method(print, structType) S3method(print, summary.GeneralizedLinearRegressionModel) S3method(print, summary.KSTest) +S3method(print, summary.RandomForestRegressionModel) +S3method(print, summary.RandomForestClassificationModel) S3method(structField, character) S3method(structField, jobj) S3method(structType, jobj) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 107e1c638be71..0271b26a10a90 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1310,9 +1310,11 @@ setGeneric("window", function(x, ...) { standardGeneric("window") }) #' @export setGeneric("year", function(x) { standardGeneric("year") }) -#' @rdname spark.glm +###################### Spark.ML Methods ########################## + +#' @rdname fitted #' @export -setGeneric("spark.glm", function(data, formula, ...) { standardGeneric("spark.glm") }) +setGeneric("fitted") #' @param x,y For \code{glm}: logical values indicating whether the response vector #' and model matrix used in the fitting process should be returned as @@ -1332,13 +1334,38 @@ setGeneric("predict", function(object, ...) { standardGeneric("predict") }) #' @export setGeneric("rbind", signature = "...") +#' @rdname spark.als +#' @export +setGeneric("spark.als", function(data, ...) { standardGeneric("spark.als") }) + +#' @rdname spark.gaussianMixture +#' @export +setGeneric("spark.gaussianMixture", + function(data, formula, ...) { standardGeneric("spark.gaussianMixture") }) + +#' @rdname spark.glm +#' @export +setGeneric("spark.glm", function(data, formula, ...) { standardGeneric("spark.glm") }) + +#' @rdname spark.isoreg +#' @export +setGeneric("spark.isoreg", function(data, formula, ...) { standardGeneric("spark.isoreg") }) + #' @rdname spark.kmeans #' @export setGeneric("spark.kmeans", function(data, formula, ...) { standardGeneric("spark.kmeans") }) -#' @rdname fitted +#' @rdname spark.kstest #' @export -setGeneric("fitted") +setGeneric("spark.kstest", function(data, ...) { standardGeneric("spark.kstest") }) + +#' @rdname spark.lda +#' @export +setGeneric("spark.lda", function(data, ...) { standardGeneric("spark.lda") }) + +#' @rdname spark.logit +#' @export +setGeneric("spark.logit", function(data, formula, ...) 
{ standardGeneric("spark.logit") }) #' @rdname spark.mlp #' @export @@ -1348,13 +1375,14 @@ setGeneric("spark.mlp", function(data, ...) { standardGeneric("spark.mlp") }) #' @export setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("spark.naiveBayes") }) -#' @rdname spark.survreg +#' @rdname spark.randomForest #' @export -setGeneric("spark.survreg", function(data, formula) { standardGeneric("spark.survreg") }) +setGeneric("spark.randomForest", + function(data, formula, ...) { standardGeneric("spark.randomForest") }) -#' @rdname spark.lda +#' @rdname spark.survreg #' @export -setGeneric("spark.lda", function(data, ...) { standardGeneric("spark.lda") }) +setGeneric("spark.survreg", function(data, formula) { standardGeneric("spark.survreg") }) #' @rdname spark.lda #' @export @@ -1364,20 +1392,6 @@ setGeneric("spark.posterior", function(object, newData) { standardGeneric("spark #' @export setGeneric("spark.perplexity", function(object, data) { standardGeneric("spark.perplexity") }) -#' @rdname spark.isoreg -#' @export -setGeneric("spark.isoreg", function(data, formula, ...) { standardGeneric("spark.isoreg") }) - -#' @rdname spark.gaussianMixture -#' @export -setGeneric("spark.gaussianMixture", - function(data, formula, ...) { - standardGeneric("spark.gaussianMixture") - }) - -#' @rdname spark.logit -#' @export -setGeneric("spark.logit", function(data, formula, ...) { standardGeneric("spark.logit") }) #' @param object a fitted ML model object. #' @param path the directory where the model is saved. @@ -1385,11 +1399,3 @@ setGeneric("spark.logit", function(data, formula, ...) { standardGeneric("spark. #' @rdname write.ml #' @export setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") }) - -#' @rdname spark.als -#' @export -setGeneric("spark.als", function(data, ...) { standardGeneric("spark.als") }) - -#' @rdname spark.kstest -#' @export -setGeneric("spark.kstest", function(data, ...) { standardGeneric("spark.kstest") }) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 629f284b79f33..7a220b8d53a2f 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -102,6 +102,20 @@ setClass("KSTest", representation(jobj = "jobj")) #' @note LogisticRegressionModel since 2.1.0 setClass("LogisticRegressionModel", representation(jobj = "jobj")) +#' S4 class that represents a RandomForestRegressionModel +#' +#' @param jobj a Java object reference to the backing Scala RandomForestRegressionModel +#' @export +#' @note RandomForestRegressionModel since 2.1.0 +setClass("RandomForestRegressionModel", representation(jobj = "jobj")) + +#' S4 class that represents a RandomForestClassificationModel +#' +#' @param jobj a Java object reference to the backing Scala RandomForestClassificationModel +#' @export +#' @note RandomForestClassificationModel since 2.1.0 +setClass("RandomForestClassificationModel", representation(jobj = "jobj")) + #' Saves the MLlib model to the input path #' #' Saves the MLlib model to the input path. 
For more information, see the specific @@ -112,7 +126,7 @@ setClass("LogisticRegressionModel", representation(jobj = "jobj")) #' @seealso \link{spark.glm}, \link{glm}, #' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, #' @seealso \link{spark.lda}, \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, -#' @seealso \link{spark.survreg} +#' @seealso \link{spark.randomForest}, \link{spark.survreg}, #' @seealso \link{read.ml} NULL @@ -125,7 +139,8 @@ NULL #' @export #' @seealso \link{spark.glm}, \link{glm}, #' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, -#' @seealso \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg} +#' @seealso \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, +#' @seealso \link{spark.randomForest}, \link{spark.survreg} NULL write_internal <- function(object, path, overwrite = FALSE) { @@ -1122,6 +1137,10 @@ read.ml <- function(path) { new("ALSModel", jobj = jobj) } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.LogisticRegressionWrapper")) { new("LogisticRegressionModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.RandomForestRegressorWrapper")) { + new("RandomForestRegressionModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.RandomForestClassifierWrapper")) { + new("RandomForestClassificationModel", jobj = jobj) } else { stop("Unsupported model: ", jobj) } @@ -1617,3 +1636,232 @@ print.summary.KSTest <- function(x, ...) { cat(summaryStr, "\n") invisible(x) } + +#' Random Forest Model for Regression and Classification +#' +#' \code{spark.randomForest} fits a Random Forest Regression model or Classification model on +#' a SparkDataFrame. Users can call \code{summary} to get a summary of the fitted Random Forest +#' model, \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to +#' save/load fitted models. +#' For more details, see +#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html}{Random Forest} +#' +#' @param data a SparkDataFrame for training. +#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', ':', '+', and '-'. +#' @param type type of model, one of "regression" or "classification", to fit +#' @param maxDepth Maximum depth of the tree (>= 0). (default = 5) +#' @param maxBins Maximum number of bins used for discretizing continuous features and for choosing +#' how to split on features at each node. More bins give higher granularity. Must be +#' >= 2 and >= number of categories in any categorical feature. (default = 32) +#' @param numTrees Number of trees to train (>= 1). +#' @param impurity Criterion used for information gain calculation. +#' For regression, must be "variance". For classification, must be one of +#' "entropy" and "gini". (default = gini) +#' @param minInstancesPerNode Minimum number of instances each child must have after split. +#' @param minInfoGain Minimum information gain for a split to be considered at a tree node. +#' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). +#' @param featureSubsetStrategy The number of features to consider for splits at each tree node. +#' Supported options: "auto", "all", "onethird", "sqrt", "log2", (0.0-1.0], [1-n]. +#' @param seed integer seed for random number generation. 
+#' @param subsamplingRate Fraction of the training data used for learning each decision tree, in +#' range (0, 1]. (default = 1.0) +#' @param probabilityCol column name for predicted class conditional probabilities, only for +#' classification. (default = "probability") +#' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. +#' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with +#' nodes. +#' @param ... additional arguments passed to the method. +#' @aliases spark.randomForest,SparkDataFrame,formula-method +#' @return \code{spark.randomForest} returns a fitted Random Forest model. +#' @rdname spark.randomForest +#' @name spark.randomForest +#' @export +#' @examples +#' \dontrun{ +#' # fit a Random Forest Regression Model +#' df <- createDataFrame(longley) +#' model <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 5, maxBins = 16) +#' +#' # get the summary of the model +#' summary(model) +#' +#' # make predictions +#' predictions <- predict(model, df) +#' +#' # save and load the model +#' path <- "path/to/model" +#' write.ml(model, path) +#' savedModel <- read.ml(path) +#' summary(savedModel) +#' +#' # fit a Random Forest Classification Model +#' df <- createDataFrame(iris) +#' model <- spark.randomForest(df, Species ~ Petal_Length + Petal_Width, "classification") +#' } +#' @note spark.randomForest since 2.1.0 +setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, type = c("regression", "classification"), + maxDepth = 5, maxBins = 32, numTrees = 20, impurity = NULL, + minInstancesPerNode = 1, minInfoGain = 0.0, checkpointInterval = 10, + featureSubsetStrategy = "auto", seed = NULL, subsamplingRate = 1.0, + probabilityCol = "probability", maxMemoryInMB = 256, cacheNodeIds = FALSE) { + type <- match.arg(type) + formula <- paste(deparse(formula), collapse = "") + if (!is.null(seed)) { + seed <- as.character(as.integer(seed)) + } + switch(type, + regression = { + if (is.null(impurity)) impurity <- "variance" + impurity <- match.arg(impurity, "variance") + jobj <- callJStatic("org.apache.spark.ml.r.RandomForestRegressorWrapper", + "fit", data@sdf, formula, as.integer(maxDepth), + as.integer(maxBins), as.integer(numTrees), + impurity, as.integer(minInstancesPerNode), + as.numeric(minInfoGain), as.integer(checkpointInterval), + as.character(featureSubsetStrategy), seed, + as.numeric(subsamplingRate), + as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) + new("RandomForestRegressionModel", jobj = jobj) + }, + classification = { + if (is.null(impurity)) impurity <- "gini" + impurity <- match.arg(impurity, c("gini", "entropy")) + jobj <- callJStatic("org.apache.spark.ml.r.RandomForestClassifierWrapper", + "fit", data@sdf, formula, as.integer(maxDepth), + as.integer(maxBins), as.integer(numTrees), + impurity, as.integer(minInstancesPerNode), + as.numeric(minInfoGain), as.integer(checkpointInterval), + as.character(featureSubsetStrategy), seed, + as.numeric(subsamplingRate), as.character(probabilityCol), + as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) + new("RandomForestClassificationModel", jobj = jobj) + } + ) + }) + +# Makes predictions from a Random Forest Regression model or Classification model + +#' @param newData a SparkDataFrame for testing. 
+#' @return \code{predict} returns a SparkDataFrame containing predicted labeled in a column named +#' "prediction" +#' @rdname spark.randomForest +#' @aliases predict,RandomForestRegressionModel-method +#' @export +#' @note predict(randomForestRegressionModel) since 2.1.0 +setMethod("predict", signature(object = "RandomForestRegressionModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +#' @rdname spark.randomForest +#' @aliases predict,RandomForestClassificationModel-method +#' @export +#' @note predict(randomForestClassificationModel) since 2.1.0 +setMethod("predict", signature(object = "RandomForestClassificationModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Save the Random Forest Regression or Classification model to the input path. + +#' @param object A fitted Random Forest regression model or classification model +#' @param path The directory where the model is saved +#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE +#' which means throw exception if the output path exists. +#' +#' @aliases write.ml,RandomForestRegressionModel,character-method +#' @rdname spark.randomForest +#' @export +#' @note write.ml(RandomForestRegressionModel, character) since 2.1.0 +setMethod("write.ml", signature(object = "RandomForestRegressionModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +#' @aliases write.ml,RandomForestClassificationModel,character-method +#' @rdname spark.randomForest +#' @export +#' @note write.ml(RandomForestClassificationModel, character) since 2.1.0 +setMethod("write.ml", signature(object = "RandomForestClassificationModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +# Get the summary of an RandomForestRegressionModel model +summary.randomForest <- function(model) { + jobj <- model@jobj + formula <- callJMethod(jobj, "formula") + numFeatures <- callJMethod(jobj, "numFeatures") + features <- callJMethod(jobj, "features") + featureImportances <- callJMethod(callJMethod(jobj, "featureImportances"), "toString") + numTrees <- callJMethod(jobj, "numTrees") + treeWeights <- callJMethod(jobj, "treeWeights") + list(formula = formula, + numFeatures = numFeatures, + features = features, + featureImportances = featureImportances, + numTrees = numTrees, + treeWeights = treeWeights, + jobj = jobj) +} + +#' @return \code{summary} returns the model's features as lists, depth and number of nodes +#' or number of classes. 
+#' @rdname spark.randomForest +#' @aliases summary,RandomForestRegressionModel-method +#' @export +#' @note summary(RandomForestRegressionModel) since 2.1.0 +setMethod("summary", signature(object = "RandomForestRegressionModel"), + function(object) { + ans <- summary.randomForest(object) + class(ans) <- "summary.RandomForestRegressionModel" + ans + }) + +# Get the summary of an RandomForestClassificationModel model + +#' @rdname spark.randomForest +#' @aliases summary,RandomForestClassificationModel-method +#' @export +#' @note summary(RandomForestClassificationModel) since 2.1.0 +setMethod("summary", signature(object = "RandomForestClassificationModel"), + function(object) { + ans <- summary.randomForest(object) + class(ans) <- "summary.RandomForestClassificationModel" + ans + }) + +# Prints the summary of Random Forest Regression Model +print.summary.randomForest <- function(x) { + jobj <- x$jobj + cat("Formula: ", x$formula) + cat("\nNumber of features: ", x$numFeatures) + cat("\nFeatures: ", unlist(x$features)) + cat("\nFeature importances: ", x$featureImportances) + cat("\nNumber of trees: ", x$numTrees) + cat("\nTree weights: ", unlist(x$treeWeights)) + + summaryStr <- callJMethod(jobj, "summary") + cat("\n", summaryStr, "\n") + invisible(x) +} + +#' @param x summary object of Random Forest regression model or classification model +#' returned by \code{summary}. +#' @rdname spark.randomForest +#' @export +#' @note print.summary.RandomForestRegressionModel since 2.1.0 +print.summary.RandomForestRegressionModel <- function(x, ...) { + print.summary.randomForest(x) +} + +# Prints the summary of Random Forest Classification Model + +#' @rdname spark.randomForest +#' @export +#' @note print.summary.RandomForestClassificationModel since 2.1.0 +print.summary.RandomForestClassificationModel <- function(x, ...) 
{ + print.summary.randomForest(x) +} diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 6d1fccc7c0582..db98d0e45547e 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -871,4 +871,72 @@ test_that("spark.kstest", { expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:") }) +test_that("spark.randomForest Regression", { + data <- suppressWarnings(createDataFrame(longley)) + model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, + numTrees = 1) + + predictions <- collect(predict(model, data)) + expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187, + 63.221, 63.639, 64.989, 63.761, + 66.019, 67.857, 68.169, 66.513, + 68.655, 69.564, 69.331, 70.551), + tolerance = 1e-4) + + stats <- summary(model) + expect_equal(stats$numTrees, 1) + expect_error(capture.output(stats), NA) + expect_true(length(capture.output(stats)) > 6) + + model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, + numTrees = 20, seed = 123) + predictions <- collect(predict(model, data)) + expect_equal(predictions$prediction, c(60.379, 61.096, 60.636, 62.258, + 63.736, 64.296, 64.868, 64.300, + 66.709, 67.697, 67.966, 67.252, + 68.866, 69.593, 69.195, 69.658), + tolerance = 1e-4) + stats <- summary(model) + expect_equal(stats$numTrees, 20) + + modelPath <- tempfile(pattern = "spark-randomForestRegression", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + expect_equal(stats$formula, stats2$formula) + expect_equal(stats$numFeatures, stats2$numFeatures) + expect_equal(stats$features, stats2$features) + expect_equal(stats$featureImportances, stats2$featureImportances) + expect_equal(stats$numTrees, stats2$numTrees) + expect_equal(stats$treeWeights, stats2$treeWeights) + + unlink(modelPath) +}) + +test_that("spark.randomForest Classification", { + data <- suppressWarnings(createDataFrame(iris)) + model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification", + maxDepth = 5, maxBins = 16) + + stats <- summary(model) + expect_equal(stats$numFeatures, 2) + expect_equal(stats$numTrees, 20) + expect_error(capture.output(stats), NA) + expect_true(length(capture.output(stats)) > 6) + + modelPath <- tempfile(pattern = "spark-randomForestClassification", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + expect_equal(stats$depth, stats2$depth) + expect_equal(stats$numNodes, stats2$numNodes) + expect_equal(stats$numClasses, stats2$numClasses) + + unlink(modelPath) +}) + sparkR.session.stop() diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/RWrappers.scala b/mllib/src/main/scala/org/apache/spark/ml/r/RWrappers.scala index 1df3662a5822b..0e09e18027ca7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/RWrappers.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/RWrappers.scala @@ -56,6 +56,10 @@ private[r] object RWrappers extends MLReader[Object] { ALSWrapper.load(path) case "org.apache.spark.ml.r.LogisticRegressionWrapper" => LogisticRegressionWrapper.load(path) + case "org.apache.spark.ml.r.RandomForestRegressorWrapper" => + RandomForestRegressorWrapper.load(path) + case 
"org.apache.spark.ml.r.RandomForestClassifierWrapper" => + RandomForestClassifierWrapper.load(path) case _ => throw new SparkException(s"SparkR read.ml does not support load $className") } diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala new file mode 100644 index 0000000000000..b0088ddaf3b1d --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.r + +import org.apache.hadoop.fs.Path +import org.json4s._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.ml.attribute.AttributeGroup +import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} +import org.apache.spark.ml.feature.RFormula +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.util._ +import org.apache.spark.sql.{DataFrame, Dataset} + +private[r] class RandomForestClassifierWrapper private ( + val pipeline: PipelineModel, + val formula: String, + val features: Array[String]) extends MLWritable { + + private val DTModel: RandomForestClassificationModel = + pipeline.stages(1).asInstanceOf[RandomForestClassificationModel] + + lazy val numFeatures: Int = DTModel.numFeatures + lazy val featureImportances: Vector = DTModel.featureImportances + lazy val numTrees: Int = DTModel.getNumTrees + lazy val treeWeights: Array[Double] = DTModel.treeWeights + + def summary: String = DTModel.toDebugString + + def transform(dataset: Dataset[_]): DataFrame = { + pipeline.transform(dataset).drop(DTModel.getFeaturesCol) + } + + override def write: MLWriter = new + RandomForestClassifierWrapper.RandomForestClassifierWrapperWriter(this) +} + +private[r] object RandomForestClassifierWrapper extends MLReadable[RandomForestClassifierWrapper] { + def fit( // scalastyle:ignore + data: DataFrame, + formula: String, + maxDepth: Int, + maxBins: Int, + numTrees: Int, + impurity: String, + minInstancesPerNode: Int, + minInfoGain: Double, + checkpointInterval: Int, + featureSubsetStrategy: String, + seed: String, + subsamplingRate: Double, + probabilityCol: String, + maxMemoryInMB: Int, + cacheNodeIds: Boolean): RandomForestClassifierWrapper = { + + val rFormula = new RFormula() + .setFormula(formula) + RWrapperUtils.checkDataColumns(rFormula, data) + val rFormulaModel = rFormula.fit(data) + + // get feature names from output schema + val schema = rFormulaModel.transform(data).schema + val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol)) + 
.attributes.get + val features = featureAttrs.map(_.name.get) + + // assemble and fit the pipeline + val rfc = new RandomForestClassifier() + .setMaxDepth(maxDepth) + .setMaxBins(maxBins) + .setNumTrees(numTrees) + .setImpurity(impurity) + .setMinInstancesPerNode(minInstancesPerNode) + .setMinInfoGain(minInfoGain) + .setCheckpointInterval(checkpointInterval) + .setFeatureSubsetStrategy(featureSubsetStrategy) + .setSubsamplingRate(subsamplingRate) + .setMaxMemoryInMB(maxMemoryInMB) + .setCacheNodeIds(cacheNodeIds) + .setProbabilityCol(probabilityCol) + .setFeaturesCol(rFormula.getFeaturesCol) + if (seed != null && seed.length > 0) rfc.setSeed(seed.toLong) + + val pipeline = new Pipeline() + .setStages(Array(rFormulaModel, rfc)) + .fit(data) + + new RandomForestClassifierWrapper(pipeline, formula, features) + } + + override def read: MLReader[RandomForestClassifierWrapper] = + new RandomForestClassifierWrapperReader + + override def load(path: String): RandomForestClassifierWrapper = super.load(path) + + class RandomForestClassifierWrapperWriter(instance: RandomForestClassifierWrapper) + extends MLWriter { + + override protected def saveImpl(path: String): Unit = { + val rMetadataPath = new Path(path, "rMetadata").toString + val pipelinePath = new Path(path, "pipeline").toString + + val rMetadata = ("class" -> instance.getClass.getName) ~ + ("formula" -> instance.formula) ~ + ("features" -> instance.features.toSeq) + val rMetadataJson: String = compact(render(rMetadata)) + + sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath) + instance.pipeline.save(pipelinePath) + } + } + + class RandomForestClassifierWrapperReader extends MLReader[RandomForestClassifierWrapper] { + + override def load(path: String): RandomForestClassifierWrapper = { + implicit val format = DefaultFormats + val rMetadataPath = new Path(path, "rMetadata").toString + val pipelinePath = new Path(path, "pipeline").toString + val pipeline = PipelineModel.load(pipelinePath) + + val rMetadataStr = sc.textFile(rMetadataPath, 1).first() + val rMetadata = parse(rMetadataStr) + val formula = (rMetadata \ "formula").extract[String] + val features = (rMetadata \ "features").extract[Array[String]] + + new RandomForestClassifierWrapper(pipeline, formula, features) + } + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestRegressionWrapper.scala new file mode 100644 index 0000000000000..c8874407fa75e --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestRegressionWrapper.scala @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.r + +import org.apache.hadoop.fs.Path +import org.json4s._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.ml.attribute.AttributeGroup +import org.apache.spark.ml.feature.RFormula +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor} +import org.apache.spark.ml.util._ +import org.apache.spark.sql.{DataFrame, Dataset} + +private[r] class RandomForestRegressorWrapper private ( + val pipeline: PipelineModel, + val formula: String, + val features: Array[String]) extends MLWritable { + + private val DTModel: RandomForestRegressionModel = + pipeline.stages(1).asInstanceOf[RandomForestRegressionModel] + + lazy val numFeatures: Int = DTModel.numFeatures + lazy val featureImportances: Vector = DTModel.featureImportances + lazy val numTrees: Int = DTModel.getNumTrees + lazy val treeWeights: Array[Double] = DTModel.treeWeights + + def summary: String = DTModel.toDebugString + + def transform(dataset: Dataset[_]): DataFrame = { + pipeline.transform(dataset).drop(DTModel.getFeaturesCol) + } + + override def write: MLWriter = new + RandomForestRegressorWrapper.RandomForestRegressorWrapperWriter(this) +} + +private[r] object RandomForestRegressorWrapper extends MLReadable[RandomForestRegressorWrapper] { + def fit( // scalastyle:ignore + data: DataFrame, + formula: String, + maxDepth: Int, + maxBins: Int, + numTrees: Int, + impurity: String, + minInstancesPerNode: Int, + minInfoGain: Double, + checkpointInterval: Int, + featureSubsetStrategy: String, + seed: String, + subsamplingRate: Double, + maxMemoryInMB: Int, + cacheNodeIds: Boolean): RandomForestRegressorWrapper = { + + val rFormula = new RFormula() + .setFormula(formula) + RWrapperUtils.checkDataColumns(rFormula, data) + val rFormulaModel = rFormula.fit(data) + + // get feature names from output schema + val schema = rFormulaModel.transform(data).schema + val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol)) + .attributes.get + val features = featureAttrs.map(_.name.get) + + // assemble and fit the pipeline + val rfr = new RandomForestRegressor() + .setMaxDepth(maxDepth) + .setMaxBins(maxBins) + .setNumTrees(numTrees) + .setImpurity(impurity) + .setMinInstancesPerNode(minInstancesPerNode) + .setMinInfoGain(minInfoGain) + .setCheckpointInterval(checkpointInterval) + .setFeatureSubsetStrategy(featureSubsetStrategy) + .setSubsamplingRate(subsamplingRate) + .setMaxMemoryInMB(maxMemoryInMB) + .setCacheNodeIds(cacheNodeIds) + .setFeaturesCol(rFormula.getFeaturesCol) + if (seed != null && seed.length > 0) rfr.setSeed(seed.toLong) + + val pipeline = new Pipeline() + .setStages(Array(rFormulaModel, rfr)) + .fit(data) + + new RandomForestRegressorWrapper(pipeline, formula, features) + } + + override def read: MLReader[RandomForestRegressorWrapper] = new RandomForestRegressorWrapperReader + + override def load(path: String): RandomForestRegressorWrapper = super.load(path) + + class RandomForestRegressorWrapperWriter(instance: RandomForestRegressorWrapper) + extends MLWriter { + + override protected def saveImpl(path: String): Unit = { + val rMetadataPath = new Path(path, "rMetadata").toString + val pipelinePath = new Path(path, "pipeline").toString + + val rMetadata = ("class" -> instance.getClass.getName) ~ + ("formula" -> instance.formula) ~ + ("features" -> instance.features.toSeq) + val rMetadataJson: 
String = compact(render(rMetadata)) + + sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath) + instance.pipeline.save(pipelinePath) + } + } + + class RandomForestRegressorWrapperReader extends MLReader[RandomForestRegressorWrapper] { + + override def load(path: String): RandomForestRegressorWrapper = { + implicit val format = DefaultFormats + val rMetadataPath = new Path(path, "rMetadata").toString + val pipelinePath = new Path(path, "pipeline").toString + val pipeline = PipelineModel.load(pipelinePath) + + val rMetadataStr = sc.textFile(rMetadataPath, 1).first() + val rMetadata = parse(rMetadataStr) + val formula = (rMetadata \ "formula").extract[String] + val features = (rMetadata \ "features").extract[Array[String]] + + new RandomForestRegressorWrapper(pipeline, formula, features) + } + } +} From 7c3786929205b962b430cf7fc292602c2993c193 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 30 Oct 2016 16:21:37 -0700 Subject: [PATCH 158/162] [SPARK-18110][PYTHON][ML] add missing parameter in Python for RandomForest regression and classification ## What changes were proposed in this pull request? Add subsmaplingRate to randomForestClassifier Add varianceCol to randomForestRegressor In Python ## How was this patch tested? manual tests Author: Felix Cheung Closes #15638 from felixcheung/pyrandomforest. --- python/pyspark/ml/classification.py | 11 ++++++----- python/pyspark/ml/regression.py | 12 ++++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 3f763a10d4066..d9ff356b9403a 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -758,20 +758,21 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", - numTrees=20, featureSubsetStrategy="auto", seed=None): + numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ probabilityCol="probability", rawPredictionCol="rawPrediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \ - numTrees=20, featureSubsetStrategy="auto", seed=None) + numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0) """ super(RandomForestClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.RandomForestClassifier", self.uid) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="gini", numTrees=20, featureSubsetStrategy="auto") + impurity="gini", numTrees=20, featureSubsetStrategy="auto", + subsamplingRate=1.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -781,13 +782,13 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, - impurity="gini", numTrees=20, featureSubsetStrategy="auto"): + impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0): 
""" setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ probabilityCol="probability", rawPredictionCol="rawPrediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \ - impurity="gini", numTrees=20, featureSubsetStrategy="auto") + impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0) Sets params for linear classification. """ kwargs = self.setParams._input_kwargs diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 55d38033ef72a..9233d2e7e1a77 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -594,7 +594,7 @@ class RandomForestParams(TreeEnsembleParams): featureSubsetStrategy = \ Param(Params._dummy(), "featureSubsetStrategy", "The number of features to consider for splits at each tree node. Supported " + - "options: " + ", ".join(supportedFeatureSubsetStrategies) + " (0.0-1.0], [1-n].", + "options: " + ", ".join(supportedFeatureSubsetStrategies) + ", (0.0-1.0], [1-n].", typeConverter=TypeConverters.toString) def __init__(self): @@ -828,7 +828,7 @@ def featureImportances(self): @inherit_doc class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, RandomForestParams, TreeRegressorParams, HasCheckpointInterval, - JavaMLWritable, JavaMLReadable): + JavaMLWritable, JavaMLReadable, HasVarianceCol): """ `Random Forest `_ learning algorithm for regression. @@ -876,13 +876,13 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, - featureSubsetStrategy="auto"): + featureSubsetStrategy="auto", varianceCol=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, \ - featureSubsetStrategy="auto") + featureSubsetStrategy="auto", varianceCol=None) """ super(RandomForestRegressor, self).__init__() self._java_obj = self._new_java_obj( @@ -900,13 +900,13 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, - featureSubsetStrategy="auto"): + featureSubsetStrategy="auto", varianceCol=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, \ - featureSubsetStrategy="auto") + featureSubsetStrategy="auto", varianceCol=None) Sets params for linear regression. """ kwargs = self.setParams._input_kwargs From d2923f173265b66a4ec71c3c86ff71a58d5aeb3d Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Mon, 31 Oct 2016 00:11:33 -0700 Subject: [PATCH 159/162] [SPARK-18143][SQL] Ignore Structured Streaming event logs to avoid breaking history server ## What changes were proposed in this pull request? 
Because of the refactoring work in Structured Streaming, the event logs generated by Strucutred Streaming in Spark 2.0.0 and 2.0.1 cannot be parsed. This PR just ignores these logs in ReplayListenerBus because no places use them. ## How was this patch tested? - Generated events logs using Spark 2.0.0 and 2.0.1, and saved them as `structured-streaming-query-event-logs-2.0.0.txt` and `structured-streaming-query-event-logs-2.0.1.txt` - The new added test makes sure ReplayListenerBus will skip these bad jsons. Author: Shixiong Zhu Closes #15663 from zsxwing/fix-event-log. --- .../spark/scheduler/ReplayListenerBus.scala | 13 ++++++ .../query-event-logs-version-2.0.0.txt | 4 ++ .../query-event-logs-version-2.0.1.txt | 4 ++ .../StreamingQueryListenerSuite.scala | 42 +++++++++++++++++++ 4 files changed, 63 insertions(+) create mode 100644 sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.0.txt create mode 100644 sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.1.txt diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala index 3eff8d952bfd6..2424586431aa0 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala @@ -72,6 +72,10 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { postToAll(JsonProtocol.sparkEventFromJson(parse(currentLine))) } catch { + case e: ClassNotFoundException if KNOWN_REMOVED_CLASSES.contains(e.getMessage) => + // Ignore events generated by Structured Streaming in Spark 2.0.0 and 2.0.1. + // It's safe since no place uses them. + logWarning(s"Dropped incompatible Structured Streaming log: $currentLine") case jpe: JsonParseException => // We can only ignore exception from last line of the file that might be truncated // the last entry may not be the very last line in the event log, but we treat it @@ -102,4 +106,13 @@ private[spark] object ReplayListenerBus { // utility filter that selects all event logs during replay val SELECT_ALL_FILTER: ReplayEventsFilter = { (eventString: String) => true } + + /** + * Classes that were removed. Structured Streaming doesn't use them any more. However, parsing + * old json may fail and we can just ignore these failures. 
+ */ + val KNOWN_REMOVED_CLASSES = Set( + "org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgress", + "org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated" + ) } diff --git a/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.0.txt b/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.0.txt new file mode 100644 index 0000000000000..aa7e9a8c20c43 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.0.txt @@ -0,0 +1,4 @@ +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgress","queryInfo":{"name":"hello","id":0,"sourceStatuses":[{"description":"FileStreamSource[file:/Users/zsx/stream]","offsetDesc":"#0"}],"sinkStatus":{"description":"org.apache.spark.sql.execution.streaming.MemorySink@2b85b3a5","offsetDesc":"[#0]"}}} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated","queryInfo":{"name":"hello","id":0,"sourceStatuses":[{"description":"FileStreamSource[file:/Users/zsx/stream]","offsetDesc":"#0"}],"sinkStatus":{"description":"org.apache.spark.sql.execution.streaming.MemorySink@2b85b3a5","offsetDesc":"[#0]"}},"exception":null,"stackTrace":[]} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated","queryInfo":{"name":"hello","id":0,"sourceStatuses":[{"description":"FileStreamSource[file:/Users/zsx/stream]","offsetDesc":"#0"}],"sinkStatus":{"description":"org.apache.spark.sql.execution.streaming.MemorySink@514502dc","offsetDesc":"[-]"}},"exception":"Query hello terminated with exception: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.ArithmeticException: / by zero\n\tat $line15.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1.apply(:25)\n\tat $line15.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1.apply(:25)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)\n\tat org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)\n\tat org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)\n\tat org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)\n\tat org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)\n\tat org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:283)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:85)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)\n\tat java.lang.Thread.run(Thread.java:745)\n\nDriver 
stacktrace:","stackTrace":[{"methodName":"org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches","fileName":"StreamExecution.scala","lineNumber":208,"className":"org.apache.spark.sql.execution.streaming.StreamExecution","nativeMethod":false},{"methodName":"run","fileName":"StreamExecution.scala","lineNumber":120,"className":"org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1","nativeMethod":false}]} +{"Event":"SparkListenerApplicationEnd","Timestamp":1477593059313} diff --git a/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.1.txt b/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.1.txt new file mode 100644 index 0000000000000..646cf107183b4 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/query-event-logs-version-2.0.1.txt @@ -0,0 +1,4 @@ +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgress","queryInfo":{"name":"hello","id":0,"sourceStatuses":[{"description":"FileStreamSource[file:/Users/zsx/stream]","offsetDesc":"#0"}],"sinkStatus":{"description":"org.apache.spark.sql.execution.streaming.MemorySink@10e5ec94","offsetDesc":"[#0]"}}} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated","queryInfo":{"name":"hello","id":0,"sourceStatuses":[{"description":"FileStreamSource[file:/Users/zsx/stream]","offsetDesc":"#0"}],"sinkStatus":{"description":"org.apache.spark.sql.execution.streaming.MemorySink@10e5ec94","offsetDesc":"[#0]"}},"exception":null} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated","queryInfo":{"name":"hello","id":0,"sourceStatuses":[{"description":"FileStreamSource[file:/Users/zsx/stream]","offsetDesc":"#0"}],"sinkStatus":{"description":"org.apache.spark.sql.execution.streaming.MemorySink@70c61dc8","offsetDesc":"[-]"}},"exception":"org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.ArithmeticException: / by zero\n\tat $line15.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1.apply(:25)\n\tat $line15.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1.apply(:25)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)\n\tat org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)\n\tat org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)\n\tat org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)\n\tat org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)\n\tat org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:283)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:86)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)\n\tat 
java.lang.Thread.run(Thread.java:745)\n\nDriver stacktrace:\n\tat org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)\n\tat scala.Option.foreach(Option.scala:257)\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:1890)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:1903)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:1916)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:1930)\n\tat org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:912)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:358)\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:911)\n\tat org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:290)\n\tat org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2193)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)\n\tat org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2546)\n\tat org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2192)\n\tat org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2197)\n\tat org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2197)\n\tat org.apache.spark.sql.Dataset.withCallback(Dataset.scala:2559)\n\tat org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2197)\n\tat org.apache.spark.sql.Dataset.collect(Dataset.scala:2173)\n\tat org.apache.spark.sql.execution.streaming.MemorySink.addBatch(memory.scala:154)\n\tat org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch(StreamExecution.scala:366)\n\tat org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1.apply$mcZ$sp(StreamExecution.scala:197)\n\tat org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:43)\n\tat 
org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches(StreamExecution.scala:187)\n\tat org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:124)\nCaused by: java.lang.ArithmeticException: / by zero\n\tat $line15.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1.apply(:25)\n\tat $line15.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1.apply(:25)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)\n\tat org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)\n\tat org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)\n\tat org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)\n\tat org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)\n\tat org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:283)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:86)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)\n\tat java.lang.Thread.run(Thread.java:745)\n"} +{"Event":"SparkListenerApplicationEnd","Timestamp":1477701734609} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index ff843865a017e..cebb32a0a56cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -17,11 +17,14 @@ package org.apache.spark.sql.streaming +import scala.collection.mutable + import org.scalactic.TolerantNumerics import org.scalatest.BeforeAndAfter import org.scalatest.PrivateMethodTester._ import org.apache.spark.SparkException +import org.apache.spark.scheduler._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.functions._ @@ -206,6 +209,45 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { assert(queryQueryTerminated.exception === newQueryTerminated.exception) } + test("ReplayListenerBus should ignore broken event jsons generated in 2.0.0") { + // query-event-logs-version-2.0.0.txt has all types of events generated by + // Structured Streaming in Spark 2.0.0. + // SparkListenerApplicationEnd is the only valid event and it's the last event. We use it + // to verify that we can skip broken jsons generated by Structured Streaming. + testReplayListenerBusWithBorkenEventJsons("query-event-logs-version-2.0.0.txt") + } + + test("ReplayListenerBus should ignore broken event jsons generated in 2.0.1") { + // query-event-logs-version-2.0.1.txt has all types of events generated by + // Structured Streaming in Spark 2.0.1. 
+ // SparkListenerApplicationEnd is the only valid event and it's the last event. We use it + // to verify that we can skip broken jsons generated by Structured Streaming. + testReplayListenerBusWithBorkenEventJsons("query-event-logs-version-2.0.1.txt") + } + + private def testReplayListenerBusWithBorkenEventJsons(fileName: String): Unit = { + val input = getClass.getResourceAsStream(s"/structured-streaming/$fileName") + val events = mutable.ArrayBuffer[SparkListenerEvent]() + try { + val replayer = new ReplayListenerBus() { + // Redirect all parsed events to `events` + override def doPostEvent( + listener: SparkListenerInterface, + event: SparkListenerEvent): Unit = { + events += event + } + } + // Add a dummy listener so that "doPostEvent" will be called. + replayer.addListener(new SparkListener {}) + replayer.replay(input, fileName) + // SparkListenerApplicationEnd is the only valid event + assert(events.size === 1) + assert(events(0).isInstanceOf[SparkListenerApplicationEnd]) + } finally { + input.close() + } + } + private def assertStreamingQueryInfoEquals( expected: StreamingQueryStatus, actual: StreamingQueryStatus): Unit = { From 26b07f1908eeffd934b1e86fb4de02f69945e004 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 31 Oct 2016 10:10:22 +0000 Subject: [PATCH 160/162] [BUILD] Close stale Pull Requests. Closes #11610 Closes #15411 Closes #15501 Closes #12613 Closes #12518 Closes #12026 Closes #15524 Closes #12693 Closes #12358 Closes #15588 Closes #15635 Closes #15678 Closes #14699 Closes #9008 Author: Sean Owen Closes #15685 from srowen/CloseStalePRs. From ee3953db2e1a68c91a00e39947c8e26f023d3774 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 31 Oct 2016 18:49:15 +0000 Subject: [PATCH 161/162] fix merge --- .../sql/execution/datasources/FileFormat.scala | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala index 9d153cec731a8..b31c4d51c7923 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala @@ -78,6 +78,21 @@ trait FileFormat { false } + /** + * Allow FileFormats to have a pluggable way to utilize pushed filters to eliminate partitions + * before execution. By default no pruning is performed and the original partitioning is + * preserved. + */ + def filterPartitions( + filters: Seq[Filter], + schema: StructType, + conf: Configuration, + allFiles: Seq[FileStatus], + root: Path, + partitions: Seq[Partition]): Seq[Partition] = { + partitions + } + /** * Returns whether a file with `path` could be splitted or not. */ From 08fac966f009bd6029d5507bb9e61fda4433b82a Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 31 Oct 2016 22:35:58 +0000 Subject: [PATCH 162/162] parquet bump --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 3bb2065f0048d..c34db84259c5b 100644 --- a/pom.xml +++ b/pom.xml @@ -134,7 +134,7 @@ 1.2.1 10.12.1.1 - 1.9.0-palantir2 + 1.9.0-palantir3 1.6.0 9.2.16.v20160414 3.1.0
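A note on the replay-robustness changes above: the two new tests feed complete Spark 2.0.0 / 2.0.1 event logs through ReplayListenerBus and expect only the trailing SparkListenerApplicationEnd to survive, because the QueryProgress / QueryTerminated classes listed in KNOWN_REMOVED_CLASSES no longer exist in later versions. The following is a minimal, hypothetical sketch of that skipping idea, assuming the JSON decoding step surfaces a ClassNotFoundException whose message is the missing class name; readEventOrSkip and parseEvent are illustrative names, not the actual ReplayListenerBus internals.

    import org.apache.spark.scheduler.SparkListenerEvent

    // Sketch only: decode one JSON event line, dropping it when its event class
    // is one of the classes known to have been removed from Spark.
    def readEventOrSkip(
        line: String,
        parseEvent: String => SparkListenerEvent,   // stand-in for the real JSON decoding
        knownRemovedClasses: Set[String]): Option[SparkListenerEvent] = {
      try {
        Some(parseEvent(line))
      } catch {
        case e: ClassNotFoundException if knownRemovedClasses.contains(e.getMessage) =>
          None // e.g. StreamingQueryListener$QueryProgress from a 2.0.0 log; safe to ignore
      }
    }

Under this sketch, replaying either resource file above would yield exactly one surviving event per file, which is what the tests assert.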
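To illustrate the filterPartitions hook added to FileFormat in the merge-fix commit above, here is a hedged sketch of a subclass that prunes partitions from the pushed filters. Everything beyond the overridden signature is an assumption made for the example: the date=YYYY-MM-DD directory layout, the EqualTo("date", ...) filter, and a Partition shape exposing a files: Seq[FileStatus] field. This is not part of the patch and not a definitive implementation.

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileStatus, Path}
    import org.apache.spark.sql.execution.datasources.Partition
    import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
    import org.apache.spark.sql.sources.{EqualTo, Filter}
    import org.apache.spark.sql.types.StructType

    // Sketch only: keep just the partitions whose file paths can match a pushed
    // EqualTo("date", ...) filter, assuming a .../date=YYYY-MM-DD/... layout.
    class DatePrunedParquetFormat extends ParquetFileFormat {
      override def filterPartitions(
          filters: Seq[Filter],
          schema: StructType,
          conf: Configuration,
          allFiles: Seq[FileStatus],
          root: Path,
          partitions: Seq[Partition]): Seq[Partition] = {
        val wantedDirs = filters.collect { case EqualTo("date", v) => s"date=$v" }
        if (wantedDirs.isEmpty) {
          partitions // no usable filter was pushed down; preserve the original partitioning
        } else {
          partitions.filter(_.files.exists(f =>
            wantedDirs.exists(dir => f.getPath.toString.contains(dir))))
        }
      }
    }

Because the default implementation returns the partitions unchanged, existing FileFormats are unaffected; only formats that override the hook, as sketched here, participate in partition pruning.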