From b735947d68f18109a09fcd1d1eee048f823651be Mon Sep 17 00:00:00 2001
From: Chao Sun
Date: Thu, 5 Nov 2020 15:22:53 -0800
Subject: [PATCH 1/2] Remove shares Hadoop classes

---
 .../sql/hive/client/IsolatedClientLoader.scala     | 13 +++----------
 .../sql/hive/client/HadoopVersionInfoSuite.scala   |  3 +--
 .../spark/sql/hive/client/HiveClientBuilder.scala  |  6 ++----
 .../hive/client/HivePartitionFilteringSuite.scala  |  4 ----
 .../spark/sql/hive/client/HiveVersionSuite.scala   |  7 ++-----
 5 files changed, 8 insertions(+), 25 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
index f9946fe8e061..6990257044e0 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
@@ -52,12 +52,9 @@ private[hive] object IsolatedClientLoader extends Logging {
       config: Map[String, String] = Map.empty,
       ivyPath: Option[String] = None,
       sharedPrefixes: Seq[String] = Seq.empty,
-      barrierPrefixes: Seq[String] = Seq.empty,
-      sharesHadoopClasses: Boolean = true): IsolatedClientLoader = synchronized {
+      barrierPrefixes: Seq[String] = Seq.empty): IsolatedClientLoader = synchronized {
     val resolvedVersion = hiveVersion(hiveMetastoreVersion)
-    // We will first try to share Hadoop classes. If we cannot resolve the Hadoop artifact
-    // with the given version, we will use Hadoop 2.7 and then will not share Hadoop classes.
-    var _sharesHadoopClasses = sharesHadoopClasses
+    // We will use Hadoop 2.7 if we cannot resolve the Hadoop artifact.
     val files = if (resolvedVersions.contains((resolvedVersion, hadoopVersion))) {
       resolvedVersions((resolvedVersion, hadoopVersion))
     } else {
@@ -75,7 +72,6 @@ private[hive] object IsolatedClientLoader extends Logging {
             "again. Hadoop classes will not be shared between Spark and Hive metastore client. " +
             "It is recommended to set jars used by Hive metastore client through " +
             "spark.sql.hive.metastore.jars in the production environment.")
-          _sharesHadoopClasses = false
           (downloadVersion(
             resolvedVersion, fallbackVersion, ivyPath, remoteRepos), fallbackVersion)
         }
@@ -89,7 +85,6 @@ private[hive] object IsolatedClientLoader extends Logging {
       execJars = files,
       hadoopConf = hadoopConf,
       config = config,
-      sharesHadoopClasses = _sharesHadoopClasses,
       sharedPrefixes = sharedPrefixes,
       barrierPrefixes = barrierPrefixes)
   }
@@ -177,7 +172,6 @@ private[hive] object IsolatedClientLoader extends Logging {
  * @param config A set of options that will be added to the HiveConf of the constructed client.
  * @param isolationOn When true, custom versions of barrier classes will be constructed. Must be
  *                    true unless loading the version of hive that is on Spark's classloader.
- * @param sharesHadoopClasses When true, we will share Hadoop classes between Spark and
  * @param baseClassLoader The spark classloader that is used to load shared classes.
  */
 private[hive] class IsolatedClientLoader(
@@ -187,7 +181,6 @@ private[hive] class IsolatedClientLoader(
     val execJars: Seq[URL] = Seq.empty,
     val config: Map[String, String] = Map.empty,
     val isolationOn: Boolean = true,
-    val sharesHadoopClasses: Boolean = true,
     val baseClassLoader: ClassLoader = Thread.currentThread().getContextClassLoader,
     val sharedPrefixes: Seq[String] = Seq.empty,
     val barrierPrefixes: Seq[String] = Seq.empty)
@@ -204,7 +197,7 @@ private[hive] class IsolatedClientLoader(
       name.startsWith("org.apache.log4j") || // log4j1.x
       name.startsWith("org.apache.logging.log4j") || // log4j2
       name.startsWith("org.apache.spark.") ||
-      (sharesHadoopClasses && isHadoopClass) ||
+      isHadoopClass ||
       name.startsWith("scala.") ||
       (name.startsWith("com.google") && !name.startsWith("com.google.cloud")) ||
       name.startsWith("java.") ||
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala
index 65492abf38cc..8d55356da28e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala
@@ -49,8 +49,7 @@ class HadoopVersionInfoSuite extends SparkFunSuite {
       sparkConf = new SparkConf(),
       hadoopConf = hadoopConf,
       config = HiveClientBuilder.buildConf(Map.empty),
-      ivyPath = Some(ivyPath.getCanonicalPath),
-      sharesHadoopClasses = true)
+      ivyPath = Some(ivyPath.getCanonicalPath))
     val jars = client.classLoader.getParent.asInstanceOf[URLClassLoader].getURLs
       .map(u => new File(u.toURI))
     // Drop all Hadoop jars to use the existing Hadoop jars on the classpath
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientBuilder.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientBuilder.scala
index 2ad3afcb214b..f40b4f00d9fd 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientBuilder.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientBuilder.scala
@@ -46,15 +46,13 @@ private[client] object HiveClientBuilder {
   def buildClient(
       version: String,
       hadoopConf: Configuration,
-      extraConf: Map[String, String] = Map.empty,
-      sharesHadoopClasses: Boolean = true): HiveClient = {
+      extraConf: Map[String, String] = Map.empty): HiveClient = {
     IsolatedClientLoader.forVersion(
       hiveMetastoreVersion = version,
       hadoopVersion = VersionInfo.getVersion,
       sparkConf = new SparkConf(),
       hadoopConf = hadoopConf,
       config = buildConf(extraConf),
-      ivyPath = ivyPath,
-      sharesHadoopClasses = sharesHadoopClasses).createClient()
+      ivyPath = ivyPath).createClient()
   }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala
index 2d615f6fdc26..7e10d498d041 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala
@@ -272,10 +272,6 @@ class HivePartitionFilteringSuite(version: String)
       day1 :: day2 :: Nil)
   }
 
-  test("create client with sharesHadoopClasses = false") {
-    buildClient(new Configuration(), sharesHadoopClasses = false)
-  }
-
   private def testMetastorePartitionFiltering(
       filterExpr: Expression,
       expectedDs: Seq[Int],
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveVersionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveVersionSuite.scala
index dd58c302e019..02e9b7fb151f 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveVersionSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveVersionSuite.scala
@@ -28,9 +28,7 @@ private[client] abstract class HiveVersionSuite(version: String) extends SparkFu
   override protected val enableAutoThreadAudit = false
   protected var client: HiveClient = null
 
-  protected def buildClient(
-      hadoopConf: Configuration,
-      sharesHadoopClasses: Boolean = true): HiveClient = {
+  protected def buildClient(hadoopConf: Configuration): HiveClient = {
     // Hive changed the default of datanucleus.schema.autoCreateAll from true to false and
     // hive.metastore.schema.verification from false to true since 2.0
     // For details, see the JIRA HIVE-6113 and HIVE-12463
@@ -46,8 +44,7 @@ private[client] abstract class HiveVersionSuite(version: String) extends SparkFu
     HiveClientBuilder.buildClient(
       version,
       hadoopConf,
-      HiveUtils.formatTimeVarsForHiveClient(hadoopConf),
-      sharesHadoopClasses = sharesHadoopClasses)
+      HiveUtils.formatTimeVarsForHiveClient(hadoopConf))
   }
 
   override def suiteName: String = s"${super.suiteName}($version)"

From 0b035b2a9201a17d80582ff40b2b2b9cfff9b412 Mon Sep 17 00:00:00 2001
From: Chao Sun
Date: Mon, 9 Nov 2020 18:05:43 -0800
Subject: [PATCH 2/2] Address comments

---
 .../apache/spark/sql/hive/client/IsolatedClientLoader.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
index 6990257044e0..9663e03ee6a7 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
@@ -69,8 +69,7 @@ private[hive] object IsolatedClientLoader extends Logging {
           val fallbackVersion = "2.7.4"
           logWarning(s"Failed to resolve Hadoop artifacts for the version $hadoopVersion. We " +
             s"will change the hadoop version from $hadoopVersion to $fallbackVersion and try " +
-            "again. Hadoop classes will not be shared between Spark and Hive metastore client. " +
-            "It is recommended to set jars used by Hive metastore client through " +
+            "again. It is recommended to set jars used by Hive metastore client through " +
             "spark.sql.hive.metastore.jars in the production environment.")
           (downloadVersion(
             resolvedVersion, fallbackVersion, ivyPath, remoteRepos), fallbackVersion)
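Illustration (not part of the patch): with the sharesHadoopClasses flag gone, isSharedClass
no longer consults a switch, so Hadoop classes are always loaded from the shared (Spark)
classloader rather than the isolated Hive client classloader. The standalone Scala sketch
below mirrors the shape of the predicate touched by the diff above; the SharedClassCheck
object, its main method, and the abridged prefix list are hypothetical illustration under
that assumption, not Spark's actual code.

// Sketch: decide whether a class name should be loaded from the shared classloader
// (Spark, Hadoop, JDK, etc.) or from the isolated Hive client classloader.
object SharedClassCheck {
  def isSharedClass(name: String): Boolean = {
    // Hadoop classes are shared, except Hive's own org.apache.hadoop.hive.* packages.
    val isHadoopClass =
      name.startsWith("org.apache.hadoop.") && !name.startsWith("org.apache.hadoop.hive.")

    name.startsWith("org.slf4j") ||
      name.startsWith("org.apache.log4j") ||          // log4j 1.x
      name.startsWith("org.apache.logging.log4j") ||  // log4j 2
      name.startsWith("org.apache.spark.") ||
      isHadoopClass ||                                 // unconditional after this patch
      name.startsWith("scala.") ||
      (name.startsWith("com.google") && !name.startsWith("com.google.cloud")) ||
      name.startsWith("java.")
  }

  def main(args: Array[String]): Unit = {
    println(isSharedClass("org.apache.hadoop.fs.FileSystem"))       // true: shared with Spark
    println(isSharedClass("org.apache.hadoop.hive.conf.HiveConf"))  // false: isolated
  }
}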