-
Notifications
You must be signed in to change notification settings - Fork 29.3k
[SPARK-32852][SQL] spark.sql.hive.metastore.jars support HDFS location #29881
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
834c2a7
bf3c4b6
2cfd3e0
1063a08
6eaafea
6214ebf
e82ed52
f2869e9
c200d9b
e3395f5
0254297
c6475ee
e50050b
9278404
9ee1a86
4016327
cf0f846
b5241c8
6fbe082
b79ab0d
f1a4085
d631a75
efc5ae0
6e67c7b
ea9ef2b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,16 +25,18 @@ import java.util.concurrent.TimeUnit | |
| import scala.collection.JavaConverters._ | ||
| import scala.collection.mutable.HashMap | ||
| import scala.language.implicitConversions | ||
| import scala.util.control.NonFatal | ||
|
|
||
| import org.apache.commons.lang3.{JavaVersion, SystemUtils} | ||
| import org.apache.hadoop.conf.Configuration | ||
| import org.apache.hadoop.fs.Path | ||
| import org.apache.hadoop.hive.conf.HiveConf | ||
| import org.apache.hadoop.hive.conf.HiveConf.ConfVars | ||
| import org.apache.hadoop.hive.ql.session.SessionState | ||
| import org.apache.hadoop.util.VersionInfo | ||
| import org.apache.hive.common.util.HiveVersionInfo | ||
|
|
||
| import org.apache.spark.{SparkConf, SparkContext} | ||
| import org.apache.spark.{SparkConf, SparkContext, SparkEnv, SparkFiles} | ||
| import org.apache.spark.deploy.SparkHadoopUtil | ||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.sql._ | ||
|
|
@@ -88,12 +90,20 @@ private[spark] object HiveUtils extends Logging { | |
| | <code>${builtinHiveVersion}</code> or not defined. | ||
| | 2. "maven" | ||
| | Use Hive jars of specified version downloaded from Maven repositories. | ||
| | 3. A classpath in the standard format for both Hive and Hadoop. | ||
| | 3. "path" | ||
| | A classpath configured by `spark.sql.hive.metastore.jars.path` in the standard format | ||
| | for both Hive and Hadoop. | ||
| """.stripMargin) | ||
| .version("1.4.0") | ||
| .stringConf | ||
| .createWithDefault("builtin") | ||
|
|
||
| // Static conf holding the Hive jar locations used when HIVE_METASTORE_JARS is set to `path`. | ||
| // The value is read as a sequence of strings and defaults to an empty list. | ||
| val HIVE_METASTORE_JARS_PATH = buildStaticConf("spark.sql.hive.metastore.jars.path") | ||
| .doc(s"When ${HIVE_METASTORE_JARS.key} is set as `path`, use Hive jars configured by this") | ||
| .stringConf | ||
| .toSequence | ||
| .createWithDefault(Nil) | ||
|
|
||
| val CONVERT_METASTORE_PARQUET = buildConf("spark.sql.hive.convertMetastoreParquet") | ||
| .doc("When set to true, the built-in Parquet reader and writer are used to process " + | ||
| "parquet tables created by using the HiveQL syntax, instead of Hive serde.") | ||
|
|
@@ -396,23 +406,95 @@ private[spark] object HiveUtils extends Logging { | |
| config = configurations, | ||
| barrierPrefixes = hiveMetastoreBarrierPrefixes, | ||
| sharedPrefixes = hiveMetastoreSharedPrefixes) | ||
| } else { | ||
| // Convert to files and expand any directories. | ||
| val jars = | ||
| hiveMetastoreJars | ||
| .split(File.pathSeparator) | ||
| .flatMap { | ||
| case path if new File(path).getName == "*" => | ||
| val files = new File(path).getParentFile.listFiles() | ||
| if (files == null) { | ||
| logWarning(s"Hive jar path '$path' does not exist.") | ||
| } else if (hiveMetastoreJars == "path") { | ||
|
|
||
| val hiveMetastoreJarsPath: Seq[String] = conf.get(HiveUtils.HIVE_METASTORE_JARS_PATH) | ||
|
|
||
| // Resolves one local classpath entry to jar files. An entry whose last path component is | ||
| // "*" expands to every file ending in ".jar" (case-insensitive) directly inside its parent | ||
| // directory; any other entry is returned unchanged as a single-element sequence. | ||
| def addLocalHiveJars(file: File): Seq[File] = { | ||
| if (file.getName == "*") { | ||
| // getParentFile is null for a bare "*" and listFiles is null when the directory cannot | ||
| // be listed, so wrap both in Option instead of dereferencing them directly. | ||
| Option(file.getParentFile).flatMap(dir => Option(dir.listFiles())) match { | ||
| case Some(files) => | ||
| files.filter(_.getName.toLowerCase(Locale.ROOT).endsWith(".jar")).toSeq | ||
| case None => | ||
| logWarning(s"Hive jar path '${file.getPath}' does not exist.") | ||
| Nil | ||
| } | ||
| } else { | ||
| file :: Nil | ||
| } | ||
| } | ||
|
|
||
| // Resolves a Hive jar path through the Hadoop FileSystem API and fetches each match into | ||
| // SparkFiles.getRootDirectory() with Utils.fetchFile. Returns Nil after logging when the | ||
| // path is missing, is of the wrong kind, or a non-fatal error is thrown. | ||
| def checkRemoteHiveJars(path: String): Seq[File] = { | ||
| try { | ||
| val hadoopPath = new Path(path) | ||
| val fs = hadoopPath.getFileSystem(hadoopConf) | ||
| // A trailing "*" component means "everything listed under the parent directory". | ||
| if (hadoopPath.getName == "*") { | ||
| val parent = hadoopPath.getParent | ||
| if (!fs.exists(parent)) { | ||
| logWarning(s"Hive Jar ${path} does not exist.") | ||
| Nil | ||
| } else if (!fs.getFileStatus(parent).isDirectory) { | ||
| logWarning(s"Hive Jar ${parent} is not a directory.") | ||
| Nil | ||
| } else { | ||
| // Every listed entry is fetched; no ".jar" suffix filter is applied in this branch. | ||
| fs.listStatus(parent).map(file => | ||
| Utils.fetchFile(file.getPath.toUri.toString, | ||
| new File(SparkFiles.getRootDirectory()), conf, | ||
| SparkEnv.get.securityManager, hadoopConf, | ||
| System.currentTimeMillis(), useCache = false) | ||
| ) | ||
| } | ||
| } else { | ||
| // Without a wildcard the path must name an existing, non-directory file. | ||
| if (!fs.exists(hadoopPath)) { | ||
| logWarning(s"Hive Jar ${path} does not exist.") | ||
| Nil | ||
| } else if (fs.getFileStatus(hadoopPath).isDirectory) { | ||
| logWarning(s"Hive Jar ${path} not allow directory without `*`") | ||
| Nil | ||
| } else { | ||
| // NOTE(review): the next row references `files`, which is not defined in this method; | ||
| // it looks like a removed-side row of the diff rather than live code - confirm. | ||
| files.filter(_.getName.toLowerCase(Locale.ROOT).endsWith(".jar")).toSeq | ||
| // tar/tar.gz archives are not supported: their final path is not known at this point. | ||
| Utils.fetchFile(hadoopPath.toUri.toString, | ||
| new File(SparkFiles.getRootDirectory()), conf, | ||
| SparkEnv.get.securityManager, hadoopConf, | ||
| System.currentTimeMillis(), useCache = false) :: Nil | ||
| } | ||
| // NOTE(review): the following `case path =>` rows also look like removed-side diff rows | ||
| // (there is no enclosing match here) - confirm against the actual file. | ||
| case path => | ||
| new File(path) :: Nil | ||
| } | ||
| } catch { | ||
| // Only non-fatal failures are logged and turned into an empty result. | ||
| case NonFatal(e) => | ||
| logError(s"Failed to find $path to Hive Jars", e) | ||
| Nil | ||
| } | ||
| } | ||
|
|
||
| // Convert to files and expand any directories. | ||
| val jars = | ||
| hiveMetastoreJarsPath | ||
| .flatMap { | ||
| case path if path.contains("\\") => | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I could not follow this. Do you want to check if this is a Windows path? If that's the case, you can use
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
All right, I will raise a pr to change this in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Here need use condition |
||
| addLocalHiveJars(new File(path)) | ||
| case path => | ||
| val uri = new Path(path).toUri | ||
| uri.getScheme match { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand why we need to check the scheme and do things differently. Can you point to other places in Spark that do similar things to support this PR?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
In datasource file index, path is fully qualified URL to indicate other file systems.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about the current change? It makes the logic simpler. |
||
| case null | "file" => | ||
| addLocalHiveJars(new File(uri.getPath)) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Assuming
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Add comment in doc of conf
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this comment on the option "path" instead of "classpath"?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't get your point; can you make it clearer? |
||
| case "local" => | ||
| // A "local:" URI names a file that already exists on this machine, so only its path | ||
| // component is a valid java.io.File name; a "file:" prefix would make a relative path. | ||
| new File(uri.getPath) :: Nil | ||
| case "http" | "https" | "ftp" => | ||
| try { | ||
| // Validate the URI as a URL and download it into the SparkFiles root directory. | ||
| Utils.fetchFile(uri.toURL.toString, | ||
| new File(SparkFiles.getRootDirectory()), conf, | ||
| SparkEnv.get.securityManager, hadoopConf, | ||
| System.currentTimeMillis(), useCache = false) :: Nil | ||
| } catch { | ||
| // Catch only non-fatal errors so OutOfMemoryError, InterruptedException, etc. propagate, | ||
| // and keep the cause in the log instead of discarding it. | ||
| case NonFatal(e) => | ||
| logWarning(s"Hive Jars URI (${uri.toString}) is not a valid URL.", e) | ||
| Nil | ||
| } | ||
| case _ => | ||
| checkRemoteHiveJars(path) | ||
| } | ||
| } | ||
| .map(_.toURI.toURL) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This line can be removed, as
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Done |
||
|
|
||
| logInfo( | ||
|
|
@@ -427,6 +509,9 @@ private[spark] object HiveUtils extends Logging { | |
| isolationOn = true, | ||
| barrierPrefixes = hiveMetastoreBarrierPrefixes, | ||
| sharedPrefixes = hiveMetastoreSharedPrefixes) | ||
| } else { | ||
| // No supported mode matched: fail fast and name the accepted values. | ||
| throw new IllegalArgumentException(s"Please set ${HIVE_METASTORE_JARS.key} correctly using" + | ||
| s" ${Seq("builtin", "maven", "path").mkString(", ")}.") | ||
| } | ||
| isolatedLoader.createClient() | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we use
DataSource.checkAndGlobPathIfNecessary?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yea, I will test this.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@sunchao Following this suggestion, we can now support nested HDFS path wildcards such as hdfs://xx/xx/*/*