diff --git a/clients/spark/build.sbt b/clients/spark/build.sbt
index a098f1ec857..e70e3a0f516 100644
--- a/clients/spark/build.sbt
+++ b/clients/spark/build.sbt
@@ -1,6 +1,6 @@
-lazy val projectVersion = "0.15.0"
+lazy val projectVersion = "0.15.1-support-emr-7.0.0"
 version := projectVersion
-lazy val hadoopVersion = "3.2.1"
+lazy val hadoopVersion = "3.3.6"
 ThisBuild / isSnapshot := false
 ThisBuild / scalaVersion := "2.12.12"
@@ -63,7 +63,7 @@ libraryDependencies ++= Seq(
   "com.azure" % "azure-storage-blob" % "12.9.0",
   "com.azure" % "azure-storage-blob-batch" % "12.7.0",
   "com.azure" % "azure-identity" % "1.2.0",
-  "com.amazonaws" % "aws-java-sdk-bundle" % "1.12.194" % "provided",
+  "com.amazonaws" % "aws-java-sdk-bundle" % "1.12.367" % "provided",
   "com.google.cloud.bigdataoss" % "gcs-connector" % "hadoop3-2.2.18",
   "com.google.cloud" % "google-cloud-storage" % "2.35.0",
   // Snappy is JNI :-(. However it does claim to work with
diff --git a/clients/spark/src/main/scala/io/treeverse/clients/LakeFSInputFormat.scala b/clients/spark/src/main/scala/io/treeverse/clients/LakeFSInputFormat.scala
index c8ad4f6ad96..5abbd10c792 100644
--- a/clients/spark/src/main/scala/io/treeverse/clients/LakeFSInputFormat.scala
+++ b/clients/spark/src/main/scala/io/treeverse/clients/LakeFSInputFormat.scala
@@ -99,6 +99,9 @@ class EntryRecordReader[Proto <: GeneratedMessage with scalapb.Message[Proto]](
 
     val gravelerSplit = split.asInstanceOf[GravelerSplit]
 
+    // Log the path before processing
+    logger.info(s"Processing file: ${gravelerSplit.path}")
+
     val fs = gravelerSplit.path.getFileSystem(context.getConfiguration)
     fs.copyToLocalFile(false, gravelerSplit.path, new Path(localFile.getAbsolutePath), true)
     // TODO(johnnyaug) should we cache this?
diff --git a/clients/spark/src/main/scala/io/treeverse/clients/StorageUtils.scala b/clients/spark/src/main/scala/io/treeverse/clients/StorageUtils.scala
index 209d520e9e0..cba540ff49d 100644
--- a/clients/spark/src/main/scala/io/treeverse/clients/StorageUtils.scala
+++ b/clients/spark/src/main/scala/io/treeverse/clients/StorageUtils.scala
@@ -1,6 +1,10 @@
 package io.treeverse.clients
 
-import com.amazonaws.auth.AWSCredentialsProvider
+import com.amazonaws.auth.{
+  AWSCredentialsProvider,
+  DefaultAWSCredentialsProviderChain,
+  STSAssumeRoleSessionCredentialsProvider
+}
 import com.amazonaws.client.builder.AwsClientBuilder
 import com.amazonaws.retry.PredefinedRetryPolicies.SDKDefaultRetryCondition
 import com.amazonaws.retry.RetryUtils
@@ -11,6 +15,7 @@ import org.slf4j.{Logger, LoggerFactory}
 
 import java.net.URI
 import java.util.concurrent.TimeUnit
+import java.util.UUID
 
 object StorageUtils {
   val StorageTypeS3 = "s3"
@@ -144,10 +149,34 @@
           builder.withRegion(region)
         else
           builder
-      val builderWithCredentials = credentialsProvider match {
-        case Some(cp) => builderWithEndpoint.withCredentials(cp)
-        case None => builderWithEndpoint
-      }
+
+      // Check for Hadoop's assumed role configuration
+      val roleArn = System.getProperty("spark.hadoop.fs.s3a.assumed.role.arn")
+
+      // Apply credentials based on configuration
+      val builderWithCredentials =
+        if (roleArn != null && !roleArn.isEmpty) {
+          // If we have a role ARN configured, assume that role
+          try {
+            val sessionName = "lakefs-gc-" + UUID.randomUUID().toString
+            val stsProvider =
+              new STSAssumeRoleSessionCredentialsProvider.Builder(roleArn, sessionName)
+                .withLongLivedCredentialsProvider(new DefaultAWSCredentialsProviderChain())
+                .build()
+
+            builderWithEndpoint.withCredentials(stsProvider)
+          } catch {
+            case e: Exception =>
+              logger.warn(s"Failed to assume role $roleArn, falling back to DefaultAWSCredentialsProviderChain", e)
+              builderWithEndpoint.withCredentials(new DefaultAWSCredentialsProviderChain())
+          }
+        } else {
+          // No role configured: keep the caller-supplied credentials provider when present
+          credentialsProvider match {
+            case Some(cp) => builderWithEndpoint.withCredentials(cp)
+            case None => builderWithEndpoint
+          }
+        }
 
       builderWithCredentials.build
     }
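
Reviewer note: the sketch below is not part of the patch; it isolates the credential-selection logic that the StorageUtils hunk introduces, assuming the AWS SDK v1 classes shipped in aws-java-sdk-bundle. The object name AssumedRoleCredentialsExample and the role ARN shown in the comments are placeholders, and unlike the patch, the fallback branch here builds a DefaultAWSCredentialsProviderChain directly instead of using the credentials provider passed in by the caller.

import com.amazonaws.auth.{
  AWSCredentialsProvider,
  DefaultAWSCredentialsProviderChain,
  STSAssumeRoleSessionCredentialsProvider
}
import java.util.UUID

// Standalone illustration (hypothetical object name) of the selection logic added above.
object AssumedRoleCredentialsExample {

  // Choose a credentials provider the same way the patched builder does:
  // a role ARN supplied as a JVM system property wins; otherwise fall back
  // to the default provider chain (env vars, profile, instance metadata, ...).
  def resolveCredentials(): AWSCredentialsProvider = {
    val roleArn = System.getProperty("spark.hadoop.fs.s3a.assumed.role.arn")
    if (roleArn != null && !roleArn.isEmpty) {
      val sessionName = "lakefs-gc-" + UUID.randomUUID().toString
      new STSAssumeRoleSessionCredentialsProvider.Builder(roleArn, sessionName)
        .withLongLivedCredentialsProvider(new DefaultAWSCredentialsProviderChain())
        .build()
    } else {
      new DefaultAWSCredentialsProviderChain()
    }
  }

  def main(args: Array[String]): Unit = {
    // Example JVM option (placeholder ARN):
    //   -Dspark.hadoop.fs.s3a.assumed.role.arn=arn:aws:iam::123456789012:role/lakefs-gc
    val provider = resolveCredentials()
    println(s"Selected provider: ${provider.getClass.getSimpleName}")
  }
}

One point worth confirming in review: the patch reads the ARN with System.getProperty, so it only takes effect where spark.hadoop.fs.s3a.assumed.role.arn is actually present as a JVM system property (for example via driver/executor extra Java options or spark-submit conf propagation), not merely set in the Hadoop Configuration; the try/catch keeps the job running on the default chain if role assumption fails.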