-
Notifications
You must be signed in to change notification settings - Fork 8
Implement EMR submitter #439
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
b9e2bed
cfd2c7e
aedf7e8
8b6f853
5e65b01
4907271
d698de8
24f96d5
9219d42
a36db4a
0640477
236cb7c
457e81a
7922adf
cab5877
f0a8fb5
057ccc3
cb0ea83
dbbbcaa
0049d0d
f514245
ba46125
2fdff7b
a791589
0a1f8b7
1977ce3
006645c
f677231
2caa711
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,244 @@ | ||
| package ai.chronon.integrations.aws | ||
|
|
||
| import ai.chronon.integrations.aws.EmrSubmitter.DefaultClusterIdleTimeout | ||
| import ai.chronon.integrations.aws.EmrSubmitter.DefaultClusterInstanceCount | ||
| import ai.chronon.integrations.aws.EmrSubmitter.DefaultClusterInstanceType | ||
| import ai.chronon.spark.JobSubmitter | ||
| import ai.chronon.spark.JobSubmitterConstants._ | ||
| import ai.chronon.spark.JobType | ||
| import ai.chronon.spark.SparkJob | ||
| import ai.chronon.spark.{SparkJob => TypeSparkJob} | ||
| import software.amazon.awssdk.services.emr.EmrClient | ||
| import software.amazon.awssdk.services.emr.model.Application | ||
| import software.amazon.awssdk.services.emr.model.AutoTerminationPolicy | ||
| import software.amazon.awssdk.services.emr.model.BootstrapActionConfig | ||
| import software.amazon.awssdk.services.emr.model.CancelStepsRequest | ||
| import software.amazon.awssdk.services.emr.model.Configuration | ||
| import software.amazon.awssdk.services.emr.model.DescribeStepRequest | ||
| import software.amazon.awssdk.services.emr.model.HadoopJarStepConfig | ||
| import software.amazon.awssdk.services.emr.model.JobFlowInstancesConfig | ||
| import software.amazon.awssdk.services.emr.model.RunJobFlowRequest | ||
| import software.amazon.awssdk.services.emr.model.ScriptBootstrapActionConfig | ||
| import software.amazon.awssdk.services.emr.model.StepConfig | ||
| import scala.collection.JavaConverters._ | ||
|
|
||
/** Submits Zipline jobs to AWS EMR by creating a fresh cluster per job.
  *
  * @param customerId lowercase Zipline customer id; used to resolve per-customer
  *                   infra (subnet, security group, buckets, IAM roles)
  * @param emrClient  AWS SDK v2 EMR client used for all control-plane calls
  */
class EmrSubmitter(customerId: String, emrClient: EmrClient) extends JobSubmitter {

  // Applications installed on every EMR cluster this submitter creates.
  private val ClusterApplications = List(
    "Flink",
    "Zeppelin",
    "JupyterEnterpriseGateway",
    "Hive",
    "Hadoop",
    "Livy",
    "Spark"
  )

  // Intentionally hardcoded so users can't pick an EMR release with incompatible
  // component versions. NOTE(review): emr-7.2.0 ships Flink 1.20.0, which differs
  // from the Flink version on our GCP clusters — confirm jar compatibility.
  private val EmrReleaseLabel = "emr-7.2.0"

  // Customer specific infra configurations.
  // Hack: subnet and security-group ids are hardcoded per Zipline customer. They are
  // created by Terraform, so we need to be careful they don't get accidentally destroyed.
  private val CustomerToSubnetIdMap = Map(
    "canary" -> "subnet-085b2af531b50db44"
  )
  private val CustomerToSecurityGroupIdMap = Map(
    "canary" -> "sg-04fb79b5932a41298"
  )

  // Bootstrap script, staged in the customer's artifacts bucket, that copies job
  // files from S3 into /mnt/zipline/ on each node.
  private val CopyS3FilesToMntScript = "copy_s3_files.sh"

  // Looks up a per-customer infra id, failing fast with a descriptive error.
  private def requiredInfraId(map: Map[String, String], what: String): String =
    map.getOrElse(customerId, throw new RuntimeException(s"No $what found for $customerId"))

  // Applies the cluster-creation (infra) settings shared by every job for this
  // customer and returns the same (mutable) builder for chaining.
  private def configureCluster(builder: RunJobFlowRequest.Builder,
                               jobProperties: Map[String, String]): RunJobFlowRequest.Builder = {
    val customerSecurityGroupId = requiredInfraId(CustomerToSecurityGroupIdMap, "security group id")
    builder
      .autoTerminationPolicy(
        AutoTerminationPolicy
          .builder()
          .idleTimeout(jobProperties.getOrElse(ClusterIdleTimeout, s"$DefaultClusterIdleTimeout").toLong)
          .build())
      .configurations(
        // Use the AWS Glue Data Catalog as the Hive metastore.
        Configuration.builder
          .classification("spark-hive-site")
          .properties(Map(
            "hive.metastore.client.factory.class" -> "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory").asJava)
          .build()
      )
      .applications(ClusterApplications.map(app => Application.builder().name(app).build()): _*)
      // TODO: Could make this generalizable. Have logs saved where users want it
      .logUri(s"s3://zipline-warehouse-${customerId}/emr/")
      .instances(
        JobFlowInstancesConfig
          .builder()
          // We may want to make master and slave instance types different in the future
          .masterInstanceType(jobProperties.getOrElse(ClusterInstanceType, DefaultClusterInstanceType))
          .slaveInstanceType(jobProperties.getOrElse(ClusterInstanceType, DefaultClusterInstanceType))
          // Hack: We hardcode the subnet ID and sg id for each customer of Zipline. The subnet gets
          // created from Terraform so we'll need to be careful that these don't get accidentally destroyed.
          .ec2SubnetId(requiredInfraId(CustomerToSubnetIdMap, "subnet id"))
          .emrManagedMasterSecurityGroup(customerSecurityGroupId)
          .emrManagedSlaveSecurityGroup(customerSecurityGroupId)
          .instanceCount(jobProperties.getOrElse(ClusterInstanceCount, DefaultClusterInstanceCount).toInt)
          .keepJobFlowAliveWhenNoSteps(true) // Keep the cluster alive after the job is done
          .build())
      // TODO: need to double check that this is how we want our role names to be
      .serviceRole(s"zipline_${customerId}_emr_service_role")
      .jobFlowRole(s"zipline_${customerId}_emr_profile")
      .releaseLabel(EmrReleaseLabel)
  }

  // Builds the single EMR step that runs the job. Only Spark jobs are supported so
  // far; the spark-submit command is assembled inside the SparkJob branch so that an
  // unsupported job type fails with IllegalArgumentException rather than a confusing
  // NoSuchElementException from a missing MainClass/JarURI property.
  private def buildStepConfig(jobType: JobType, jobProperties: Map[String, String], args: Seq[String]): StepConfig = {
    val jarStep = jobType match {
      case SparkJob =>
        // For EMR, we explicitly spark-submit the job using command-runner.jar from AWS:
        // https://docs.aws.amazon.com/en_us/emr/latest/ReleaseGuide/emr-spark-submit-step.html
        val sparkSubmitArgs =
          Seq("spark-submit",
              "--class",
              jobProperties.getOrElse(MainClass, throw new RuntimeException(s"$MainClass not set")),
              jobProperties.getOrElse(JarURI, throw new RuntimeException(s"$JarURI not set"))) ++ args
        HadoopJarStepConfig
          .builder()
          .jar("command-runner.jar")
          .args(sparkSubmitArgs: _*)
          .build()
      // TODO: add flink
      case _ => throw new IllegalArgumentException("Unsupported job type")
    }
    StepConfig
      .builder()
      .name("Zipline Job")
      .actionOnFailure("CANCEL_AND_WAIT") // want the cluster to not terminate if the step fails
      .hadoopJarStep(jarStep)
      .build()
  }

  // Bootstrap action that copies the given S3 files into /mnt/zipline/ on every node
  // before any step runs.
  private def buildBootstrapAction(files: List[String]): BootstrapActionConfig = {
    val artifactsBucket = s"s3://zipline-artifacts-${customerId}/"
    BootstrapActionConfig
      .builder()
      .name("EMR Submitter: Copy S3 Files")
      .scriptBootstrapAction(
        ScriptBootstrapActionConfig
          .builder()
          .path(artifactsBucket + CopyS3FilesToMntScript)
          .args(files: _*)
          .build())
      .build()
  }

  /** Creates a new EMR cluster and submits the job as its single step.
    *
    * @param jobType       kind of job to run (only [[SparkJob]] is supported today)
    * @param jobProperties main class, jar URI, and optional cluster-shape overrides
    * @param files         S3 files to copy onto each node via a bootstrap action
    * @param args          user arguments appended to the spark-submit command
    * @return the EMR job flow (cluster) id
    */
  override def submit(jobType: JobType,
                      jobProperties: Map[String, String],
                      files: List[String],
                      args: String*): String = {

    val runJobFlowRequestBuilder = configureCluster(
      RunJobFlowRequest
        .builder()
        .name(s"job-${java.util.UUID.randomUUID.toString}"),
      jobProperties
    )

    // Add single step (spark job) to run:
    runJobFlowRequestBuilder.steps(buildStepConfig(jobType, jobProperties, args))

    // Add bootstrap actions if any
    if (files.nonEmpty) {
      runJobFlowRequestBuilder.bootstrapActions(buildBootstrapAction(files))
    }

    val jobFlowResponse = emrClient.runJobFlow(runJobFlowRequestBuilder.build())
    jobFlowResponse.jobFlowId()
  }

  // NOTE(review): "job id" is overloaded here. submit() returns the job *flow*
  // (cluster) id, but DescribeStepRequest / CancelStepsRequest operate on *step* ids
  // and additionally expect a cluster id, which is never supplied. These calls likely
  // fail as written — TODO: track (clusterId, stepId) pairs and pass both.
  override def status(jobId: String): Unit = {
    val describeStepResponse = emrClient.describeStep(DescribeStepRequest.builder().stepId(jobId).build())
    val status = describeStepResponse.step().status()
    println(status)
  }

  override def kill(jobId: String): Unit = {
    emrClient.cancelSteps(CancelStepsRequest.builder().stepIds(jobId).build())
  }
}
|
|
||
object EmrSubmitter {
  /** Builds a submitter for the customer named by the CUSTOMER_ID env var. */
  def apply(): EmrSubmitter = {
    val customerId = sys.env.getOrElse("CUSTOMER_ID", throw new Exception("CUSTOMER_ID not set")).toLowerCase

    new EmrSubmitter(customerId,
                     EmrClient
                       .builder()
                       .build())
  }

  // CLI keywords for cluster-shape overrides (the remaining keywords come from
  // JobSubmitterConstants and are shared across submitters).
  private val ClusterInstanceTypeArgKeyword = "--cluster-instance-type"
  private val ClusterInstanceCountArgKeyword = "--cluster-instance-count"
  private val ClusterIdleTimeoutArgKeyword = "--cluster-idle-timeout"

  private val DefaultClusterInstanceType = "m5.xlarge"
  private val DefaultClusterInstanceCount = "3"
  private val DefaultClusterIdleTimeout = 60 * 60 * 24 * 2 // 2 days in seconds

  // Extracts the value of a `--keyword=value` argument.
  // Splits on the first '=' only, so values may themselves contain '=' (the previous
  // `split("=")(1)` truncated such values and crashed on a bare `--keyword`).
  private def argValue(args: Array[String], keyword: String): Option[String] =
    args.find(_.startsWith(keyword)).map { arg =>
      arg.split("=", 2) match {
        case Array(_, value) => value
        case _               => throw new IllegalArgumentException(s"Expected $keyword=<value> but got: $arg")
      }
    }

  // TODO: follow up PR to use the spark arg parser instead of hand-rolling parsing here.
  def main(args: Array[String]): Unit = {

    // List of args that are not application args
    val internalArgs = Set(
      JarUriArgKeyword,
      JobTypeArgKeyword,
      MainClassKeyword,
      FlinkMainJarUriArgKeyword,
      FlinkSavepointUriArgKeyword,
      ClusterInstanceTypeArgKeyword,
      ClusterInstanceCountArgKeyword,
      ClusterIdleTimeoutArgKeyword
    )

    // Everything that isn't an internal flag is forwarded to the job untouched.
    val userArgs = args.filter(arg => !internalArgs.exists(arg.startsWith))

    val jarUri = argValue(args, JarUriArgKeyword).getOrElse(throw new Exception("Jar URI not found"))
    val mainClass = argValue(args, MainClassKeyword).getOrElse(throw new Exception("Main class not found"))
    val jobTypeValue = argValue(args, JobTypeArgKeyword).getOrElse(throw new Exception("Job type not found"))
    val clusterInstanceType = argValue(args, ClusterInstanceTypeArgKeyword).getOrElse(DefaultClusterInstanceType)
    val clusterInstanceCount = argValue(args, ClusterInstanceCountArgKeyword).getOrElse(DefaultClusterInstanceCount)
    val clusterIdleTimeout = argValue(args, ClusterIdleTimeoutArgKeyword).getOrElse(DefaultClusterIdleTimeout.toString)

    val (jobType, jobProps) = jobTypeValue.toLowerCase match {
      case "spark" =>
        val baseProps = Map(
          MainClass -> mainClass,
          JarURI -> jarUri,
          ClusterInstanceType -> clusterInstanceType,
          ClusterInstanceCount -> clusterInstanceCount,
          ClusterIdleTimeout -> clusterIdleTimeout
        )
        (TypeSparkJob, baseProps)
      // TODO: add flink
      case _ => throw new Exception("Invalid job type")
    }

    val finalArgs = userArgs

    val emrSubmitter = EmrSubmitter()
    val jobId = emrSubmitter.submit(
      jobType,
      jobProps,
      List.empty,
      finalArgs: _*
    )

    println("EMR job id: " + jobId)
    println(s"Safe to exit. Follow the job status at: https://console.aws.amazon.com/emr/home#/clusterDetails/$jobId")
  }
}
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,8 @@ | ||||||||||||||
#!/bin/bash
# EMR bootstrap action: copies each S3 file passed as an argument into /mnt/zipline/
# on the node, so job steps can read them from local disk.
set -euxo pipefail
pwd

# Loop through all provided arguments (files). Copies files from S3 to /mnt/zipline/
for s3_file in "$@"; do
    # Quote the path so keys containing spaces or glob characters aren't word-split.
    aws s3 cp "$s3_file" /mnt/zipline/
done
|
||||||||||||||
| for s3_file in "$@"; do | |
| aws s3 cp $s3_file /mnt/zipline/ | |
| done | |
| for s3_file in "$@"; do | |
| aws s3 cp "$s3_file" /mnt/zipline/ | |
| done |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Something to flag — with EMR 7.2.0 we'll be on Flink 1.20.0, which is different from our GCP Flink version. So we'll need to either build jars for both 1.17 and 1.20, downgrade EMR, or install Flink 1.20 manually on our GCP clusters.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for flagging. I'm leaning towards keeping the EMR release label hardcoded and set for users. We don't want users to accidentally set the wrong EMR release and run into weird issues.