-
Notifications
You must be signed in to change notification settings - Fork 749
[GOBBLIN-1840] Helix Job scheduler should not try to replace running workflow if within configured time #3704
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
29f5c2d
45c87ff
df182ff
3c552b7
88b9a02
d75e522
4ddd7c5
0e7ba4d
c449a71
3160d8b
ae2b58c
e6ea195
6e8358c
a15afd8
bbe4a0b
701881e
cfdc115
b598455
32ea7ed
2496212
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,10 @@ | |
| package org.apache.gobblin.cluster; | ||
|
|
||
| import java.io.IOException; | ||
| import java.time.Clock; | ||
| import java.time.Duration; | ||
| import java.time.Instant; | ||
| import java.time.temporal.ChronoUnit; | ||
| import java.util.Collection; | ||
| import java.util.Collections; | ||
| import java.util.List; | ||
|
|
@@ -110,15 +114,19 @@ public class GobblinHelixJobScheduler extends JobScheduler implements StandardMe | |
|
|
||
| private boolean startServicesCompleted; | ||
| private final long helixJobStopTimeoutMillis; | ||
| private final Duration jobSchedulingThrottleTimeout; | ||
| private ConcurrentHashMap<String, Instant> jobNameToNextSchedulableTime; | ||
| private boolean isThrottleEnabled; | ||
| private Clock clock; | ||
|
|
||
| public GobblinHelixJobScheduler(Config sysConfig, | ||
| HelixManager jobHelixManager, | ||
| Optional<HelixManager> taskDriverHelixManager, | ||
| EventBus eventBus, | ||
| Path appWorkDir, List<? extends Tag<?>> metadataTags, | ||
| SchedulerService schedulerService, | ||
| MutableJobCatalog jobCatalog) throws Exception { | ||
|
|
||
| MutableJobCatalog jobCatalog, | ||
| Clock clock) throws Exception { | ||
| super(ConfigUtils.configToProperties(sysConfig), schedulerService); | ||
| this.commonJobProperties = ConfigUtils.configToProperties(ConfigUtils.getConfigOrEmpty(sysConfig, COMMON_JOB_PROPS)); | ||
| this.jobHelixManager = jobHelixManager; | ||
|
|
@@ -162,6 +170,27 @@ public GobblinHelixJobScheduler(Config sysConfig, | |
| this.helixWorkflowListingTimeoutMillis = ConfigUtils.getLong(sysConfig, GobblinClusterConfigurationKeys.HELIX_WORKFLOW_LISTING_TIMEOUT_SECONDS, | ||
| GobblinClusterConfigurationKeys.DEFAULT_HELIX_WORKFLOW_LISTING_TIMEOUT_SECONDS) * 1000; | ||
|
|
||
| this.jobSchedulingThrottleTimeout = Duration.of(ConfigUtils.getLong(sysConfig, GobblinClusterConfigurationKeys.HELIX_JOB_SCHEDULING_THROTTLE_TIMEOUT_SECONDS_KEY, | ||
| GobblinClusterConfigurationKeys.DEFAULT_HELIX_JOB_SCHEDULING_THROTTLE_TIMEOUT_SECONDS_KEY), ChronoUnit.SECONDS); | ||
|
|
||
| this.jobNameToNextSchedulableTime = new ConcurrentHashMap<>(); | ||
|
|
||
| this.isThrottleEnabled = ConfigUtils.getBoolean(sysConfig, GobblinClusterConfigurationKeys.HELIX_JOB_SCHEDULING_THROTTLE_ENABLED_KEY, | ||
| GobblinClusterConfigurationKeys.DEFAULT_HELIX_JOB_SCHEDULING_THROTTLE_ENABLED_KEY); | ||
|
|
||
| this.clock = clock; | ||
| } | ||
|
|
||
| public GobblinHelixJobScheduler(Config sysConfig, | ||
| HelixManager jobHelixManager, | ||
| Optional<HelixManager> taskDriverHelixManager, | ||
| EventBus eventBus, | ||
| Path appWorkDir, List<? extends Tag<?>> metadataTags, | ||
| SchedulerService schedulerService, | ||
| MutableJobCatalog jobCatalog) throws Exception { | ||
|
|
||
| this(sysConfig, jobHelixManager, taskDriverHelixManager, eventBus, appWorkDir, metadataTags, | ||
| schedulerService, jobCatalog, Clock.systemUTC()); | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -303,7 +332,7 @@ public Object get(long timeout, TimeUnit unit) throws InterruptedException, Exec | |
| } | ||
|
|
||
| @Subscribe | ||
| public void handleNewJobConfigArrival(NewJobConfigArrivalEvent newJobArrival) { | ||
| public synchronized void handleNewJobConfigArrival(NewJobConfigArrivalEvent newJobArrival) { | ||
|
Peiyingy marked this conversation as resolved.
|
||
| String jobUri = newJobArrival.getJobName(); | ||
| LOGGER.info("Received new job configuration of job " + jobUri); | ||
| try { | ||
|
|
@@ -315,24 +344,39 @@ public void handleNewJobConfigArrival(NewJobConfigArrivalEvent newJobArrival) { | |
| jobProps.setProperty(GobblinClusterConfigurationKeys.JOB_SPEC_URI, jobUri); | ||
|
|
||
| this.jobSchedulerMetrics.updateTimeBeforeJobScheduling(jobProps); | ||
|
|
||
| GobblinHelixJobLauncherListener listener = isThrottleEnabled ? | ||
| new GobblinThrottlingHelixJobLauncherListener(this.launcherMetrics, jobNameToNextSchedulableTime, | ||
| jobSchedulingThrottleTimeout, clock) | ||
| : new GobblinHelixJobLauncherListener(this.launcherMetrics); | ||
| if (jobProps.containsKey(ConfigurationKeys.JOB_SCHEDULE_KEY)) { | ||
| LOGGER.info("Scheduling job " + jobUri); | ||
| scheduleJob(jobProps, | ||
| new GobblinHelixJobLauncherListener(this.launcherMetrics)); | ||
| listener); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Does not need to be on a new line |
||
| } else { | ||
| LOGGER.info("No job schedule found, so running job " + jobUri); | ||
| LOGGER.info("No job schedule" | ||
| + " found, so running job " + jobUri); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Does not need to be on a new line |
||
| this.jobExecutor.execute(new NonScheduledJobRunner(jobProps, | ||
| new GobblinHelixJobLauncherListener(this.launcherMetrics))); | ||
| listener)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Does not need to be on a new line |
||
| } | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Does not need a new line |
||
| } catch (JobException je) { | ||
| LOGGER.error("Failed to schedule or run job " + jobUri, je); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Update this log to say that you are resetting the clock |
||
| } | ||
| } | ||
|
|
||
| @Subscribe | ||
| public void handleUpdateJobConfigArrival(UpdateJobConfigArrivalEvent updateJobArrival) { | ||
| public synchronized void handleUpdateJobConfigArrival(UpdateJobConfigArrivalEvent updateJobArrival) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @homatthew are we sure this change won't affect performance when those message-handling methods are called frequently? (That's why I initially suggested having a job-level lock.)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Summary of offline discussion:
Since the only blocking operation in the critical section is the delete operation, and there are infrequent deletes (usually this takes seconds to complete), we can go ahead with the change and add fine-grained locking in the future if necessary |
||
| LOGGER.info("Received update for job configuration of job " + updateJobArrival.getJobName()); | ||
| String jobName = updateJobArrival.getJobName(); | ||
|
|
||
| if (this.isThrottleEnabled && | ||
| this.jobNameToNextSchedulableTime.getOrDefault(jobName, Instant.ofEpochMilli(0)).isAfter(clock.instant())) { | ||
|
Contributor
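There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: This line is a bit dense. And to indicate the beginning of time, the documentation for `Instant` provides the constant `Instant.EPOCH`. Also, intuitively it feels a little weird to read as "nextSchedulableTime is after current time". I feel it's more intuitive for it to be "current time is before nextSchedulableTime", i.e. `clock.instant().isBefore(nextSchedulableTime)`, or IMO even more readable [second code snippet lost in extraction]. [Editorial note: the inline code in this comment was dropped by the page extraction; the identifiers shown are reconstructed from the surrounding diff.] |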
||
| LOGGER.info("Replanning is skipped for job {}. Current time is " | ||
| + clock.instant() + " and the next schedulable time would be " | ||
|
Contributor
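There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. clock.instant() should be using the `{}` placeholder instead of string concatenation. [Editorial note: the placeholder token was lost in extraction; reconstructed from the parameterized log call in the diff, which already passes jobName that way.] |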
||
| + this.jobNameToNextSchedulableTime.getOrDefault(jobName, Instant.ofEpochMilli(0)), jobName); | ||
| return; | ||
| } | ||
|
|
||
| try { | ||
| handleDeleteJobConfigArrival(new DeleteJobConfigArrivalEvent(updateJobArrival.getJobName(), | ||
| updateJobArrival.getJobConfig())); | ||
|
|
@@ -360,7 +404,7 @@ private void waitForJobCompletion(String jobName) { | |
| } | ||
|
|
||
| @Subscribe | ||
| public void handleDeleteJobConfigArrival(DeleteJobConfigArrivalEvent deleteJobArrival) throws InterruptedException { | ||
| public synchronized void handleDeleteJobConfigArrival(DeleteJobConfigArrivalEvent deleteJobArrival) throws InterruptedException { | ||
|
Contributor
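There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Super minor nit. Not sure if it's even worth implementing: Would we want to reset the [identifier lost in extraction; presumably the job's entry in `jobNameToNextSchedulableTime`] when a delete arrives? I am not sure which behavior is more intuitive: [the numbered options (1) and (2) were lost in extraction]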
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current behavior is (2). And to make the behavior (1), we would: [list of steps lost in extraction]
The delete operations are synchronous and the method is [remainder lost in extraction; the diff shows the method being declared `synchronized`]
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| LOGGER.info("Received delete for job configuration of job " + deleteJobArrival.getJobName()); | ||
| try { | ||
| unscheduleJob(deleteJobArrival.getJobName()); | ||
|
|
@@ -443,6 +487,10 @@ private void cancelJobIfRequired(DeleteJobConfigArrivalEvent deleteJobArrival) t | |
| } | ||
| } | ||
|
|
||
| public void setThrottleEnabled(boolean throttleEnabled) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use lombok |
||
| isThrottleEnabled = throttleEnabled; | ||
| } | ||
|
|
||
| /** | ||
| * This class is responsible for running non-scheduled jobs. | ||
| */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,64 @@ | ||||||||||||
| package org.apache.gobblin.cluster; | ||||||||||||
|
|
||||||||||||
| import java.time.Clock; | ||||||||||||
| import java.time.Duration; | ||||||||||||
| import java.time.Instant; | ||||||||||||
| import java.util.concurrent.ConcurrentHashMap; | ||||||||||||
|
|
||||||||||||
| import org.slf4j.Logger; | ||||||||||||
| import org.slf4j.LoggerFactory; | ||||||||||||
|
|
||||||||||||
| import org.apache.gobblin.runtime.JobContext; | ||||||||||||
| import org.apache.gobblin.runtime.JobState; | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| /** | ||||||||||||
| * A job listener used when {@link GobblinHelixJobLauncher} launches a job. | ||||||||||||
| * In {@link GobblinHelixJobScheduler}, when throttling is enabled, this | ||||||||||||
| * listener would record jobName to next schedulable time to decide whether | ||||||||||||
| * the replanning should be executed or skipped. | ||||||||||||
| */ | ||||||||||||
| public class GobblinThrottlingHelixJobLauncherListener extends GobblinHelixJobLauncherListener { | ||||||||||||
|
|
||||||||||||
| public final static Logger LOG = LoggerFactory.getLogger(GobblinThrottlingHelixJobLauncherListener.class); | ||||||||||||
|
Peiyingy marked this conversation as resolved.
|
||||||||||||
| private ConcurrentHashMap<String, Instant> jobNameToNextSchedulableTime; | ||||||||||||
| private Duration helixJobSchedulingThrottleTimeout; | ||||||||||||
| private Clock clock; | ||||||||||||
|
|
||||||||||||
| public GobblinThrottlingHelixJobLauncherListener(GobblinHelixJobLauncherMetrics jobLauncherMetrics, | ||||||||||||
| ConcurrentHashMap jobNameToNextSchedulableTime, Duration helixJobSchedulingThrottleTimeout, Clock clock) { | ||||||||||||
|
Contributor
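There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't the map parameter be parameterized rather than a raw type? It should be `ConcurrentHashMap<String, Instant>`. [Editorial note: the type names were lost in extraction; reconstructed from the field declaration in the diff.] |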
||||||||||||
| super(jobLauncherMetrics); | ||||||||||||
| this.jobNameToNextSchedulableTime = jobNameToNextSchedulableTime; | ||||||||||||
| this.helixJobSchedulingThrottleTimeout = helixJobSchedulingThrottleTimeout; | ||||||||||||
| this.clock = clock; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| @Override | ||||||||||||
| public void onJobPrepare(JobContext jobContext) | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For the same job, why do we try to update the schedulable time three times: once when we handle the message, once when we prepare the job, and once when the job starts? This will be confusing when reading the log. |
||||||||||||
| throws Exception { | ||||||||||||
| super.onJobPrepare(jobContext); | ||||||||||||
| Instant nextSchedulableTime = clock.instant().plus(helixJobSchedulingThrottleTimeout); | ||||||||||||
| jobNameToNextSchedulableTime.put(jobContext.getJobName(), nextSchedulableTime); | ||||||||||||
| LOG.info(jobContext.getJobName() + " finished prepare. The next schedulable time is " + nextSchedulableTime ); | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: grammar "finished preparing"
Contributor
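There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. White space after the `nextSchedulableTime` argument. [Editorial note: the identifier was lost in extraction; the diff line ends with `nextSchedulableTime );`.]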
Peiyingy marked this conversation as resolved.
Outdated
|
||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| @Override | ||||||||||||
| public void onJobCompletion(JobContext jobContext) | ||||||||||||
| throws Exception { | ||||||||||||
| super.onJobCompletion(jobContext); | ||||||||||||
| if (jobContext.getJobState().getState() == JobState.RunningState.FAILED) { | ||||||||||||
| jobNameToNextSchedulableTime.put(jobContext.getJobName(), Instant.ofEpochMilli(0)); | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a log if the job failed. I see there is an existing log for the entire job context, but having a log specifically from the throttling scheduler would be important here for those not familiar with the code when they are debugging. See gobblin/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/listeners/AbstractJobListener.java, lines 59 to 63 in 702cadf.
Contributor
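There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, instead of ofEpochMilli(0), let's use `Instant.EPOCH`. [Editorial note: the constant name was lost in extraction; `Instant.EPOCH` is the java.time constant for the 1970-01-01T00:00:00Z instant.] |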
||||||||||||
| } else { | ||||||||||||
| Instant nextSchedulableTime = clock.instant().plus(helixJobSchedulingThrottleTimeout); | ||||||||||||
| jobNameToNextSchedulableTime.put(jobContext.getJobName(), nextSchedulableTime); | ||||||||||||
| LOG.info(jobContext.getJobName() + " finished completion. The next schedulable time is " + nextSchedulableTime ); | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe |
||||||||||||
| } | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| @Override | ||||||||||||
| public void onJobCancellation(JobContext jobContext) | ||||||||||||
| throws Exception { | ||||||||||||
| super.onJobCancellation(jobContext); | ||||||||||||
| jobNameToNextSchedulableTime.put(jobContext.getJobName(), Instant.ofEpochMilli(0)); | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as for job failed. We'd want something similar. |
||||||||||||
| } | ||||||||||||
| } | ||||||||||||
Uh oh!
There was an error while loading. Please reload this page.