-
Notifications
You must be signed in to change notification settings - Fork 749
[GOBBLIN-1840] Helix Job scheduler should not try to replace running workflow if within configured time #3704
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
29f5c2d
45c87ff
df182ff
3c552b7
88b9a02
d75e522
4ddd7c5
0e7ba4d
c449a71
3160d8b
ae2b58c
e6ea195
6e8358c
a15afd8
bbe4a0b
701881e
cfdc115
b598455
32ea7ed
2496212
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,9 @@ | |
| package org.apache.gobblin.cluster; | ||
|
|
||
| import java.io.IOException; | ||
| import java.time.Duration; | ||
| import java.time.Instant; | ||
| import java.time.temporal.ChronoUnit; | ||
| import java.util.Collection; | ||
| import java.util.Collections; | ||
| import java.util.List; | ||
|
|
@@ -110,6 +113,8 @@ public class GobblinHelixJobScheduler extends JobScheduler implements StandardMe | |
|
|
||
| private boolean startServicesCompleted; | ||
| private final long helixJobStopTimeoutMillis; | ||
| private final Duration throttleTimeoutDurationSecs; | ||
| private ConcurrentHashMap<String, Instant> jobNameToStartTimeMap; | ||
|
|
||
| public GobblinHelixJobScheduler(Config sysConfig, | ||
| HelixManager jobHelixManager, | ||
|
|
@@ -162,6 +167,11 @@ public GobblinHelixJobScheduler(Config sysConfig, | |
| this.helixWorkflowListingTimeoutMillis = ConfigUtils.getLong(sysConfig, GobblinClusterConfigurationKeys.HELIX_WORKFLOW_LISTING_TIMEOUT_SECONDS, | ||
| GobblinClusterConfigurationKeys.DEFAULT_HELIX_WORKFLOW_LISTING_TIMEOUT_SECONDS) * 1000; | ||
|
|
||
| this.throttleTimeoutDurationSecs = Duration.of(ConfigUtils.getLong(sysConfig, GobblinClusterConfigurationKeys.HELIX_JOB_SCHEDULING_THROTTLE_TIMEOUT_SECONDS_KEY, | ||
| GobblinClusterConfigurationKeys.DEFAULT_HELIX_JOB_SCHEDULING_THROTTLE_TIMEOUT_SECONDS_KEY), ChronoUnit.SECONDS); | ||
|
|
||
| this.jobNameToStartTimeMap = new ConcurrentHashMap<>(); | ||
|
|
||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -325,6 +335,8 @@ public void handleNewJobConfigArrival(NewJobConfigArrivalEvent newJobArrival) { | |
| this.jobExecutor.execute(new NonScheduledJobRunner(jobProps, | ||
| new GobblinHelixJobLauncherListener(this.launcherMetrics))); | ||
| } | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Does not need a new line |
||
| this.jobNameToStartTimeMap.put(jobUri, Instant.now()); | ||
| } catch (JobException je) { | ||
| LOGGER.error("Failed to schedule or run job " + jobUri, je); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Update this log to say that you are resetting the clock |
||
| } | ||
|
|
@@ -333,6 +345,20 @@ public void handleNewJobConfigArrival(NewJobConfigArrivalEvent newJobArrival) { | |
| @Subscribe | ||
| public void handleUpdateJobConfigArrival(UpdateJobConfigArrivalEvent updateJobArrival) { | ||
| LOGGER.info("Received update for job configuration of job " + updateJobArrival.getJobName()); | ||
| String jobName = updateJobArrival.getJobName(); | ||
| boolean isThrottleEnabled = PropertiesUtils.getPropAsBoolean(updateJobArrival.getJobConfig(), | ||
| GobblinClusterConfigurationKeys.HELIX_JOB_SCHEDULING_THROTTLE_ENABLED_KEY, | ||
| String.valueOf(GobblinClusterConfigurationKeys.DEFAULT_HELIX_JOB_SCHEDULING_THROTTLE_ENABLED_KEY)); | ||
|
|
||
| if (isThrottleEnabled && this.jobNameToStartTimeMap.containsKey(jobName)) { | ||
| Instant jobStartTime = this.jobNameToStartTimeMap.get(jobName); | ||
| Duration workflowRunningDuration = Duration.between(jobStartTime, Instant.now()); | ||
| if (workflowRunningDuration.minus(throttleTimeoutDurationSecs).isNegative()) { | ||
| LOGGER.info("Replanning is skipped for job {} ", jobName); | ||
|
Peiyingy marked this conversation as resolved.
Outdated
|
||
| return; | ||
| } | ||
| } | ||
|
|
||
| try { | ||
| handleDeleteJobConfigArrival(new DeleteJobConfigArrivalEvent(updateJobArrival.getJobName(), | ||
| updateJobArrival.getJobConfig())); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,8 @@ | |
| import java.io.File; | ||
| import java.io.IOException; | ||
| import java.net.URL; | ||
| import java.nio.file.Files; | ||
| import java.time.Instant; | ||
| import java.util.Collections; | ||
| import java.util.Map; | ||
| import java.util.Properties; | ||
|
|
@@ -32,6 +34,7 @@ | |
| import org.apache.helix.HelixManagerFactory; | ||
| import org.apache.helix.InstanceType; | ||
| import org.assertj.core.util.Lists; | ||
| import org.mockito.MockedStatic; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
| import org.testng.Assert; | ||
|
|
@@ -52,6 +55,8 @@ | |
| import org.apache.gobblin.runtime.job_catalog.NonObservingFSJobCatalog; | ||
| import org.apache.gobblin.scheduler.SchedulerService; | ||
|
|
||
| import static org.mockito.Mockito.*; | ||
|
|
||
|
|
||
| /** | ||
| * Unit tests for {@link org.apache.gobblin.cluster.GobblinHelixJobScheduler}. | ||
|
|
@@ -61,7 +66,6 @@ | |
| public class GobblinHelixJobSchedulerTest { | ||
| public final static Logger LOG = LoggerFactory.getLogger(GobblinHelixJobSchedulerTest.class); | ||
|
|
||
| private HelixManager helixManager; | ||
| private FileSystem localFs; | ||
| private Path appWorkDir; | ||
| private final Closer closer = Closer.create(); | ||
|
|
@@ -70,9 +74,16 @@ public class GobblinHelixJobSchedulerTest { | |
| private GobblinTaskRunner gobblinTaskRunner; | ||
|
|
||
| private Thread thread; | ||
|
|
||
| private final String workflowIdSuffix1 = "_1504201348471"; | ||
| private final String workflowIdSuffix2 = "_1504201348472"; | ||
| private final String workflowIdSuffix3 = "_1504201348473"; | ||
|
|
||
| private Instant beginTime = Instant.ofEpochMilli(0); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| private Instant withinThrottlePeriod = Instant.ofEpochMilli(1); | ||
| private Instant exceedsThrottlePeriod = Instant.ofEpochMilli(3600001); | ||
|
|
||
| private String zkConnectingString; | ||
| private String helixClusterName; | ||
|
|
||
| @BeforeClass | ||
| public void setUp() | ||
|
|
@@ -96,17 +107,11 @@ public void setUp() | |
| ConfigValueFactory.fromAnyRef(sourceJsonFile.getAbsolutePath())) | ||
| .withValue(ConfigurationKeys.JOB_STATE_IN_STATE_STORE, ConfigValueFactory.fromAnyRef("true")).resolve(); | ||
|
|
||
| String zkConnectingString = baseConfig.getString(GobblinClusterConfigurationKeys.ZK_CONNECTION_STRING_KEY); | ||
| String helixClusterName = baseConfig.getString(GobblinClusterConfigurationKeys.HELIX_CLUSTER_NAME_KEY); | ||
| this.zkConnectingString = baseConfig.getString(GobblinClusterConfigurationKeys.ZK_CONNECTION_STRING_KEY); | ||
| this.helixClusterName = baseConfig.getString(GobblinClusterConfigurationKeys.HELIX_CLUSTER_NAME_KEY); | ||
|
|
||
| HelixUtils.createGobblinHelixCluster(zkConnectingString, helixClusterName); | ||
|
|
||
| this.helixManager = HelixManagerFactory | ||
| .getZKHelixManager(helixClusterName, TestHelper.TEST_HELIX_INSTANCE_NAME, InstanceType.CONTROLLER, | ||
| zkConnectingString); | ||
| this.closer.register(() -> helixManager.disconnect()); | ||
| this.helixManager.connect(); | ||
|
|
||
| this.localFs = FileSystem.getLocal(new Configuration()); | ||
|
|
||
| this.closer.register(() -> { | ||
|
|
@@ -129,58 +134,168 @@ public void setUp() | |
| this.thread.start(); | ||
| } | ||
|
|
||
| // Time span exceeds throttle timeout, within same workflow, throttle is enabled | ||
| // Job will be updated | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comments describing the method should be a java doc instead of a regular comment |
||
| @Test | ||
| public void testNewJobAndUpdate() | ||
| throws Exception { | ||
| runWorkflowTest(exceedsThrottlePeriod, "UpdateSameWorkflowLongPeriodThrottle", | ||
| workflowIdSuffix1, workflowIdSuffix2, workflowIdSuffix2, workflowIdSuffix2, | ||
| true, true); | ||
| } | ||
|
|
||
| // Time span is within throttle timeout, within same workflow, throttle is enabled | ||
| // Job will not be updated | ||
| @Test | ||
| public void testUpdateSameWorkflowShortPeriodThrottle() | ||
| throws Exception { | ||
| runWorkflowTest(withinThrottlePeriod, "UpdateSameWorkflowShortPeriodThrottle", | ||
| workflowIdSuffix1, workflowIdSuffix2, workflowIdSuffix2, workflowIdSuffix1, | ||
| true, true); | ||
| } | ||
|
|
||
| // Time span exceeds throttle timeout, within same workflow, throttle is not enabled | ||
| // Job will be updated | ||
| @Test | ||
| public void testUpdateSameWorkflowLongPeriodNoThrottle() | ||
| throws Exception { | ||
| runWorkflowTest(exceedsThrottlePeriod, "UpdateSameWorkflowLongPeriodNoThrottle", | ||
| workflowIdSuffix1, workflowIdSuffix2, workflowIdSuffix2, workflowIdSuffix2, | ||
| false, true); | ||
| } | ||
|
|
||
| // Time span is within throttle timeout, within same workflow, throttle is not enabled | ||
| // Job will be updated | ||
| @Test | ||
| public void testUpdateSameWorkflowShortPeriodNoThrottle() | ||
|
Peiyingy marked this conversation as resolved.
|
||
| throws Exception { | ||
| runWorkflowTest(withinThrottlePeriod, "UpdateSameWorkflowShortPeriodNoThrottle", | ||
| workflowIdSuffix1, workflowIdSuffix2, workflowIdSuffix2, workflowIdSuffix2, | ||
| false, true); | ||
| } | ||
|
|
||
| // Time span is within throttle timeout, within different workflow, throttle is enabled | ||
| // Job will be updated | ||
| public void testUpdateDiffWorkflowShortPeriodThrottle() | ||
| throws Exception { | ||
| runWorkflowTest(withinThrottlePeriod, "UpdateDiffWorkflowShortPeriodThrottle", | ||
| workflowIdSuffix1, workflowIdSuffix2, workflowIdSuffix3, workflowIdSuffix3, | ||
| true, false); | ||
| } | ||
|
|
||
| // Time span is within throttle timeout, within different workflow, throttle is not enabled | ||
| // Job will be updated | ||
| @Test | ||
| public void testUpdateDiffWorkflowShortPeriodNoThrottle() | ||
| throws Exception { | ||
| runWorkflowTest(withinThrottlePeriod, "UpdateDiffWorkflowShortPeriodNoThrottle", | ||
| workflowIdSuffix1, workflowIdSuffix2, workflowIdSuffix3, workflowIdSuffix3, | ||
| false, false); | ||
| } | ||
|
|
||
| // Time span exceeds throttle timeout, within different workflow, throttle is enabled | ||
| // Job will be updated | ||
| @Test | ||
| public void testUpdateDiffWorkflowLongPeriodThrottle() | ||
| throws Exception { | ||
| runWorkflowTest(exceedsThrottlePeriod, "UpdateDiffWorkflowLongPeriodThrottle", | ||
| workflowIdSuffix1, workflowIdSuffix2, workflowIdSuffix3, workflowIdSuffix3, | ||
| true, false); | ||
| } | ||
|
|
||
| // Time span exceeds throttle timeout, within different workflow, throttle is not enabled | ||
| // Job will be updated | ||
| @Test | ||
| public void testUpdateDiffWorkflowLongPeriodNoThrottle() | ||
| throws Exception { | ||
| runWorkflowTest(exceedsThrottlePeriod, "UpdateDiffWorkflowLongPeriodNoThrottle", | ||
| workflowIdSuffix1, workflowIdSuffix2, workflowIdSuffix3, workflowIdSuffix3, | ||
| false, false); | ||
| } | ||
|
|
||
| private GobblinHelixJobScheduler createJobScheduler(HelixManager helixManager) throws Exception { | ||
| java.nio.file.Path p = Files.createTempDirectory(GobblinHelixJobScheduler.class.getSimpleName()); | ||
| Config config = ConfigFactory.empty().withValue(ConfigurationKeys.JOB_CONFIG_FILE_GENERAL_PATH_KEY, | ||
| ConfigValueFactory.fromAnyRef("/tmp/" + GobblinHelixJobScheduler.class.getSimpleName())); | ||
| ConfigValueFactory.fromAnyRef(p.toString())); | ||
| SchedulerService schedulerService = new SchedulerService(new Properties()); | ||
| NonObservingFSJobCatalog jobCatalog = new NonObservingFSJobCatalog(config); | ||
| jobCatalog.startAsync(); | ||
| GobblinHelixJobScheduler jobScheduler = | ||
| new GobblinHelixJobScheduler(ConfigFactory.empty(), this.helixManager, java.util.Optional.empty(), | ||
| new EventBus(), appWorkDir, Lists.emptyList(), schedulerService, jobCatalog); | ||
|
|
||
| final Properties properties1 = | ||
| GobblinHelixJobLauncherTest.generateJobProperties(this.baseConfig, "1", workflowIdSuffix1); | ||
| properties1.setProperty(GobblinClusterConfigurationKeys.CANCEL_RUNNING_JOB_ON_DELETE, "true"); | ||
| return new GobblinHelixJobScheduler(ConfigFactory.empty(), helixManager, java.util.Optional.empty(), | ||
| new EventBus(), appWorkDir, Lists.emptyList(), schedulerService, jobCatalog); | ||
| } | ||
|
|
||
| private NewJobConfigArrivalEvent createJobConfigArrivalEvent(Properties properties, String suffix) { | ||
| properties.setProperty(GobblinClusterConfigurationKeys.CANCEL_RUNNING_JOB_ON_DELETE, "true"); | ||
| NewJobConfigArrivalEvent newJobConfigArrivalEvent = | ||
| new NewJobConfigArrivalEvent(properties1.getProperty(ConfigurationKeys.JOB_NAME_KEY), properties1); | ||
| jobScheduler.handleNewJobConfigArrival(newJobConfigArrivalEvent); | ||
| properties1.setProperty(ConfigurationKeys.JOB_ID_KEY, | ||
| "job_" + properties1.getProperty(ConfigurationKeys.JOB_NAME_KEY) + workflowIdSuffix2); | ||
| Map<String, String> workflowIdMap; | ||
| this.helixManager.connect(); | ||
| new NewJobConfigArrivalEvent(properties.getProperty(ConfigurationKeys.JOB_NAME_KEY), properties); | ||
| properties.setProperty(ConfigurationKeys.JOB_ID_KEY, | ||
| "job_" + properties.getProperty(ConfigurationKeys.JOB_NAME_KEY) + suffix); | ||
| return newJobConfigArrivalEvent; | ||
| } | ||
|
|
||
| private void connectAndAssertWorkflowId(String expectedSuffix, NewJobConfigArrivalEvent newJobConfigArrivalEvent, HelixManager helixManager ) throws Exception { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Random question, but why do we use |
||
| helixManager.connect(); | ||
| String workFlowId = getWorkflowID(newJobConfigArrivalEvent, helixManager); | ||
| Assert.assertNotNull(workFlowId); | ||
| Assert.assertTrue(workFlowId.endsWith(expectedSuffix)); | ||
| } | ||
|
|
||
| String workFlowId = null; | ||
| private String getWorkflowID (NewJobConfigArrivalEvent newJobConfigArrivalEvent, HelixManager helixManager ) | ||
| throws Exception { | ||
| // endTime is manually set time period that we allow HelixUtils to fetch workflowIdMap before timeout | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe better wording:
|
||
| long endTime = System.currentTimeMillis() + 30000; | ||
|
Peiyingy marked this conversation as resolved.
|
||
| Map<String, String> workflowIdMap; | ||
| while (System.currentTimeMillis() < endTime) { | ||
| workflowIdMap = HelixUtils.getWorkflowIdsFromJobNames(this.helixManager, | ||
| Collections.singletonList(newJobConfigArrivalEvent.getJobName())); | ||
| try{ | ||
| workflowIdMap = HelixUtils.getWorkflowIdsFromJobNames(helixManager, | ||
| Collections.singletonList(newJobConfigArrivalEvent.getJobName())); | ||
| } catch(GobblinHelixUnexpectedStateException e){ | ||
| continue; | ||
| } | ||
| if (workflowIdMap.containsKey(newJobConfigArrivalEvent.getJobName())) { | ||
| workFlowId = workflowIdMap.get(newJobConfigArrivalEvent.getJobName()); | ||
| break; | ||
| return workflowIdMap.get(newJobConfigArrivalEvent.getJobName()); | ||
| } | ||
| Thread.sleep(100); | ||
| } | ||
| Assert.assertNotNull(workFlowId); | ||
| Assert.assertTrue(workFlowId.endsWith(workflowIdSuffix1)); | ||
| return null; | ||
| } | ||
|
|
||
| jobScheduler.handleUpdateJobConfigArrival( | ||
| new UpdateJobConfigArrivalEvent(properties1.getProperty(ConfigurationKeys.JOB_NAME_KEY), properties1)); | ||
| this.helixManager.connect(); | ||
| endTime = System.currentTimeMillis() + 30000; | ||
| while (System.currentTimeMillis() < endTime) { | ||
| workflowIdMap = HelixUtils.getWorkflowIdsFromJobNames(this.helixManager, | ||
| Collections.singletonList(newJobConfigArrivalEvent.getJobName())); | ||
| if (workflowIdMap.containsKey(newJobConfigArrivalEvent.getJobName())) { | ||
| workFlowId = workflowIdMap.get(newJobConfigArrivalEvent.getJobName()); | ||
| break; | ||
| private void runWorkflowTest(Instant mockedTime, String jobSuffix, String newJobWorkflowIdSuffix, | ||
| String updateWorkflowIdSuffix1, String updateWorkflowIdSuffix2, | ||
| String assertUpdateWorkflowIdSuffix, boolean isThrottleEnabled, boolean isSameWorkflow) throws Exception { | ||
| try (MockedStatic<Instant> mocked = mockStatic(Instant.class, CALLS_REAL_METHODS)) { | ||
| mocked.when(Instant::now).thenReturn(beginTime, mockedTime); | ||
|
|
||
| // helixManager is set to local variable to avoid the HelixManager (ZkClient) is not connected error across tests | ||
| HelixManager helixManager = HelixManagerFactory | ||
| .getZKHelixManager(helixClusterName, TestHelper.TEST_HELIX_INSTANCE_NAME, InstanceType.CONTROLLER, | ||
| zkConnectingString); | ||
| GobblinHelixJobScheduler jobScheduler = createJobScheduler(helixManager); | ||
| final Properties properties = | ||
| GobblinHelixJobLauncherTest.generateJobProperties( | ||
| this.baseConfig, jobSuffix, newJobWorkflowIdSuffix); | ||
| NewJobConfigArrivalEvent newJobConfigArrivalEvent = createJobConfigArrivalEvent(properties, updateWorkflowIdSuffix1); | ||
| jobScheduler.handleNewJobConfigArrival(newJobConfigArrivalEvent); | ||
| connectAndAssertWorkflowId(newJobWorkflowIdSuffix, newJobConfigArrivalEvent, helixManager); | ||
|
|
||
| if (isSameWorkflow) { | ||
| properties.setProperty(GobblinClusterConfigurationKeys.HELIX_JOB_SCHEDULING_THROTTLE_ENABLED_KEY, String.valueOf(isThrottleEnabled)); | ||
| jobScheduler.handleUpdateJobConfigArrival( | ||
| new UpdateJobConfigArrivalEvent(properties.getProperty(ConfigurationKeys.JOB_NAME_KEY), properties)); | ||
| connectAndAssertWorkflowId(assertUpdateWorkflowIdSuffix, newJobConfigArrivalEvent, helixManager); | ||
| } | ||
| else { | ||
| final Properties properties2 = | ||
| GobblinHelixJobLauncherTest.generateJobProperties( | ||
| this.baseConfig, jobSuffix + '2', updateWorkflowIdSuffix2); | ||
| NewJobConfigArrivalEvent newJobConfigArrivalEvent2 = | ||
| new NewJobConfigArrivalEvent(properties2.getProperty(ConfigurationKeys.JOB_NAME_KEY), properties2); | ||
| properties2.setProperty(GobblinClusterConfigurationKeys.HELIX_JOB_SCHEDULING_THROTTLE_ENABLED_KEY, String.valueOf(isThrottleEnabled)); | ||
| jobScheduler.handleUpdateJobConfigArrival( | ||
| new UpdateJobConfigArrivalEvent(properties2.getProperty(ConfigurationKeys.JOB_NAME_KEY), properties2)); | ||
| connectAndAssertWorkflowId(assertUpdateWorkflowIdSuffix, newJobConfigArrivalEvent2, helixManager); | ||
| } | ||
| Thread.sleep(100); | ||
| } | ||
| Assert.assertTrue(workFlowId.endsWith(workflowIdSuffix2)); | ||
| } | ||
|
|
||
| @AfterClass | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| mock-maker-inline | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we need this anymore since we are not mocking any static classes |
||
Uh oh!
There was an error while loading. Please reload this page.