From 802140321c13979913723f1efaa87447d8d003cc Mon Sep 17 00:00:00 2001 From: Arjun Date: Fri, 21 Jul 2023 16:05:53 -0700 Subject: [PATCH 1/2] set number of failed attempts correctly --- .../runtime/listeners/EmailNotificationJobListener.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/listeners/EmailNotificationJobListener.java b/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/listeners/EmailNotificationJobListener.java index 02aa4a2423c..e4394689ae4 100644 --- a/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/listeners/EmailNotificationJobListener.java +++ b/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/listeners/EmailNotificationJobListener.java @@ -48,7 +48,10 @@ public void onJobCompletion(JobContext jobContext) { // Send out alert email if the maximum number of consecutive failures is reached if (jobState.getState() == JobState.RunningState.FAILED) { - int failures = jobState.getPropAsInt(ConfigurationKeys.JOB_FAILURES_KEY, 0) + jobContext.getDatasetStateFailures(); + // Number of fail attempts are increased during the commit phase by changing JOB_FAILURES_KEY. + // But if the job fails before commit, e.g. during workunit creation, this config is not set, + // so it makes sense to set the default value to 1 when the job state is FAILED + int failures = jobState.getPropAsInt(ConfigurationKeys.JOB_FAILURES_KEY, 1) + jobContext.getDatasetStateFailures(); int maxFailures = jobState.getPropAsInt(ConfigurationKeys.JOB_MAX_FAILURES_KEY, ConfigurationKeys.DEFAULT_JOB_MAX_FAILURES); if (alertEmailEnabled && failures >= maxFailures) { From 39778ee8d00a3bf59f050a38b3798ac4d304a3b7 Mon Sep 17 00:00:00 2001 From: Arjun Date: Fri, 21 Jul 2023 17:01:30 -0700 Subject: [PATCH 2/2] set number of failed attempts correctly --- .../java/org/apache/gobblin/runtime/AbstractJobLauncher.java | 2 ++ .../runtime/listeners/EmailNotificationJobListener.java | 5 +---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/AbstractJobLauncher.java b/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/AbstractJobLauncher.java index 6e47cc482b9..3f2570c68f5 100644 --- a/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/AbstractJobLauncher.java +++ b/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/AbstractJobLauncher.java @@ -608,6 +608,8 @@ public WorkUnit apply(@Nullable WorkUnit input) { String errMsg = "Failed to launch and run job " + jobId + " due to " + t.getMessage(); LOG.error(errMsg + ": " + t, t); this.jobContext.getJobState().setJobFailureException(t); + jobState.setProp(ConfigurationKeys.JOB_FAILURES_KEY, + Integer.parseInt(jobState.getProp(ConfigurationKeys.JOB_FAILURES_KEY, "0")) + 1); } finally { try { troubleshooter.refineIssues(); diff --git a/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/listeners/EmailNotificationJobListener.java b/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/listeners/EmailNotificationJobListener.java index e4394689ae4..02aa4a2423c 100644 --- a/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/listeners/EmailNotificationJobListener.java +++ b/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/listeners/EmailNotificationJobListener.java @@ -48,10 +48,7 @@ public void onJobCompletion(JobContext jobContext) { // Send out alert email if the maximum number of consecutive failures is reached if (jobState.getState() == JobState.RunningState.FAILED) { - // Number of fail attempts are increased during the commit phase by changing JOB_FAILURES_KEY. - // But if the job fails before commit, e.g. during workunit creation, this config is not set, - // so it makes sense to set the default value to 1 when the job state is FAILED - int failures = jobState.getPropAsInt(ConfigurationKeys.JOB_FAILURES_KEY, 1) + jobContext.getDatasetStateFailures(); + int failures = jobState.getPropAsInt(ConfigurationKeys.JOB_FAILURES_KEY, 0) + jobContext.getDatasetStateFailures(); int maxFailures = jobState.getPropAsInt(ConfigurationKeys.JOB_MAX_FAILURES_KEY, ConfigurationKeys.DEFAULT_JOB_MAX_FAILURES); if (alertEmailEnabled && failures >= maxFailures) {