-
Notifications
You must be signed in to change notification settings - Fork 749
[GOBBLIN-1930] Improve Multi-active related logs and metrics #3800
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
00a26ee
e3bfa64
d4c7c2f
fff6670
07ea65c
627e7ec
afaedbe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -43,14 +43,17 @@ public class RuntimeMetrics { | |
| public static final String GOBBLIN_SPEC_STORE_MESSAGE_PROCESSED= ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + ".specStoreMonitor.message.processed"; | ||
| public static final String GOBBLIN_SPEC_STORE_PRODUCE_TO_CONSUME_DELAY_MILLIS = | ||
| ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + ".specstoreMonitor.produce.to.consume.delay"; | ||
| public static final String GOBBLIN_DAG_ACTION_STORE_MONITOR_KILLS_INVOKED = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + ".dagActionStoreMonitor.kills.invoked"; | ||
| public static final String GOBBLIN_DAG_ACTION_STORE_MONITOR_MESSAGE_PROCESSED= ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + ".dagActionStoreMonitor.message.processed"; | ||
| public static final String GOBBLIN_DAG_ACTION_STORE_MONITOR_RESUMES_INVOKED = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + ".dagActionStoreMonitor.resumes.invoked"; | ||
| public static final String GOBBLIN_DAG_ACTION_STORE_MONITOR_FLOWS_LAUNCHED = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + ".dagActionStoreMonitor.flows.launched"; | ||
| public static final String GOBBLIN_DAG_ACTION_STORE_MONITOR_UNEXPECTED_ERRORS = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + ".dagActionStoreMonitor.unexpected.errors"; | ||
| public static final String | ||
| GOBBLIN_DAG_ACTION_STORE_PRODUCE_TO_CONSUME_DELAY_MILLIS = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + ".dagActionStoreMonitor.produce.to.consume.delay"; | ||
| public static final String DAG_ACTION_STORE_MONITOR_PREFIX = "dagActionStoreMonitor"; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: this is no longer a prefix... but why anyway do you prefer to repeat so many times
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, good point, updated to have the prefix contain |
||
| public static final String GOBBLIN_DAG_ACTION_STORE_MONITOR_KILLS_INVOKED = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + "." + DAG_ACTION_STORE_MONITOR_PREFIX + ".kills.invoked"; | ||
| public static final String GOBBLIN_DAG_ACTION_STORE_MONITOR_MESSAGE_PROCESSED = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + "." + DAG_ACTION_STORE_MONITOR_PREFIX + ".message.processed"; | ||
| public static final String GOBBLIN_DAG_ACTION_STORE_MONITOR_MESSAGES_FILTERED_OUT = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + "." + DAG_ACTION_STORE_MONITOR_PREFIX + ".messagesFilteredOut"; | ||
| public static final String GOBBLIN_DAG_ACTION_STORE_MONITOR_RESUMES_INVOKED = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + "." + DAG_ACTION_STORE_MONITOR_PREFIX + ".resumes.invoked"; | ||
| public static final String GOBBLIN_DAG_ACTION_STORE_MONITOR_FLOWS_LAUNCHED = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + "." + DAG_ACTION_STORE_MONITOR_PREFIX + ".flows.launched"; | ||
|
|
||
| public static final String GOBBLIN_DAG_ACTION_STORE_FAILED_FLOW_LAUNCHED_SUBMISSIONS = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + "." + DAG_ACTION_STORE_MONITOR_PREFIX + ".failedFlowLaunchSubmissions"; | ||
| public static final String GOBBLIN_DAG_ACTION_STORE_MONITOR_UNEXPECTED_ERRORS = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + "." + DAG_ACTION_STORE_MONITOR_PREFIX + ".unexpected.errors"; | ||
| public static final String | ||
| GOBBLIN_DAG_ACTION_STORE_PRODUCE_TO_CONSUME_DELAY_MILLIS = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + "." + DAG_ACTION_STORE_MONITOR_PREFIX + ".produce.to.consume.delay"; | ||
| public static final String GOBBLIN_MYSQL_QUOTA_MANAGER_UNEXPECTED_ERRORS = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + "gobblin.mysql.quota.manager.unexpected.errors"; | ||
| public static final String GOBBLIN_MYSQL_QUOTA_MANAGER_QUOTA_REQUESTS_EXCEEDED = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + "gobblin.mysql.quota.manager.quotaRequests.exceeded"; | ||
| public static final String GOBBLIN_MYSQL_QUOTA_MANAGER_TIME_TO_CHECK_QUOTA = ServiceMetricNames.GOBBLIN_SERVICE_PREFIX + "gobblin.mysql.quota.manager.time.to.check.quota"; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -510,13 +510,18 @@ public void handleLaunchFlowEvent(DagActionStore.DagAction launchAction) { | |
| this.dagActionStore.get().deleteDagAction(launchAction); | ||
| } catch (URISyntaxException e) { | ||
| log.warn("Could not create URI object for flowId {} due to exception {}", flowId, e.getMessage()); | ||
| this.dagManagerMetrics.incrementFailedLaunchCount(); | ||
| } catch (SpecNotFoundException e) { | ||
| log.warn("Spec not found for flowId {} due to exception {}", flowId, e.getMessage()); | ||
| this.dagManagerMetrics.incrementFailedLaunchCount(); | ||
| } catch (IOException e) { | ||
| log.warn("Failed to add Job Execution Plan for flowId {} OR delete dag action from dagActionStore (check " | ||
| + "stacktrace) due to exception {}", flowId, e.getMessage()); | ||
| this.dagManagerMetrics.incrementFailedLaunchCount(); | ||
| } catch (InterruptedException e) { | ||
| log.warn("SpecCompiler failed to reach healthy state before compilation of flowId {}. Exception: ", flowId, e); | ||
| log.warn("SpecCompiler failed to reach healthy state before compilation of flowId {} due to exception {}", flowId, | ||
| e); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess there's the possibility I may be misremembering... and pursuing this needlessly... but my expectation is that a stacktrace would only be written when calling this form: if you call the form: are you certain it will print the ST, when the last arg is
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There are two |
||
| this.dagManagerMetrics.incrementFailedLaunchCount(); | ||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,8 +58,10 @@ public class DagActionStoreChangeMonitor extends HighLevelConsumer { | |
| private ContextAwareMeter killsInvoked; | ||
| private ContextAwareMeter resumesInvoked; | ||
| private ContextAwareMeter flowsLaunched; | ||
| private ContextAwareMeter failedFlowLaunchSubmissions; | ||
| private ContextAwareMeter unexpectedErrors; | ||
| private ContextAwareMeter messageProcessedMeter; | ||
| private ContextAwareMeter messageFilteredOutMeter; | ||
| private ContextAwareGauge produceToConsumeDelayMillis; // Reports delay from all partitions in one gauge | ||
|
|
||
| private volatile Long produceToConsumeDelayValue = -1L; | ||
|
|
@@ -130,30 +132,34 @@ protected void processMessage(DecodeableKafkaRecord message) { | |
| String changeIdentifier = tid + key; | ||
| if (!ChangeMonitorUtils.shouldProcessMessage(changeIdentifier, dagActionsSeenCache, operation, | ||
| produceTimestamp.toString())) { | ||
| this.messageFilteredOutMeter.mark(); | ||
| return; | ||
| } | ||
|
|
||
| // Used to easily log information to identify the dag action | ||
| DagActionStore.DagAction dagAction = new DagActionStore.DagAction(flowGroup, flowName, flowExecutionId, | ||
| dagActionType); | ||
|
|
||
| // We only expect INSERT and DELETE operations done to this table. INSERTs correspond to any type of | ||
| // {@link DagActionStore.FlowActionType} flow requests that have to be processed. DELETEs require no action. | ||
| try { | ||
| if (operation.equals("INSERT")) { | ||
| if (dagActionType.equals(DagActionStore.FlowActionType.RESUME)) { | ||
| log.info("Received insert dag action and about to send resume flow request"); | ||
| log.info("Received insert dag action and about to send resume flow request for: {}", dagAction); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: too conversational. how about: (i.e. won't the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was thinking about unifying all the logs like this as well, let me make the change. I will add a bit more context in message but use this format. |
||
| dagManager.handleResumeFlowRequest(flowGroup, flowName,Long.parseLong(flowExecutionId)); | ||
| this.resumesInvoked.mark(); | ||
| } else if (dagActionType.equals(DagActionStore.FlowActionType.KILL)) { | ||
| log.info("Received insert dag action and about to send kill flow request"); | ||
| log.info("Received insert dag action and about to send kill flow request for: {}", dagAction); | ||
| dagManager.handleKillFlowRequest(flowGroup, flowName, Long.parseLong(flowExecutionId)); | ||
| this.killsInvoked.mark(); | ||
| } else if (dagActionType.equals(DagActionStore.FlowActionType.LAUNCH)) { | ||
| // If multi-active scheduler is NOT turned on we should not receive these type of events | ||
| if (!this.isMultiActiveSchedulerEnabled) { | ||
| this.unexpectedErrors.mark(); | ||
| throw new RuntimeException(String.format("Received LAUNCH dagAction while not in multi-active scheduler " | ||
| + "mode for flowAction: %s", | ||
| new DagActionStore.DagAction(flowGroup, flowName, flowExecutionId, dagActionType))); | ||
| + "mode for flowAction: %s", dagAction)); | ||
| } | ||
| log.info("Received insert dag action and about to forward launch request to DagManager"); | ||
| log.info("Received insert dag action and about to forward launch request to DagManager for: {}", dagAction); | ||
| submitFlowToDagManagerHelper(flowGroup, flowName); | ||
| } else { | ||
| log.warn("Received unsupported dagAction {}. Expected to be a KILL, RESUME, or LAUNCH", dagActionType); | ||
|
|
@@ -191,19 +197,19 @@ protected void submitFlowToDagManagerHelper(String flowGroup, String flowName) { | |
| this.orchestrator.submitFlowToDagManager(spec); | ||
| } catch (URISyntaxException e) { | ||
| log.warn("Could not create URI object for flowId {}. Exception {}", flowId, e.getMessage()); | ||
| this.unexpectedErrors.mark(); | ||
| this.failedFlowLaunchSubmissions.mark(); | ||
| return; | ||
| } catch (SpecNotFoundException e) { | ||
| log.warn("Spec not found for flowId {} due to exception {}", flowId, e.getMessage()); | ||
| this.unexpectedErrors.mark(); | ||
| this.failedFlowLaunchSubmissions.mark(); | ||
| return; | ||
| } catch (IOException e) { | ||
| log.warn("Failed to add Job Execution Plan for flowId {} due to exception {}", flowId, e.getMessage()); | ||
| this.unexpectedErrors.mark(); | ||
| this.failedFlowLaunchSubmissions.mark(); | ||
| return; | ||
| } catch (InterruptedException e) { | ||
| log.warn("SpecCompiler failed to reach healthy state before compilation of flowId {}. Exception: ", flowId, e); | ||
| this.unexpectedErrors.mark(); | ||
| this.failedFlowLaunchSubmissions.mark(); | ||
| return; | ||
| } | ||
| // Only mark this if the dag was successfully added | ||
|
|
@@ -216,8 +222,10 @@ protected void createMetrics() { | |
| this.killsInvoked = this.getMetricContext().contextAwareMeter(RuntimeMetrics.GOBBLIN_DAG_ACTION_STORE_MONITOR_KILLS_INVOKED); | ||
| this.resumesInvoked = this.getMetricContext().contextAwareMeter(RuntimeMetrics.GOBBLIN_DAG_ACTION_STORE_MONITOR_RESUMES_INVOKED); | ||
| this.flowsLaunched = this.getMetricContext().contextAwareMeter(RuntimeMetrics.GOBBLIN_DAG_ACTION_STORE_MONITOR_FLOWS_LAUNCHED); | ||
| this.failedFlowLaunchSubmissions = this.getMetricContext().contextAwareMeter(RuntimeMetrics.GOBBLIN_DAG_ACTION_STORE_FAILED_FLOW_LAUNCHED_SUBMISSIONS); | ||
| this.unexpectedErrors = this.getMetricContext().contextAwareMeter(RuntimeMetrics.GOBBLIN_DAG_ACTION_STORE_MONITOR_UNEXPECTED_ERRORS); | ||
| this.messageProcessedMeter = this.getMetricContext().contextAwareMeter(RuntimeMetrics.GOBBLIN_DAG_ACTION_STORE_MONITOR_MESSAGE_PROCESSED); | ||
| this.messageFilteredOutMeter = this.getMetricContext().contextAwareMeter(RuntimeMetrics.GOBBLIN_DAG_ACTION_STORE_MONITOR_MESSAGES_FILTERED_OUT); | ||
| this.produceToConsumeDelayMillis = this.getMetricContext().newContextAwareGauge(RuntimeMetrics.GOBBLIN_DAG_ACTION_STORE_PRODUCE_TO_CONSUME_DELAY_MILLIS, () -> produceToConsumeDelayValue); | ||
| this.getMetricContext().register(this.produceToConsumeDelayMillis); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
wondering... couldn't this be a
`dagManager.` metric? `dagManager.failedLaunchEventsOn...`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I want to be clear that this is a failure that occurs related to handling all
`dagAction` related code changes and easily find them when they may originate from `dagActionStoreMonitor`, `dagManager`, or other locations. We also don't use a `dagManager` prefix for other `dagManager` metrics for some reason