Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provide detailed diagnostic message when allocation timeout #660

Merged
merged 1 commit into from
Apr 17, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import com.linkedin.tony.util.Utils;

import static com.linkedin.tony.Constants.SIDECAR_TB_ROLE_NAME;
import static com.linkedin.tony.TonyConfigurationKeys.CONTAINER_ALLOCATION_TIMEOUT;
import static com.linkedin.tony.TonyConfigurationKeys.getGroupDependentIgnoredKey;

public abstract class MLGenericRuntime extends AbstractFrameworkRuntime {
Expand Down Expand Up @@ -128,8 +129,9 @@ public boolean isHealthy(Configuration tonyConf) {
* https://github.com/linkedin/TonY/issues/573.
* So it is necessary to release the container resources reserved on the AM once the container allocation timeout is reached in GANG mode.
*/
if (containerAllocationTimeout(tonyConf)) {
session.setFinalStatus(FinalApplicationStatus.FAILED, "Container allocation timeout.");
String diagnostics = containerAllocationTimeout(tonyConf);
if (diagnostics != null) {
session.setFinalStatus(FinalApplicationStatus.FAILED, diagnostics);
return false;
}

Expand Down Expand Up @@ -276,29 +278,31 @@ private Map<String, List<String>> getMemberInGroups(Map<String, List<String>> gr
return memberInGroups;
}

/**
 * Checks whether task container allocation has timed out.
 *
 * NOTE(review): this span is a rendered diff hunk without +/- markers. Where two near-identical
 * lines appear back to back, the first appears to be the removed line and the second its
 * replacement (the boolean/false/true results become String/null/diagnostics).
 *
 * In the replacement version the method returns null when the distributed mode is not GANG,
 * when the configured timeout is not positive, or when the timeout condition has not been hit.
 * Otherwise it logs the diagnostic message at error level and returns it.
 *
 * @param tonyConf configuration from which the distributed mode and allocation timeout are read
 * @return the diagnostic message when allocation has timed out, otherwise null
 */
private boolean containerAllocationTimeout(Configuration tonyConf) {
private String containerAllocationTimeout(Configuration tonyConf) {
String distributedModeVal = tonyConf.get(TonyConfigurationKeys.APPLICATION_DISTRIBUTED_MODE,
TonyConfigurationKeys.DEFAULT_APPLICATION_DISTRIBUTED_MODE);
TonyConfigurationKeys.DistributedMode distributedMode =
TonyConfigurationKeys.DistributedMode.valueOf(distributedModeVal.toUpperCase());
// The timeout check only applies in GANG mode; any other mode reports "no timeout".
if (distributedMode != TonyConfigurationKeys.DistributedMode.GANG) {
return false;
return null;
}

// A non-positive timeout disables the check (per the original comment, this is the case when
// the timeout is not configured): the old code returned false here, the new code returns null.
int containerAllocationTimeout = tonyConf.getInt(TonyConfigurationKeys.CONTAINER_ALLOCATION_TIMEOUT,
int containerAllocationTimeout = tonyConf.getInt(CONTAINER_ALLOCATION_TIMEOUT,
TonyConfigurationKeys.DEFAULT_CONTAINER_ALLOCATION_TIMEOUT);
if (containerAllocationTimeout <= 0) {
return false;
return null;
}

// Timed out when some tasks are still unregistered and the time elapsed since
// runtimeInitialTime exceeds the timeout. The comparison against a currentTimeMillis()
// difference suggests the timeout is in milliseconds; confirm against the config docs.
if (session.getTotalTasks() - session.getNumRegisteredTasks() > 0
&& System.currentTimeMillis() - runtimeInitialTime > containerAllocationTimeout) {
log.error("Container Allocation timeout, total required tasks number: " + session.getTotalTasks()
+ ", allocated tasks number: " + session.getRegisteredTasks());
return true;
// NOTE(review): the condition above uses getNumRegisteredTasks(), but the message labelled
// "allocated number" uses getRegisteredTasks(). Confirm this prints a count, not a collection.
String diagnostics = "Task executors allocation timeout(" + CONTAINER_ALLOCATION_TIMEOUT
+ "=" + containerAllocationTimeout + "). Total required number: "
+ session.getTotalTasks() + ", allocated number: " + session.getRegisteredTasks();
log.error(diagnostics);
return diagnostics;
}
return false;
return null;
}

@VisibleForTesting
Expand Down