TEZ-4338: Tez should consider node information to realize OUTPUT_LOST as early as possible - upstream(mapper) problems #152
Changes from all commits
@@ -194,7 +194,7 @@ public static DataEventDependencyInfo fromProto(DataEventDependencyInfoProto pro
     private Container container;
     private long allocationTime;
     private ContainerId containerId;
-    private NodeId containerNodeId;
+    protected NodeId containerNodeId;
     private String nodeHttpAddress;
     private String nodeRackName;
@@ -1793,85 +1793,130 @@ protected static class OutputReportedFailedTransition implements
     MultipleArcTransition<TaskAttemptImpl, TaskAttemptEvent, TaskAttemptStateInternal> {

     @Override
-    public TaskAttemptStateInternal transition(TaskAttemptImpl attempt,
+    public TaskAttemptStateInternal transition(TaskAttemptImpl sourceAttempt,
         TaskAttemptEvent event) {
       TaskAttemptEventOutputFailed outputFailedEvent =
           (TaskAttemptEventOutputFailed) event;
-      TezEvent tezEvent = outputFailedEvent.getInputFailedEvent();
-      TezTaskAttemptID failedDestTaId = tezEvent.getSourceInfo().getTaskAttemptID();
-      InputReadErrorEvent readErrorEvent = (InputReadErrorEvent)tezEvent.getEvent();
+      TezEvent inputFailedEvent = outputFailedEvent.getInputFailedEvent();
+      TezTaskAttemptID failedDestTaId = inputFailedEvent.getSourceInfo().getTaskAttemptID();
+
+      InputReadErrorEvent readErrorEvent = (InputReadErrorEvent)inputFailedEvent.getEvent();
       int failedInputIndexOnDestTa = readErrorEvent.getIndex();
-      if (readErrorEvent.getVersion() != attempt.getID().getId()) {
-        throw new TezUncheckedException(attempt.getID()
+
+      if (readErrorEvent.getVersion() != sourceAttempt.getID().getId()) {
+        throw new TezUncheckedException(sourceAttempt.getID()
            + " incorrectly blamed for read error from " + failedDestTaId
            + " at inputIndex " + failedInputIndexOnDestTa + " version"
            + readErrorEvent.getVersion());
       }
-      LOG.info(attempt.getID()
-          + " blamed for read error from " + failedDestTaId
-          + " at inputIndex " + failedInputIndexOnDestTa);
-      long time = attempt.clock.getTime();
-      Long firstErrReportTime = attempt.uniquefailedOutputReports.get(failedDestTaId);
+      // source host: where the data input is supposed to come from
+      String sHost = sourceAttempt.getNodeId().getHost();
+      // destination: where the data is tried to be fetched to
+      String dHost = readErrorEvent.getDestinationLocalhostName();
+
+      LOG.info("{} (on {}) blamed for read error from {} (on {}) at inputIndex {}", sourceAttempt.getID(),
+          sHost, failedDestTaId, dHost, failedInputIndexOnDestTa);
+
+      boolean tooManyDownstreamHostsBlamedTheSameUpstreamHost = false;
+      Map<String, Set<String>> downstreamBlamingHosts = sourceAttempt.getVertex().getDownstreamBlamingHosts();
rbalamohan (Contributor): Wouldn't the downstreamBlamingHosts structure occupy a lot of memory in the AM on large clusters? E.g. think of a job running at 1000-node scale with 10 vertices. This can easily reach 1000 tasks x 1000 nodes entries, which can put memory pressure on the AM; add the number of vertices to this and it will be even more pressure. How about tracking the downstream hostnames in a set and using that as an optional dimension in the computation? That way, even if multiple task attempts were running on the same host, it would be accounted as a single failure (note that in the current master branch it is accounted multiple times). To be more specific: is it possible to track downstream hostnames in a set and use that set.size() for computing the fraction that determines whether the source has to be re-executed?

abstractdog (Contributor, Author): @rbalamohan: please note that this map is stored at vertex level, not at task attempt level, so the memory occupied is proportional to the number of vertices; do you think there is still a memory pressure problem? The idea of storing it at vertex level is that once we have detected a LOST_OUTPUT for source_attempt_0 on source_host_0, we can immediately mark source_attempt_x on source_host_0 as OUTPUT_LOST when an input read error comes in blaming source_attempt_x; please let me know if this makes sense. This is how downstreamBlamingHosts works: the number of entries is independent of the number of task attempts. I can try the set-based approach; do you have anything particular in mind? Does "track downstream hostnames" mean all downstream hostnames that reported an input read error? I'm also struggling to see how to turn hosts into a fraction, i.e. what to divide by what; I need to think this over, please let me know if you have an idea.

rbalamohan (Contributor): Thought more about this again. Since this map would be populated mainly during error conditions, it would take a lot of bad nodes to cause such a case, so it should be okay to keep the map. The other concern is how to choose the right default config. 1 seems very restrictive, as the source would bail out immediately when a second downstream node reports a failure; set it to a much higher value, and it can be adjusted as needed. Can you check whether this value can be adjusted at runtime? If so, it may need to be added to the "confKeys" of the respective inputs/outputs; e.g. please refer to UnorderedKVInput.

abstractdog (Contributor, Author): Sure, let me think about it.

abstractdog (Contributor, Author): ContainerLauncherContext.getNumNodes seems to be okay because it is updated at runtime: every time a task is allocated, the node is marked as seen, so it does not reflect all nodes, only the ones actually being used. I think I can use this as a maximum and compute a hostFailureFraction as reportingDownstreamHosts / numNodes.

abstractdog (Contributor, Author): @rbalamohan: uploaded a new patch here c523ebb, tested on a cluster. I found that instead of the total number of nodes we need to consider active hosts only. The attached log output was taken when numNodes: 4 reflected only state:ACTIVE nodes (I used a temporary "nodes:" log message); you can see 4 running LLAP daemons and 4 old ones, which show up as UNHEALTHY instead of ACTIVE.
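For clarity, a minimal standalone sketch of the per-vertex bookkeeping described in this thread. It uses plain Java collections and made-up names (DownstreamBlamingTracker, blameAndCheck) rather than the actual Tez types, and an arbitrary 0.2 limit; in the patch the map lives on the Vertex and the limit and active node count come from the vertex config and the node tracker.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Illustrative model only, not the Tez implementation.
public class DownstreamBlamingTracker {
  // upstream (source) host -> distinct downstream hosts that reported a fetch failure from it;
  // one map per vertex, so its size is bounded by the number of hosts, not task attempts
  private final Map<String, Set<String>> downstreamBlamingHosts = new HashMap<>();

  // Records that dHost failed to fetch from sHost and returns true once the fraction of
  // distinct blaming downstream hosts over active nodes exceeds the configured limit.
  public boolean blameAndCheck(String sHost, String dHost, int numActiveNodes, double maxFraction) {
    downstreamBlamingHosts.computeIfAbsent(sHost, k -> new HashSet<>()).add(dHost);
    int blamingHosts = downstreamBlamingHosts.get(sHost).size();
    float hostFailureFraction = numActiveNodes > 0 ? ((float) blamingHosts) / numActiveNodes : 0;
    return hostFailureFraction > maxFraction;
  }

  public static void main(String[] args) {
    DownstreamBlamingTracker tracker = new DownstreamBlamingTracker();
    // 10 active nodes, limit 0.2: the third distinct blaming host tips the source over the limit;
    // a repeated report from the same downstream host would not grow the set.
    System.out.println(tracker.blameAndCheck("src-host-1", "dst-host-1", 10, 0.2)); // false (1/10)
    System.out.println(tracker.blameAndCheck("src-host-1", "dst-host-2", 10, 0.2)); // false (2/10)
    System.out.println(tracker.blameAndCheck("src-host-1", "dst-host-3", 10, 0.2)); // true  (3/10 > 0.2)
  }
}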
+      if (!downstreamBlamingHosts.containsKey(sHost)) {
+        LOG.info("Host {} is blamed for fetch failure from {} for the first time", sHost, dHost);
+        downstreamBlamingHosts.put(sHost, new HashSet<String>());
+      }
+
+      downstreamBlamingHosts.get(sHost).add(dHost);
+      int currentNumberOfFailingDownstreamHosts = downstreamBlamingHosts.get(sHost).size();
+      int numNodes = getNumNodes(sourceAttempt);
+      float hostFailureFraction = numNodes > 0 ? ((float) currentNumberOfFailingDownstreamHosts) / numNodes : 0;
+      double maxAllowedHostFailureFraction = sourceAttempt.getVertex().getVertexConfig()
+          .getMaxAllowedDownstreamHostFailuresFraction();
+
+      if (hostFailureFraction > maxAllowedHostFailureFraction) {
+        LOG.info("Host will be marked fail: {} because of host failure fraction {} is beyond the limit {}", sHost,
+            hostFailureFraction, maxAllowedHostFailureFraction);
+        tooManyDownstreamHostsBlamedTheSameUpstreamHost = true;
+      }
+
+      long time = sourceAttempt.clock.getTime();
+
+      Long firstErrReportTime = sourceAttempt.uniquefailedOutputReports.get(failedDestTaId);
       if (firstErrReportTime == null) {
-        attempt.uniquefailedOutputReports.put(failedDestTaId, time);
+        sourceAttempt.uniquefailedOutputReports.put(failedDestTaId, time);
         firstErrReportTime = time;
       }

-      int maxAllowedOutputFailures = attempt.getVertex().getVertexConfig()
+      int maxAllowedOutputFailures = sourceAttempt.getVertex().getVertexConfig()
           .getMaxAllowedOutputFailures();
-      int maxAllowedTimeForTaskReadErrorSec = attempt.getVertex()
+      int maxAllowedTimeForTaskReadErrorSec = sourceAttempt.getVertex()
           .getVertexConfig().getMaxAllowedTimeForTaskReadErrorSec();
-      double maxAllowedOutputFailuresFraction = attempt.getVertex()
+      double maxAllowedOutputFailuresFraction = sourceAttempt.getVertex()
           .getVertexConfig().getMaxAllowedOutputFailuresFraction();

       int readErrorTimespanSec = (int)((time - firstErrReportTime)/1000);
       boolean crossTimeDeadline = readErrorTimespanSec >= maxAllowedTimeForTaskReadErrorSec;

-      int runningTasks = attempt.appContext.getCurrentDAG().getVertex(
+      int runningTasks = sourceAttempt.appContext.getCurrentDAG().getVertex(
           failedDestTaId.getTaskID().getVertexID()).getRunningTasks();
-      float failureFraction = runningTasks > 0 ? ((float) attempt.uniquefailedOutputReports.size()) / runningTasks : 0;
+      float failureFraction =
+          runningTasks > 0 ? ((float) sourceAttempt.uniquefailedOutputReports.size()) / runningTasks : 0;
       boolean withinFailureFractionLimits =
           (failureFraction <= maxAllowedOutputFailuresFraction);
       boolean withinOutputFailureLimits =
-          (attempt.uniquefailedOutputReports.size() < maxAllowedOutputFailures);
+          (sourceAttempt.uniquefailedOutputReports.size() < maxAllowedOutputFailures);

       // If needed we can launch a background task without failing this task
       // to generate a copy of the output just in case.
       // If needed we can consider only running consumer tasks
       if (!crossTimeDeadline && withinFailureFractionLimits && withinOutputFailureLimits
-          && !(readErrorEvent.isLocalFetch() || readErrorEvent.isDiskErrorAtSource())) {
-        return attempt.getInternalState();
+          && !(readErrorEvent.isLocalFetch() || readErrorEvent.isDiskErrorAtSource())
+          && !tooManyDownstreamHostsBlamedTheSameUpstreamHost) {
+        return sourceAttempt.getInternalState();
       }
-      String message = attempt.getID() + " being failed for too many output errors. "
+      String message = sourceAttempt.getID() + " being failed for too many output errors. "
           + "failureFraction=" + failureFraction
           + ", MAX_ALLOWED_OUTPUT_FAILURES_FRACTION="
           + maxAllowedOutputFailuresFraction
-          + ", uniquefailedOutputReports=" + attempt.uniquefailedOutputReports.size()
+          + ", uniquefailedOutputReports=" + sourceAttempt.uniquefailedOutputReports.size()
           + ", MAX_ALLOWED_OUTPUT_FAILURES=" + maxAllowedOutputFailures
+          + ", hostFailureFraction=" + hostFailureFraction
+          + " (" + currentNumberOfFailingDownstreamHosts + " / " + numNodes + ")"
+          + ", MAX_ALLOWED_DOWNSTREAM_HOST_FAILURES_FRACTION="
+          + maxAllowedHostFailureFraction
           + ", MAX_ALLOWED_TIME_FOR_TASK_READ_ERROR_SEC="
           + maxAllowedTimeForTaskReadErrorSec
           + ", readErrorTimespan=" + readErrorTimespanSec
           + ", isLocalFetch=" + readErrorEvent.isLocalFetch()
           + ", isDiskErrorAtSource=" + readErrorEvent.isDiskErrorAtSource();

       LOG.info(message);
-      attempt.addDiagnosticInfo(message);
+      sourceAttempt.addDiagnosticInfo(message);
       // send input failed event
-      attempt.sendInputFailedToConsumers();
+      sourceAttempt.sendInputFailedToConsumers();
       // Not checking for leafVertex since a READ_ERROR should only be reported for intermediate tasks.
-      if (attempt.getInternalState() == TaskAttemptStateInternal.SUCCEEDED) {
+      if (sourceAttempt.getInternalState() == TaskAttemptStateInternal.SUCCEEDED) {
         (new TerminatedAfterSuccessHelper(FAILED_HELPER)).transition(
-            attempt, event);
+            sourceAttempt, event);
         return TaskAttemptStateInternal.FAILED;
       } else {
         (new TerminatedWhileRunningTransition(FAILED_HELPER)).transition(
-            attempt, event);
+            sourceAttempt, event);
         return TaskAttemptStateInternal.FAIL_IN_PROGRESS;
       }
       // TODO at some point. Nodes may be interested in FetchFailure info.
       // Can be used to blacklist nodes.
     }

+    private int getNumNodes(TaskAttemptImpl sourceAttempt) {
+      Vertex vertex = sourceAttempt.getVertex();
+      String taskSchedulerName = vertex.getServicePluginInfo().getTaskSchedulerName();
+      int sourceIndex = vertex.getAppContext().getTaskScheduerIdentifier(taskSchedulerName);
+      int numActiveNodes = vertex.getAppContext().getNodeTracker().getNumActiveNodes(sourceIndex);
+      if (LOG.isDebugEnabled()) {
+        int numAllNodes = vertex.getAppContext().getNodeTracker().getNumNodes(sourceIndex);
+        LOG.debug("Getting nodes, active/all: {}/{}", numActiveNodes, numAllNodes);
+      }
+      return numActiveNodes;
+    }
   }

   @VisibleForTesting
rbalamohan (Contributor): Mostly looks good. This may not work for small clusters (e.g. < 5 nodes), where a single node failure can cause the source task to be restarted. This can be tweaked later.

abstractdog (Contributor, Author): I see. I'm assuming that on such a small cluster a fraction of 0.25 might work properly: with 4 hosts, 1 failing downstream host won't make the source restart immediately, at least 2 reporting downstream hosts are needed.
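A quick check of that arithmetic as a self-contained Java sketch. The class name SmallClusterFractionCheck is made up for illustration; the 4 nodes, the 0.25 limit, and the strict greater-than comparison follow the comment above and the patched transition.

// Illustrative small-cluster scenario: 4 active nodes, assumed limit 0.25.
public class SmallClusterFractionCheck {
  public static void main(String[] args) {
    int numActiveNodes = 4;
    double maxAllowedFraction = 0.25;
    // 1 blaming downstream host: 1/4 = 0.25, not above the limit -> source attempt survives
    System.out.println((1f / numActiveNodes) > maxAllowedFraction); // false
    // 2 blaming downstream hosts: 2/4 = 0.50, above the limit -> source attempt is failed and re-run
    System.out.println((2f / numActiveNodes) > maxAllowedFraction); // true
  }
}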