Skip to content
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
3b66856
IGNITE-13021 : First impl.
Vladsz83 Apr 4, 2020
675c069
IGNITE-13012 : merged with master. Minor fixes.
Vladsz83 May 22, 2020
e4ddf05
IGNITE-13012 : merged with master. Minor fixes.
Vladsz83 May 22, 2020
bf93ac1
IGNITE-13012 : halt timeouts on the ping.
Vladsz83 May 25, 2020
e729270
Merge remote-tracking branch 'origin/IGNITE-13012' into IGNITE-13012
Vladsz83 May 25, 2020
47a9f7d
IGNITE-13012 : +test.
Vladsz83 May 25, 2020
2c929fa
IGNITE-13012 : redeem of the timeouts. Fixed test.
Vladsz83 May 25, 2020
245943a
IGNITE-13012 : redeem of the timeouts. Fixed test.
Vladsz83 May 25, 2020
f7d58ae
Merge branch 'master' into IGNITE-13012
Vladsz83 May 26, 2020
8f4dabf
IGNITE-13012 : Fixed tests. + a test.
Vladsz83 May 27, 2020
62f5d6a
IGNITE-13012 : fix of coordinator failure test.
Vladsz83 May 27, 2020
dc23756
IGNITE-13012 : test fix
Vladsz83 May 28, 2020
7089343
IGNITE-13012 : Reverted tests. Failure detection timeout is shared with
Vladsz83 Jun 2, 2020
3515f40
IGNITE-13012 : fix.
Vladsz83 Jun 2, 2020
9dca4f1
IGNITE-13012 : + test.
Vladsz83 Jun 3, 2020
bd00c20
IGNITE-13012 : + test fix.
Vladsz83 Jun 3, 2020
c464725
IGNITE-13012 : +10ms as the timer granulation.
Vladsz83 Jun 4, 2020
5370831
IGNITE-13012 : test fixes.
Vladsz83 Jun 4, 2020
a9ad35e
IGNITE-13012 : + 10ms as acceptable code delay.
Vladsz83 Jun 4, 2020
a8fad43
IGNITE-13012 : test redeemed.
Vladsz83 Jun 5, 2020
45c426f
IGNITE-13016 : faster test.
Vladsz83 Jun 5, 2020
0d58fe4
Revert "IGNITE-13134 : test duration fix.
Vladsz83 Jun 8, 2020
7f7a608
IGNITE-13012 : faster test.
Vladsz83 Jun 9, 2020
dde7e7c
IGNITE-13012 : minority.
Vladsz83 Jun 9, 2020
7b40043
Merge branch 'master' into IGNITE-13012
Vladsz83 Jun 9, 2020
e1b9735
IGNITE-13012 : spelling fix.
Vladsz83 Jun 15, 2020
1b07dd5
IGNITE-13012 : empty lines.
Vladsz83 Jun 15, 2020
a4be000
IGNITE-13012 :renaming.
Vladsz83 Jun 15, 2020
d9c3108
IGNITE-13012 :renamings. Removes test.
Vladsz83 Jun 15, 2020
71435c2
reverted removal of 'public'
Vladsz83 Jun 23, 2020
322242a
IGNITE-13012 : removed redundant hasRemoteSrvNodes
Vladsz83 Jun 23, 2020
be6e2ed
Merge branch 'master' into IGNITE-13012
Vladsz83 Jun 23, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,22 @@ public IgniteSpiOperationTimeoutHelper(IgniteSpiAdapter adapter, boolean srvOp)
adapter.clientFailureDetectionTimeout();
}

/**
 * Creates timeout helper based on time of last related operation.
 *
 * @param adapter SPI adapter.
 * @param srvOp {@code True} if communicates with server node.
 * @param lastOperStartNanos Time of last related operation in nanos, in the {@link System#nanoTime()} time base.
 *      {@code 0} is treated as "not set": the timeout is then left as initialized by the two-argument
 *      constructor.
 */
public IgniteSpiOperationTimeoutHelper(IgniteSpiAdapter adapter, boolean srvOp, long lastOperStartNanos) {
    this(adapter, srvOp);

    this.lastOperStartNanos = lastOperStartNanos;

    // System.nanoTime() counts from an arbitrary origin and may legitimately be negative, so only zero
    // can serve as the "no previous operation" marker. A "> 0" check would ignore valid negative timestamps.
    if (lastOperStartNanos != 0)
        timeout = failureDetectionTimeout;
}

/**
* Returns a timeout value to use for the next network operation.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,11 @@ class ServerImpl extends TcpDiscoveryImpl {
/** */
private static final TcpDiscoveryAbstractMessage WAKEUP = new TcpDiscoveryDummyWakeupMessage();

/** When this interval passes, a connection check is performed. */
private static final int CON_CHECK_INTERVAL = 500;
/** Maximal interval of connection check to next node in the ring. */
private static final long MAX_CON_CHECK_INTERVAL = 500;

/** Interval of checking connection to next node in the ring. */
private long connCheckInterval;

/** */
private IgniteThreadPoolExecutor utilityPool;
Expand Down Expand Up @@ -275,6 +278,9 @@ class ServerImpl extends TcpDiscoveryImpl {
/** Last time received message from ring. */
private volatile long lastRingMsgReceivedTime;

/** Time of last sent and acknowledged message. */
private volatile long lastRingMsgSentTime;

/** */
private volatile boolean nodeCompactRepresentationSupported =
true; //assume that local node supports this feature
Expand Down Expand Up @@ -356,8 +362,8 @@ class ServerImpl extends TcpDiscoveryImpl {
}

/** {@inheritDoc} */
@Override public long connectionCheckInterval() {
return CON_CHECK_INTERVAL;
@Override long connectionCheckInterval() {
return connCheckInterval;
}

/** {@inheritDoc} */
Expand All @@ -368,6 +374,15 @@ class ServerImpl extends TcpDiscoveryImpl {

lastRingMsgReceivedTime = 0;

lastRingMsgSentTime = 0;

long msgExchangeTimeout = spi.failureDetectionTimeoutEnabled() ? spi.failureDetectionTimeout() :
spi.getSocketTimeout() + spi.getAckTimeout();

// Since we take into account the time of the last sent message, the interval should be short enough to leave
// a sufficient part of the failure detection timeout as the send-and-acknowledge timeout of the next message.
connCheckInterval = Math.min(msgExchangeTimeout / 4, MAX_CON_CHECK_INTERVAL);

utilityPool = new IgniteThreadPoolExecutor("disco-pool",
spi.ignite().name(),
0,
Expand Down Expand Up @@ -2846,15 +2861,6 @@ private class RingMessageWorker extends MessageWorker<TcpDiscoveryAbstractMessag
/** Last time metrics update message has been sent. */
private long lastTimeMetricsUpdateMsgSentNanos = System.nanoTime() - U.millisToNanos(spi.metricsUpdateFreq);

/** Time when the last status message has been sent. */
private long lastTimeConnCheckMsgSent;

/** Flag that keeps info on whether the threshold is reached or not. */
private boolean failureThresholdReached;

/** Connection check threshold. */
private long connCheckThreshold;

/** */
private long lastRingMsgTimeNanos;

Expand All @@ -2873,8 +2879,6 @@ private class RingMessageWorker extends MessageWorker<TcpDiscoveryAbstractMessag
private RingMessageWorker(IgniteLogger log) {
super("tcp-disco-msg-worker-[]", log, 10, getWorkerRegistry(spi));

initConnectionCheckThreshold();

setBeforeEachPollAction(() -> {
updateHeartbeat();

Expand Down Expand Up @@ -3025,19 +3029,6 @@ private void nullifyDiscoData() {
joiningNodesDiscoDataList = null;
}

/**
 * Computes the connection check threshold and reports it at info level. The failure detection timeout is
 * taken when it is enabled; otherwise the smaller of the socket timeout and the metrics update frequency.
 */
private void initConnectionCheckThreshold() {
    connCheckThreshold = spi.failureDetectionTimeoutEnabled()
        ? spi.failureDetectionTimeout()
        : Math.min(spi.getSocketTimeout(), spi.metricsUpdateFreq);

    // Nothing more to do when info logging is off.
    if (!log.isInfoEnabled())
        return;

    log.info("Connection check threshold is calculated: " + connCheckThreshold);
}

/**
*
*/
Expand Down Expand Up @@ -3146,9 +3137,6 @@ else if (msg instanceof TcpDiscoveryAuthFailedMessage)
if (msg.senderNodeId() != null && !msg.senderNodeId().equals(getLocalNodeId())) {
// Received a message from remote node.
onMessageExchanged();

// Reset the failure flag.
failureThresholdReached = false;
}

if (next != null && sock != null) {
Expand Down Expand Up @@ -3469,6 +3457,8 @@ else if (log.isTraceEnabled())
}
}

updateLastSentMessageTime();

if (log.isDebugEnabled())
log.debug("Initialized connection with next node: " + next.id());

Expand Down Expand Up @@ -3559,8 +3549,10 @@ else if (!spi.failureDetectionTimeoutEnabled() && (e instanceof

addFailedNodes(pendingMsg, failedNodes);

if (timeoutHelper == null)
timeoutHelper = new IgniteSpiOperationTimeoutHelper(spi, true);
if (timeoutHelper == null) {
timeoutHelper = new IgniteSpiOperationTimeoutHelper(spi, true,
lastRingMsgSentTime);
}

try {
spi.writeToSocket(sock, out, pendingMsg, timeoutHelper.nextTimeoutChunk(
Expand All @@ -3574,6 +3566,8 @@ else if (!spi.failureDetectionTimeoutEnabled() && (e instanceof

int res = spi.readReceipt(sock, timeoutHelper.nextTimeoutChunk(ackTimeout0));

updateLastSentMessageTime();

spi.stats.onMessageSent(pendingMsg, U.nanosToMillis(tsNanos0 - tsNanos));

if (log.isDebugEnabled())
Expand Down Expand Up @@ -3602,7 +3596,7 @@ else if (!spi.failureDetectionTimeoutEnabled() && (e instanceof
long tsNanos = System.nanoTime();

if (timeoutHelper == null)
timeoutHelper = new IgniteSpiOperationTimeoutHelper(spi, true);
timeoutHelper = new IgniteSpiOperationTimeoutHelper(spi, true, lastRingMsgSentTime);

addFailedNodes(msg, failedNodes);

Expand All @@ -3621,6 +3615,8 @@ else if (!spi.failureDetectionTimeoutEnabled() && (e instanceof

int res = spi.readReceipt(sock, timeoutHelper.nextTimeoutChunk(ackTimeout0));

updateLastSentMessageTime();

if (latencyCheck && log.isInfoEnabled())
log.info("Latency check message has been acked: " + msg.id());

Expand Down Expand Up @@ -6192,40 +6188,21 @@ private void checkMetricsReceiving() {
}

/**
* Check connection aliveness status.
* Check connection to next node in the ring.
*/
private void checkConnection() {
Boolean hasRemoteSrvNodes = null;

if (spi.failureDetectionTimeoutEnabled() && !failureThresholdReached &&
U.millisSinceNanos(locNode.lastExchangeTimeNanos()) >= connCheckThreshold &&
spiStateCopy() == CONNECTED &&
(hasRemoteSrvNodes = ring.hasRemoteServerNodes())) {

if (log.isInfoEnabled())
log.info("Local node seems to be disconnected from topology (failure detection timeout " +
"is reached) [failureDetectionTimeout=" + spi.failureDetectionTimeout() +
", connCheckInterval=" + CON_CHECK_INTERVAL + ']');

failureThresholdReached = true;

// Reset sent time deliberately to force sending connection check message.
lastTimeConnCheckMsgSent = 0;
}

long elapsed = (lastTimeConnCheckMsgSent + CON_CHECK_INTERVAL) - U.currentTimeMillis();
long elapsed = (lastRingMsgSentTime + U.millisToNanos(connCheckInterval)) - System.nanoTime();

if (elapsed > 0)
return;

if (hasRemoteSrvNodes == null)
hasRemoteSrvNodes = ring.hasRemoteServerNodes();

if (hasRemoteSrvNodes) {
if (hasRemoteSrvNodes)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not call the updateLastSentMessageTime method here as well?

Copy link
Contributor Author

@Vladsz83 Vladsz83 Jun 23, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not call the updateLastSentMessageTime method here as well?

We haven't successfully sent the message at this point: we haven't received RES_OK yet.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you can see, we call updateLastSentMessageTime() after successfully reading the receipt with spi.readReceipt or after receiving a proper TcpDiscoveryHandshakeResponse. These are the places where we are sure the message was sent and the connection is OK.

sendMessageAcrossRing(new TcpDiscoveryConnectionCheckMessage(locNode));

lastTimeConnCheckMsgSent = U.currentTimeMillis();
}
}

/** {@inheritDoc} */
Expand All @@ -6234,6 +6211,11 @@ private void checkConnection() {
}
}

/** Records the current {@link System#nanoTime()} value as the time of the last sent message. */
private void updateLastSentMessageTime() {
    long nowNanos = System.nanoTime();

    lastRingMsgSentTime = nowNanos;
}

/** Thread that executes {@link TcpServer}'s code. */
private class TcpServerThread extends IgniteSpiThread {
/** */
Expand Down Expand Up @@ -6569,7 +6551,7 @@ else if (req.changeTopology()) {
long now = U.currentTimeMillis();

// We got message from previous in less than double connection check interval.
boolean ok = rcvdTime + CON_CHECK_INTERVAL * 2 >= now;
boolean ok = rcvdTime + connCheckInterval * 2 >= now;
TcpDiscoveryNode previous = null;

if (ok) {
Expand Down Expand Up @@ -6618,7 +6600,7 @@ else if (req.changeTopology()) {
", checkPreviousNodeId=" + req.checkPreviousNodeId() +
", actualPreviousNode=" + previous +
", lastMessageReceivedTime=" + rcvdTime + ", now=" + now +
", connCheckInterval=" + CON_CHECK_INTERVAL + ']');
", connCheckInterval=" + connCheckInterval + ']');
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ public int boundPort() throws IgniteSpiException {
/**
* @return connection check interval.
*/
public long connectionCheckInterval() {
long connectionCheckInterval() {
return 0;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ public class CacheContinuousQueryFilterDeploymentFailedTest extends GridCommonAb
@Override protected IgniteConfiguration getConfiguration(String igniteInstanceName) throws Exception {
IgniteConfiguration cfg = super.getConfiguration(igniteInstanceName);

// Failure detection timeout > P2P class loading timeout which is set as network timeout.
cfg.setFailureDetectionTimeout(cfg.getNetworkTimeout() * 2);

((TestTcpDiscoverySpi)cfg.getDiscoverySpi()).discoveryHook(new DiscoveryHook() {
@Override public void afterDiscovery(DiscoveryCustomMessage customMsg) {
if (customMsg instanceof StopRoutineDiscoveryMessage)
Expand Down