Skip to content

Commit 5812d44

Browse files
authored
Retry behavior for synchronous calls during initialization (#2118)
* Retry behavior for synchronous calls during initialization * Add initialization retry tuning * Add implementation note
1 parent 0a27d46 commit 5812d44

File tree

7 files changed

+102
-6
lines changed

7 files changed

+102
-6
lines changed

operator/src/main/java/oracle/kubernetes/operator/TuningParameters.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ public static TuningParameters getInstance() {
3131
public PodTuning getPodTuning();
3232

3333
public static class MainTuning {
34+
public final int initializationRetryDelaySeconds;
3435
public final int domainPresenceFailureRetrySeconds;
3536
public final int domainPresenceFailureRetryMaxCount;
3637
public final int domainPresenceRecheckIntervalSeconds;
@@ -43,6 +44,7 @@ public static class MainTuning {
4344

4445
/**
4546
* create main tuning.
47+
* @param initializationRetryDelaySeconds initialization retry delay
4648
* @param domainPresenceFailureRetrySeconds domain presence failure retry
4749
* @param domainPresenceFailureRetryMaxCount domain presence failure retry max count
4850
* @param domainPresenceRecheckIntervalSeconds domain presence recheck interval
@@ -54,6 +56,7 @@ public static class MainTuning {
5456
* @param eventualLongDelay eventual long delay
5557
*/
5658
public MainTuning(
59+
int initializationRetryDelaySeconds,
5760
int domainPresenceFailureRetrySeconds,
5861
int domainPresenceFailureRetryMaxCount,
5962
int domainPresenceRecheckIntervalSeconds,
@@ -63,6 +66,7 @@ public MainTuning(
6366
int stuckPodRecheckSeconds,
6467
long initialShortDelay,
6568
long eventualLongDelay) {
69+
this.initializationRetryDelaySeconds = initializationRetryDelaySeconds;
6670
this.domainPresenceFailureRetrySeconds = domainPresenceFailureRetrySeconds;
6771
this.domainPresenceFailureRetryMaxCount = domainPresenceFailureRetryMaxCount;
6872
this.domainPresenceRecheckIntervalSeconds = domainPresenceRecheckIntervalSeconds;

operator/src/main/java/oracle/kubernetes/operator/TuningParametersImpl.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ private static void updateTuningParameters() {
4848
private void update() {
4949
MainTuning main =
5050
new MainTuning(
51+
(int) readTuningParameter("initializationRetryDelaySeconds", 5),
5152
(int) readTuningParameter("domainPresenceFailureRetrySeconds", 10),
5253
(int) readTuningParameter("domainPresenceFailureRetryMaxCount", 5),
5354
(int) readTuningParameter("domainPresenceRecheckIntervalSeconds", 120),

operator/src/main/java/oracle/kubernetes/operator/helpers/CallBuilder.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import java.util.List;
1111
import java.util.Map;
1212
import java.util.Optional;
13+
import java.util.concurrent.Callable;
1314
import java.util.function.Consumer;
1415
import javax.annotation.Nonnull;
1516

@@ -66,6 +67,9 @@
6667
import oracle.kubernetes.operator.calls.RetryStrategy;
6768
import oracle.kubernetes.operator.calls.SynchronousCallDispatcher;
6869
import oracle.kubernetes.operator.calls.SynchronousCallFactory;
70+
import oracle.kubernetes.operator.logging.LoggingFacade;
71+
import oracle.kubernetes.operator.logging.LoggingFactory;
72+
import oracle.kubernetes.operator.logging.MessageKeys;
6973
import oracle.kubernetes.operator.work.Step;
7074
import oracle.kubernetes.weblogic.domain.api.WeblogicApi;
7175
import oracle.kubernetes.weblogic.domain.model.Domain;
@@ -77,6 +81,8 @@
7781
/** Simplifies synchronous and asynchronous call patterns to the Kubernetes API Server. */
7882
@SuppressWarnings({"WeakerAccess", "UnusedReturnValue"})
7983
public class CallBuilder {
84+
private static final LoggingFacade LOGGER = LoggingFactory.getLogger("Operator", "Operator");
85+
8086
/** HTTP status code for "Not Found". */
8187
public static final int NOT_FOUND = 404;
8288

@@ -594,6 +600,54 @@ private <T> T executeSynchronousCall(
594600
return DISPATCHER.execute(factory, requestParams, helper);
595601
}
596602

603+
/**
604+
* Execute a synchronous call with a retry on failure.
605+
* @param call The call
606+
* @param retryDelaySeconds Retry delay in seconds
607+
* @param <T> Call return type
608+
* @return Results of operation, if successful
609+
* @throws Exception Exception types other than ApiException, which will cause failure
610+
*/
611+
public <T> T executeSynchronousCallWithRetry(Callable<T> call, int retryDelaySeconds) throws Exception {
612+
/*
613+
* Implementation Note: synchronous calls are only allowed during operator initialization.
614+
* All make-right work must be done with the asynchronous calling pattern. Therefore, since
615+
* we know that this method will only be invoked during operator initialization, we've chosen
616+
* not to put a limit on the number of retries. This is acceptable because the liveness probe will
617+
* eventually kill the operator if the initialization sequence does not complete.
618+
*
619+
* This call was specifically added to address the Istio-related use case where the operator attempts
620+
* to initialize prior to the Istio Envoy sidecar completing its initialization as described in this
621+
* Istio bug: https://github.com/istio/istio/issues/11130. However, the pattern will also work for
622+
* use cases where the Kubernetes master happens to temporarily unavailable just as the operator is
623+
* starting.
624+
*/
625+
T result = null;
626+
boolean complete = false;
627+
do {
628+
try {
629+
result = call.call();
630+
complete = true;
631+
} catch (RuntimeException re) {
632+
Throwable cause = re.getCause();
633+
if (cause instanceof ApiException) {
634+
LOGGER.warning(MessageKeys.EXCEPTION, cause);
635+
}
636+
} catch (Throwable t) {
637+
LOGGER.warning(MessageKeys.EXCEPTION, t);
638+
}
639+
640+
if (complete) {
641+
break;
642+
}
643+
644+
Thread.sleep(retryDelaySeconds * 1000L);
645+
646+
// We are intentionally not limiting the number of retries as described in the implementation note above.
647+
} while (true);
648+
return result;
649+
}
650+
597651
/**
598652
* Read namespace.
599653
*

operator/src/main/java/oracle/kubernetes/operator/helpers/HealthCheckHelper.java

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
import java.util.Map;
99
import javax.annotation.Nonnull;
1010

11-
import io.kubernetes.client.openapi.ApiException;
1211
import io.kubernetes.client.openapi.models.V1ResourceRule;
1312
import io.kubernetes.client.openapi.models.V1SelfSubjectRulesReview;
1413
import io.kubernetes.client.openapi.models.V1SubjectRulesReviewStatus;
1514
import io.kubernetes.client.openapi.models.VersionInfo;
1615
import oracle.kubernetes.operator.Main;
16+
import oracle.kubernetes.operator.TuningParameters;
1717
import oracle.kubernetes.operator.helpers.AuthorizationProxy.Operation;
1818
import oracle.kubernetes.operator.helpers.AuthorizationProxy.Resource;
1919
import oracle.kubernetes.operator.logging.LoggingFacade;
@@ -205,9 +205,12 @@ public static KubernetesVersion performK8sVersionCheck() {
205205
LOGGER.fine(MessageKeys.VERIFY_K8S_MIN_VERSION);
206206

207207
try {
208-
return createAndValidateKubernetesVersion(new CallBuilder().readVersionCode());
209-
} catch (ApiException ae) {
210-
LOGGER.warning(MessageKeys.K8S_VERSION_CHECK_FAILURE, ae);
208+
CallBuilder cb = new CallBuilder();
209+
return createAndValidateKubernetesVersion(
210+
cb.executeSynchronousCallWithRetry(() -> cb.readVersionCode(),
211+
TuningParameters.getInstance().getMainTuning().initializationRetryDelaySeconds));
212+
} catch (Throwable t) {
213+
LOGGER.warning(MessageKeys.K8S_VERSION_CHECK_FAILURE, t);
211214
return KubernetesVersion.UNREADABLE;
212215
}
213216
}

operator/src/test/java/oracle/kubernetes/operator/helpers/CallBuilderTest.java

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ private static String toJson(Object object) {
5656

5757
@Before
5858
public void setUp() throws NoSuchFieldException {
59-
mementos.add(TestUtils.silenceOperatorLogger());
59+
mementos.add(TestUtils.silenceOperatorLogger().ignoringLoggedExceptions(ApiException.class));
6060
mementos.add(PseudoServletCallDispatcher.install(getHostPath()));
6161
}
6262

@@ -78,6 +78,34 @@ public void getVersionCode_returnsAVersionInfo() throws ApiException {
7878
assertThat(callBuilder.readVersionCode(), equalTo(versionInfo));
7979
}
8080

81+
@Test
82+
public void getVersionCode_firstAttemptFailsAndThenReturnsAVersionInfo() throws Exception {
83+
VersionInfo versionInfo = new VersionInfo().major("1").minor("2");
84+
defineHttpGetResponse("/version/", new FailOnceGetServlet(versionInfo, HTTP_BAD_REQUEST));
85+
86+
assertThat(callBuilder.executeSynchronousCallWithRetry(
87+
() -> callBuilder.readVersionCode(), 1), equalTo(versionInfo));
88+
}
89+
90+
static class FailOnceGetServlet extends JsonGetServlet {
91+
92+
final int errorCode;
93+
int numGetResponseCalled = 0;
94+
95+
FailOnceGetServlet(Object returnValue, int errorCode) {
96+
super(returnValue);
97+
this.errorCode = errorCode;
98+
}
99+
100+
@Override
101+
public WebResource getGetResponse() throws IOException {
102+
if (numGetResponseCalled++ > 0) {
103+
return super.getGetResponse();
104+
}
105+
return new WebResource("", errorCode);
106+
}
107+
}
108+
81109
@Test
82110
public void listDomains_returnsListasJson() throws ApiException {
83111
DomainList list = new DomainList().withItems(Arrays.asList(new Domain(), new Domain()));
@@ -132,6 +160,11 @@ private JsonServlet defineHttpGetResponse(String resourceName, Object response)
132160
return servlet;
133161
}
134162

163+
private void defineHttpGetResponse(
164+
String resourceName, PseudoServlet pseudoServlet) {
165+
defineResource(resourceName, pseudoServlet);
166+
}
167+
135168
private void defineHttpPostResponse(String resourceName, Object response) {
136169
defineResource(resourceName, new JsonPostServlet(response));
137170
}

operator/src/test/java/oracle/kubernetes/operator/helpers/TuningParametersStub.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ public static void setParameter(String key, String value) {
6565

6666
@Override
6767
public MainTuning getMainTuning() {
68-
return new MainTuning(2, 2, 2, 2, 2, 2, 30, 2L, 2L);
68+
return new MainTuning(5, 2, 2, 2, 2, 2, 2, 30, 2L, 2L);
6969
}
7070

7171
@Override

operator/src/test/java/oracle/kubernetes/operator/helpers/VersionCheckTest.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ public void setUp() throws Exception {
126126
consoleControl = TestUtils.silenceOperatorLogger().collectLogMessages(logRecords, LOG_KEYS);
127127
mementos.add(consoleControl);
128128
mementos.add(ClientFactoryStub.install());
129+
mementos.add(TuningParametersStub.install());
129130
mementos.add(testSupport.installSynchronousCallDispatcher());
130131
}
131132

0 commit comments

Comments
 (0)