Skip to content

Commit

Permalink
Gateway manager retry kernel updates (#1256)
Browse files Browse the repository at this point in the history
* Double check if a Gateway kernel was culled.

The GatewayMappingKernelManager keeps an internal record of all the remote kernels, and periodically syncs it with the Gateway server.

When it finds that a kernel it previously knew about is no longer in the Gateway server's list of kernels, it has to decide how to reconcile that.

Previously, it was assuming that the kernel was probably culled by the Gateway server, and thus removed it from its internal records.

However, it is conceivable that the list from the upstream Gateway server might have been incomplete due to any combination of bugs, race conditions, or transient network connectivity issues, etc.

If one of those such scenarios occurred, then the previous logic would have declared the kernel as lost.

This change makes the GatewayMappingKernelManager more resilient to such issues by double checking whether or not the kernel was actually removed in the upstream Gateway server.

It does this by attempting to update the GatewayKernelManager instance's model before deciding that the kernel has been culled.

* GatewayClient test for missing kernel list entries

This change extends the test_gateway.py suite to simulate kernels
being transiently missing from kernel list responses.

The new test fails without the update to the GatewayMappingKernelManager
to double check if kernels have been culled, and passes with it.

* Fix a lint error from a missing type annotation.
  • Loading branch information
ojarjur authored Apr 7, 2023
1 parent cd8010e commit 87b2158
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 6 deletions.
29 changes: 26 additions & 3 deletions jupyter_server/gateway/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,34 @@ async def list_kernels(self, **kwargs):
culled_ids = []
for kid, _ in our_kernels.items():
if kid not in kernel_models:
# The upstream kernel was not reported in the list of kernels.
self.log.warning(
f"Kernel {kid} no longer active - probably culled on Gateway server."
f"Kernel {kid} not present in the list of kernels - possibly culled on Gateway server."
)
self._kernels.pop(kid, None)
culled_ids.append(kid) # TODO: Figure out what do with these.
try:
# Try to directly refresh the model for this specific kernel in case
# the upstream list of kernels was erroneously incomplete.
#
# That might happen if the case of a proxy that manages multiple
# backends where there could be transient connectivity issues with
# a single backend.
#
# Alternatively, it could happen if there is simply a bug in the
# upstream gateway server.
#
# Either way, including this check improves our reliability in the
# face of such scenarios.
model = await self._kernels[kid].refresh_model()
except web.HTTPError:
model = None
if model:
kernel_models[kid] = model
else:
self.log.warning(
f"Kernel {kid} no longer active - probably culled on Gateway server."
)
self._kernels.pop(kid, None)
culled_ids.append(kid) # TODO: Figure out what do with these.
return list(kernel_models.values())

async def shutdown_kernel(self, kernel_id, now=False, restart=False):
Expand Down
17 changes: 14 additions & 3 deletions tests/test_gateway.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from http.cookies import SimpleCookie
from io import BytesIO
from queue import Empty
from typing import Any, Union
from typing import Any, Dict, Union
from unittest.mock import MagicMock, patch

import pytest
Expand Down Expand Up @@ -62,6 +62,12 @@ def generate_kernelspec(name):
# maintain a dictionary of expected running kernels. Key = kernel_id, Value = model.
running_kernels = {}

# Dictionary of kernels to transiently omit from list results.
#
# This is used to simulate inconsistency in list results from the Gateway server
# due to issues like race conditions, bugs, etc.
omitted_kernels: Dict[str, bool] = {}


def generate_model(name):
"""Generate a mocked kernel model. Caller is responsible for adding model to running_kernels dictionary."""
Expand Down Expand Up @@ -131,8 +137,11 @@ async def mock_gateway_request(url, **kwargs): # noqa
if endpoint.endswith("/api/kernels") and method == "GET":
kernels = []
for kernel_id in running_kernels:
model = running_kernels.get(kernel_id)
kernels.append(model)
if kernel_id in omitted_kernels:
omitted_kernels.pop(kernel_id)
else:
model = running_kernels.get(kernel_id)
kernels.append(model)
response_buf = BytesIO(json.dumps(kernels).encode("utf-8"))
response = await ensure_async(HTTPResponse(request, 200, buffer=response_buf))
return response
Expand Down Expand Up @@ -453,6 +462,7 @@ async def test_gateway_session_lifecycle(init_gateway, jp_root_dir, jp_fetch, cu

assert await is_session_active(jp_fetch, session_id) is True

omitted_kernels[kernel_id] = True
if cull_kernel:
running_kernels.pop(kernel_id)

Expand Down Expand Up @@ -501,6 +511,7 @@ async def test_gateway_kernel_lifecycle(
# ensure kernel still considered running
assert await is_kernel_running(jp_fetch, kernel_id) is True

omitted_kernels[kernel_id] = True
if cull_kernel:
running_kernels.pop(kernel_id)

Expand Down

0 comments on commit 87b2158

Please sign in to comment.