Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gateway manager retry kernel updates #1256

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 26 additions & 3 deletions jupyter_server/gateway/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,34 @@ async def list_kernels(self, **kwargs):
culled_ids = []
for kid, _ in our_kernels.items():
if kid not in kernel_models:
# The upstream kernel was not reported in the list of kernels.
self.log.warning(
f"Kernel {kid} no longer active - probably culled on Gateway server."
f"Kernel {kid} not present in the list of kernels - possibly culled on Gateway server."
)
self._kernels.pop(kid, None)
culled_ids.append(kid) # TODO: Figure out what do with these.
try:
# Try to directly refresh the model for this specific kernel in case
# the upstream list of kernels was erroneously incomplete.
#
# That might happen if the case of a proxy that manages multiple
# backends where there could be transient connectivity issues with
# a single backend.
#
# Alternatively, it could happen if there is simply a bug in the
# upstream gateway server.
#
# Either way, including this check improves our reliability in the
# face of such scenarios.
model = await self._kernels[kid].refresh_model()
except web.HTTPError:
model = None
if model:
kernel_models[kid] = model
else:
self.log.warning(
f"Kernel {kid} no longer active - probably culled on Gateway server."
)
self._kernels.pop(kid, None)
culled_ids.append(kid) # TODO: Figure out what do with these.
return list(kernel_models.values())

async def shutdown_kernel(self, kernel_id, now=False, restart=False):
Expand Down
17 changes: 14 additions & 3 deletions tests/test_gateway.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from http.cookies import SimpleCookie
from io import BytesIO
from queue import Empty
from typing import Any, Union
from typing import Any, Dict, Union
from unittest.mock import MagicMock, patch

import pytest
Expand Down Expand Up @@ -62,6 +62,12 @@ def generate_kernelspec(name):
# maintain a dictionary of expected running kernels. Key = kernel_id, Value = model.
running_kernels = {}

# Dictionary of kernels to transiently omit from list results.
#
# This is used to simulate inconsistency in list results from the Gateway server
# due to issues like race conditions, bugs, etc.
omitted_kernels: Dict[str, bool] = {}


def generate_model(name):
"""Generate a mocked kernel model. Caller is responsible for adding model to running_kernels dictionary."""
Expand Down Expand Up @@ -131,8 +137,11 @@ async def mock_gateway_request(url, **kwargs): # noqa
if endpoint.endswith("/api/kernels") and method == "GET":
kernels = []
for kernel_id in running_kernels:
model = running_kernels.get(kernel_id)
kernels.append(model)
if kernel_id in omitted_kernels:
omitted_kernels.pop(kernel_id)
else:
model = running_kernels.get(kernel_id)
kernels.append(model)
response_buf = BytesIO(json.dumps(kernels).encode("utf-8"))
response = await ensure_async(HTTPResponse(request, 200, buffer=response_buf))
return response
Expand Down Expand Up @@ -453,6 +462,7 @@ async def test_gateway_session_lifecycle(init_gateway, jp_root_dir, jp_fetch, cu

assert await is_session_active(jp_fetch, session_id) is True

omitted_kernels[kernel_id] = True
if cull_kernel:
running_kernels.pop(kernel_id)

Expand Down Expand Up @@ -501,6 +511,7 @@ async def test_gateway_kernel_lifecycle(
# ensure kernel still considered running
assert await is_kernel_running(jp_fetch, kernel_id) is True

omitted_kernels[kernel_id] = True
if cull_kernel:
running_kernels.pop(kernel_id)

Expand Down