Skip to content

Commit

Permalink
Enh: Better error handling of docker startup issues (#138)
Browse files Browse the repository at this point in the history
  • Loading branch information
klieret authored Nov 11, 2024
1 parent bdb2a9a commit 358b28c
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 11 deletions.
18 changes: 11 additions & 7 deletions src/swerex/deployment/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,17 +63,21 @@ async def is_alive(self, *, timeout: float | None = None) -> IsAliveResponse:
raise RuntimeError(msg)
return await self._runtime.is_alive(timeout=timeout)

async def _wait_until_alive(self, timeout: float | None = None):
return await _wait_until_alive(self.is_alive, timeout=timeout, function_timeout=self._runtime_timeout)
async def _wait_until_alive(self, timeout: float = 10.0):
    """Wait until ``self.is_alive`` succeeds; on timeout, log the container output.

    Args:
        timeout: Total number of seconds to wait. Forwarded to the module-level
            ``_wait_until_alive`` helper, with ``self._runtime_timeout`` passed
            as the per-call ``function_timeout``.

    Returns:
        Whatever the module-level ``_wait_until_alive`` helper returns.

    Raises:
        TimeoutError: Re-raised unchanged once the container process (if any)
            has been terminated and its captured output has been logged.
    """
    try:
        return await _wait_until_alive(self.is_alive, timeout=timeout, function_timeout=self._runtime_timeout)
    except TimeoutError:
        self.logger.error("Runtime did not start within timeout. Here's the output from the container process.")
        process = self._container_process
        if process is None:
            # An `assert` here would be stripped under -O and, when it fires,
            # would replace the TimeoutError with an AssertionError.
            self.logger.error("No container process is attached; no output to show.")
            raise
        process.terminate()
        # NOTE(review): presumably a subprocess.Popen whose pipes may be None
        # when they were not captured -- confirm against `start`.
        for stream in (process.stdout, process.stderr):
            if stream is not None:
                # errors="replace": undecodable container output must not hide
                # the timeout behind a UnicodeDecodeError.
                self.logger.error(stream.read().decode(errors="replace"))
        # Bare `raise` re-raises the exception being handled.
        raise

# Returns a newly generated random UUID (uuid.uuid4()) converted to a string.
def _get_token(self) -> str:
return str(uuid.uuid4())

async def start(
self,
*,
timeout: float | None = None,
):
async def start(self, *, timeout: float = 10.0):
"""Starts the runtime."""
port = self._port or find_free_port()
assert self._container_name is None
Expand Down
1 change: 1 addition & 0 deletions src/swerex/deployment/modal.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def from_registry(self, image: str) -> modal.Image:
}
)
secrets = [secret]
self.logger.debug("Docker login credentials were provided")
else:
self.logger.warning("DOCKER_USERNAME and DOCKER_PASSWORD not set. Using public images.")
secrets = None
Expand Down
6 changes: 2 additions & 4 deletions src/swerex/utils/wait.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


async def _wait_until_alive(
function: Callable, timeout: float | None = None, function_timeout: float | None = 0.1, sleep: float = 0.1
function: Callable, timeout: float = 10.0, function_timeout: float | None = 0.1, sleep: float = 0.25
):
"""Wait until the function returns a truthy value.
Expand All @@ -16,8 +16,6 @@ async def _wait_until_alive(
Raises:
TimeoutError
"""
if timeout is None:
timeout = 10
end_time = time.time() + timeout
n_attempts = 0
while time.time() < end_time:
Expand All @@ -28,6 +26,6 @@ async def _wait_until_alive(
n_attempts += 1
msg = (
f"Runtime did not start within {timeout}s (tried to connect {n_attempts} times). "
f"The last await response was: {await_response.message!r}"
f"The last await response was:\n{await_response.message}"
)
raise TimeoutError(msg)

0 comments on commit 358b28c

Please sign in to comment.