-
Notifications
You must be signed in to change notification settings - Fork 1
feat(web,setup): WP-6 frontend + UX polish #1941
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 28 commits
aebae71
83b4c6a
bbd4ad6
cb60573
feeccaa
5344532
13a620b
7549178
c064034
fd00b5f
e8a03cd
9222222
7cb2a3b
e77bce8
fa6b3b6
aaf15a9
791c0cd
84f1373
3617d85
ce92ce2
0001d63
f259452
39ce011
3b818e0
59b0df4
52a23ac
6e6a266
32c97e5
eb5aa0c
c5ef146
b7ab105
3768913
03baf13
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -98,10 +98,40 @@ async def _resolve_sse_keepalive_seconds(app_state: AppState | None) -> float: | |
| # control-character session IDs reaching the hub. | ||
| _SESSION_ID_PATTERN = r"^[a-zA-Z0-9_-]{1,128}$" | ||
|
|
||
| # Maximum consecutive revalidation failures (transient persistence | ||
| # blips) before the SSE stream terminates so the client can reconnect | ||
| # against a healthy replica. | ||
| _SSE_REVALIDATE_MAX_FAILURES: int = 3 | ||
| # Fallback for ``api.sse_revalidate_max_failures`` when the settings | ||
| # chain is unavailable (test harness, anonymous boot, resolver outage). | ||
| # Mirrors the registry default in | ||
| # ``src/synthorg/settings/definitions/api.py``. | ||
| _SSE_REVALIDATE_MAX_FAILURES_FALLBACK: int = 3 | ||
|
|
||
|
|
||
| async def _resolve_sse_revalidate_max_failures(app_state: AppState | None) -> int: | ||
| """Resolve the SSE revalidation failure tolerance through settings. | ||
|
|
||
| Falls back to :data:`_SSE_REVALIDATE_MAX_FAILURES_FALLBACK` when no | ||
| :class:`ConfigResolver` is wired (test harness, anonymous boot) or | ||
| when the resolver itself raises -- a transient settings outage | ||
| must not collapse the failure ceiling to zero. | ||
| """ | ||
| if app_state is None or not getattr(app_state, "has_config_resolver", False): | ||
| return _SSE_REVALIDATE_MAX_FAILURES_FALLBACK | ||
| try: | ||
| return await app_state.config_resolver.get_int( | ||
| "api", "sse_revalidate_max_failures" | ||
| ) | ||
| except asyncio.CancelledError: | ||
| raise | ||
| except MemoryError, RecursionError: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Invalid syntax for catching multiple exceptions. In Python 3, multiple exceptions must be caught using a parenthesized tuple. The current syntax except (MemoryError, RecursionError): |
||
| raise | ||
| except Exception as exc: | ||
| logger.warning( | ||
| EVENT_STREAM_PROJECTION_FAILED, | ||
| note="failed to resolve api.sse_revalidate_max_failures; using fallback", | ||
| error_type=type(exc).__name__, | ||
| error=safe_error_description(exc), | ||
| fallback=_SSE_REVALIDATE_MAX_FAILURES_FALLBACK, | ||
| ) | ||
| return _SSE_REVALIDATE_MAX_FAILURES_FALLBACK | ||
|
|
||
|
|
||
| async def _user_revocation_reason( | ||
|
|
@@ -116,7 +146,7 @@ async def _user_revocation_reason( | |
| must kick a live SSE stream within one revalidation interval). | ||
|
|
||
| ``ok`` is False when the persistence call itself failed (transient | ||
| backend error). Callers tolerate ``_SSE_REVALIDATE_MAX_FAILURES`` | ||
| backend error). Callers tolerate ``api.sse_revalidate_max_failures`` | ||
| consecutive ``ok=False`` ticks before tearing down the stream. | ||
| """ | ||
| try: | ||
|
|
@@ -346,13 +376,18 @@ async def _run_revalidation_tick( | |
| app_state: AppState, | ||
| user: AuthenticatedUser, | ||
| consecutive_failures: int, | ||
| max_failures: int, | ||
| ) -> _RevalidationVerdict: | ||
| """Execute one revalidation check and return what the loop should do. | ||
|
|
||
| Centralises the failure-counter / role-check / session-revocation | ||
| decision tree so :func:`_sse_event_stream` does not exceed the | ||
| McCabe complexity ceiling. The caller advances its | ||
| ``next_revalidate_ts`` regardless of the verdict. | ||
|
|
||
| ``max_failures`` is the resolved ``api.sse_revalidate_max_failures`` | ||
| setting; the loop tolerates this many consecutive transient | ||
| persistence errors before yielding a ``revoked`` frame. | ||
| """ | ||
| reason, ok = await _user_revocation_reason( | ||
| app_state, | ||
|
|
@@ -361,7 +396,11 @@ async def _run_revalidation_tick( | |
| ) | ||
| if not ok: | ||
| new_failures = consecutive_failures + 1 | ||
| if new_failures >= _SSE_REVALIDATE_MAX_FAILURES: | ||
| # Strictly greater-than: the docstring contract is to tolerate | ||
| # ``max_failures`` consecutive transient errors and revoke only | ||
| # once that ceiling is exceeded (failure max_failures+1), not on | ||
| # the max_failures-th failure itself. | ||
| if new_failures > max_failures: | ||
| return _RevalidationVerdict( | ||
| consecutive_failures=new_failures, | ||
| revoked_event={ | ||
|
|
@@ -394,8 +433,8 @@ async def _sse_event_stream( # noqa: PLR0915, PLR0912, C901 | |
| independent revalidation deadline (``SSE_REVALIDATE_INTERVAL_SECONDS``) | ||
| and fires it even on busy streams that never hit a keepalive | ||
| timeout. On revocation, yields a final ``revoked`` event | ||
| and terminates the stream. Tolerates ``_SSE_REVALIDATE_MAX_FAILURES`` | ||
| transient persistence errors before escalating. | ||
| and terminates the stream. Tolerates ``api.sse_revalidate_max_failures`` | ||
| consecutive transient persistence errors before escalating. | ||
| """ | ||
| consecutive_failures = 0 | ||
| # Track the disconnect reason by exit path so the | ||
|
|
@@ -420,6 +459,7 @@ async def _sse_event_stream( # noqa: PLR0915, PLR0912, C901 | |
| ) | ||
| revalidation_armed = app_state is not None and user is not None | ||
| keepalive_seconds = await _resolve_sse_keepalive_seconds(app_state) | ||
| revalidate_max_failures = await _resolve_sse_revalidate_max_failures(app_state) | ||
| # Use ``app_state.clock.monotonic()`` so tests inject FakeClock | ||
| # rather than monkey-patching ``asyncio.get_event_loop().time``. | ||
| # The bare loop timer is still acceptable for async waits below. | ||
|
|
@@ -469,6 +509,7 @@ async def _sse_event_stream( # noqa: PLR0915, PLR0912, C901 | |
| app_state=app_state, | ||
| user=user, | ||
| consecutive_failures=consecutive_failures, | ||
| max_failures=revalidate_max_failures, | ||
| ) | ||
| consecutive_failures = verdict.consecutive_failures | ||
| if verdict.revoked_event is not None: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,9 +47,22 @@ | |
|
|
||
| logger = get_logger(__name__) | ||
|
|
||
| # Inverted-convention result from ``auto_select_embedder``: ``None`` | ||
| # means success (a model was ranked and persisted); a ``str`` carries | ||
| # the human-readable failure reason. Aliased here so the call site | ||
| # can pass the result directly to | ||
| # ``SetupCompleteResponse.embedder_failure_reason`` without re-stating | ||
| # the inversion at every call. | ||
| type EmbedderSelectResult = str | None | ||
|
|
||
| # Module-level lock: serializes read-modify-write on agents settings. | ||
| AGENT_LOCK = asyncio.Lock() | ||
|
|
||
| # Module-level lock: serializes the entire /setup/complete flow so two | ||
| # concurrent clients cannot both pass the ``setup_complete=false`` check | ||
| # and then race on reinit + flag write. | ||
| COMPLETE_LOCK = asyncio.Lock() | ||
|
|
||
|
|
||
| def validate_agent_index( | ||
| agent_index: int, | ||
|
|
@@ -72,9 +85,14 @@ def validate_agent_index( | |
| async def post_setup_reinit(app_state: AppState) -> None: | ||
| """Reload providers and bootstrap agents after setup completion. | ||
|
|
||
| Both operations are non-fatal: setup completion must succeed | ||
| even if re-init partially fails (the user can restart the | ||
| server to pick up changes). | ||
| Raises on failure so the caller can keep ``setup_complete=false`` | ||
| when reinit cannot finish; a half-configured runtime presenting | ||
| itself as "complete" is worse than a clear error the operator can | ||
| retry after fixing the underlying provider config. | ||
|
|
||
| The matching call site in | ||
| :func:`SetupController.complete_setup` only persists the completion | ||
| flag when this function returns without raising. | ||
|
|
||
| Args: | ||
| app_state: Application state containing services. | ||
|
|
@@ -92,11 +110,13 @@ async def post_setup_reinit(app_state: AppState) -> None: | |
| app_state.swap_provider_registry(new_registry) | ||
| except MemoryError, RecursionError: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| raise | ||
| except Exception: | ||
| except Exception as exc: | ||
| logger.warning( | ||
| SETUP_PROVIDER_RELOAD_FAILED, | ||
| error="Provider reload failed after setup (non-fatal)", | ||
| error_type=type(exc).__name__, | ||
| error=safe_error_description(exc), | ||
| ) | ||
| raise | ||
|
|
||
| # 2. Bootstrap agents into runtime registry. | ||
| if app_state.has_agent_registry: | ||
|
|
@@ -111,11 +131,13 @@ async def post_setup_reinit(app_state: AppState) -> None: | |
| ) | ||
| except MemoryError, RecursionError: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| raise | ||
| except Exception: | ||
| except Exception as exc: | ||
| logger.warning( | ||
| SETUP_AGENT_BOOTSTRAP_FAILED, | ||
| error="Agent bootstrap failed (non-fatal)", | ||
| error_type=type(exc).__name__, | ||
| error=safe_error_description(exc), | ||
| ) | ||
| raise | ||
|
|
||
|
|
||
| async def check_needs_admin( | ||
|
|
@@ -340,7 +362,7 @@ async def auto_select_embedder( | |
| available_model_ids: tuple[str, ...], | ||
| provider_preset_name: str | None = None, | ||
| has_gpu: bool | None = None, | ||
| ) -> None: | ||
| ) -> EmbedderSelectResult: | ||
| """Auto-select an embedding model and persist the choice. | ||
|
|
||
| Best-effort: logs a warning but does not raise on failure. | ||
|
|
@@ -351,6 +373,13 @@ async def auto_select_embedder( | |
| available_model_ids: Model IDs discovered from providers. | ||
| provider_preset_name: Provider preset for tier inference. | ||
| has_gpu: Whether the host has a GPU. | ||
|
|
||
| Returns: | ||
| ``None`` on success (a model was ranked and persisted), or a | ||
| short human-readable failure reason string when selection or | ||
| persistence failed. The inverted convention (None = success, | ||
| str = failure) keeps the caller free to pass the result | ||
| directly to ``SetupCompleteResponse.embedder_failure_reason``. | ||
| """ | ||
| from synthorg.memory.embedding.selector import ( # noqa: PLC0415 | ||
| infer_deployment_tier, | ||
|
|
@@ -373,20 +402,14 @@ async def auto_select_embedder( | |
| # Try without tier filter as fallback. | ||
| ranking = select_embedding_model(available_model_ids) | ||
| if ranking is None: | ||
| reason = "no ranked embedding model available for configured providers" | ||
| logger.warning( | ||
| MEMORY_EMBEDDER_AUTO_SELECT_FAILED, | ||
| available_models=len(available_model_ids), | ||
| tier=tier.value, | ||
| reason="no LMEB-ranked model in available models", | ||
| reason=reason, | ||
| ) | ||
| return | ||
| logger.info( | ||
| MEMORY_EMBEDDER_AUTO_SELECTED, | ||
| model_id=ranking.model_id, | ||
| tier=tier.value, | ||
| overall_score=ranking.overall, | ||
| dims=ranking.output_dims, | ||
| ) | ||
| return reason | ||
| try: | ||
| await settings_svc.set( | ||
| "memory", | ||
|
|
@@ -400,8 +423,24 @@ async def auto_select_embedder( | |
| ) | ||
| except MemoryError, RecursionError: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| raise | ||
| except Exception: | ||
| except Exception as exc: | ||
| reason = "failed to persist embedder settings" | ||
| logger.warning( | ||
| MEMORY_EMBEDDER_AUTO_SELECT_FAILED, | ||
| reason="failed to persist embedder settings", | ||
| reason=reason, | ||
| error_type=type(exc).__name__, | ||
| error=safe_error_description(exc), | ||
| ) | ||
| return reason | ||
| # INFO log emitted AFTER the persistence writes succeed so the | ||
| # event accurately reflects committed state. A pre-write log | ||
| # would otherwise misleadingly claim success when the writes | ||
| # below fail and fall through to the warning branch. | ||
| logger.info( | ||
| MEMORY_EMBEDDER_AUTO_SELECTED, | ||
| model_id=ranking.model_id, | ||
| tier=tier.value, | ||
| overall_score=ranking.overall, | ||
| dims=ranking.output_dims, | ||
| ) | ||
| return None | ||
Uh oh!
There was an error while loading. Please reload this page.