Skip to content

Commit faffab6

Browse files
tisnikzszabo-rh
authored and committed
improved readiness probe
2 parents 7173f72 + 558732a commit faffab6

19 files changed

+1212
-84
lines changed

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ dependencies = [
4949
# Used by Llama Stack version checker
5050
"semver<4.0.0",
5151
# Used by authorization resolvers
52-
"jsonpath-ng>=1.6.1"
52+
"jsonpath-ng>=1.6.1",
53+
"psycopg2-binary>=2.9.10",
5354
]
5455

5556

src/app/diagnostic_app.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""Minimal diagnostic FastAPI app for when configuration fails."""
2+
3+
from fastapi import FastAPI
4+
from app.endpoints import health
5+
import version
6+
7+
8+
def create_diagnostic_app() -> FastAPI:
    """
    Build the stripped-down FastAPI application used in diagnostic mode.

    The application carries only descriptive metadata and the health router,
    so that health reporting stays available for troubleshooting when the
    regular configuration could not be loaded.

    Returns:
        FastAPI: Application exposing nothing but the health endpoints.
    """
    contact_details = {
        "name": "Red Hat",
        "url": "https://www.redhat.com/",
    }
    license_details = {
        "name": "Apache 2.0",
        "url": "https://www.apache.org/licenses/LICENSE-2.0.html",
    }

    diagnostic = FastAPI(
        title="Lightspeed Stack - Diagnostic Mode",
        summary="Minimal diagnostic server for troubleshooting",
        description="Limited service running in diagnostic mode due to configuration issues",
        version=version.__version__,
        contact=contact_details,
        license_info=license_details,
    )

    # The health router is the only router mounted here; it is registered
    # without any authentication dependency.
    diagnostic.include_router(health.router)
    return diagnostic


# Module-level instance so the diagnostic app can be imported directly.
diagnostic_app = create_diagnostic_app()

src/app/endpoints/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@
1414

1515
logger = logging.getLogger(__name__)
1616
router = APIRouter(tags=["config"])
17-
1817
auth_dependency = get_auth_dependency()
1918

2019

20+
2121
get_config_responses: dict[int | str, dict[str, Any]] = {
2222
200: {
2323
"name": "foo bar baz",

src/app/endpoints/health.py

Lines changed: 145 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,125 @@
66
"""
77

88
import logging
9-
from typing import Annotated, Any
9+
import re
10+
from typing import Any, Dict, List
1011

1112
from llama_stack.providers.datatypes import HealthStatus
1213

13-
from fastapi import APIRouter, status, Response, Depends
14+
from fastapi import APIRouter, status, Response
1415
from client import AsyncLlamaStackClientHolder
15-
from authentication.interface import AuthTuple
16-
from authentication import get_auth_dependency
17-
from authorization.middleware import authorize
18-
from models.config import Action
1916
from models.responses import (
2017
LivenessResponse,
2118
ReadinessResponse,
2219
ProviderHealthStatus,
2320
)
21+
from configuration import configuration
22+
from app.state import app_state
2423

2524
logger = logging.getLogger("app.endpoints.handlers")
2625
router = APIRouter(tags=["health"])
2726

28-
auth_dependency = get_auth_dependency()
27+
def find_unresolved_template_placeholders(
    obj: Any, path: str = ""
) -> list[tuple[str, str]]:
    r"""
    Recursively search for unresolved template placeholders in configuration.

    Detects patterns like:
    - ${VARIABLE_NAME} (basic template format)
    - ${\{VARIABLE_NAME}} (malformed template)
    - ${env.VARIABLE_NAME} (llama-stack format)

    Dictionaries, lists and tuples are walked recursively; only string
    leaves are inspected, every other value type is ignored. At most one
    placeholder is reported per string.

    Args:
        obj: The (possibly nested) configuration structure to inspect.
        path: Path prefix used for the root object in the reported paths.

    Returns:
        List of (path, value) tuples for any unresolved placeholders, where
        path uses ``parent.child`` for dictionary keys and ``parent[i]`` for
        sequence items, and value is the matched placeholder text.
    """
    # Order matters: the malformed form is tried first because the basic
    # pattern would otherwise match only a truncated part of it.
    template_patterns = tuple(
        re.compile(pattern)
        for pattern in (
            r'\$\{\\?\{[^}]+\}\\?\}',  # Malformed: ${\{VARIABLE}} (check first)
            r'\$\{env\.[^}]+\}',  # llama-stack env: ${env.VARIABLE}
            r'\$\{[^}]+\}',  # Basic: ${VARIABLE} (check last)
        )
    )
    unresolved: list[tuple[str, str]] = []

    def check_string_for_patterns(value: str, current_path: str) -> None:
        """Record the first placeholder found in value, if any."""
        for pattern in template_patterns:
            match = pattern.search(value)
            if match:
                unresolved.append((current_path, match.group(0)))
                break  # Stop after first match to avoid duplicates

    def walk_object(node: Any, current_path: str) -> None:
        """Recursively walk the configuration object."""
        if isinstance(node, dict):
            for key, value in node.items():
                # str(key) keeps the reported path a string even when the
                # mapping uses non-string keys (e.g. integers).
                new_path = f"{current_path}.{key}" if current_path else str(key)
                walk_object(value, new_path)
        elif isinstance(node, (list, tuple)):
            for index, item in enumerate(node):
                walk_object(item, f"{current_path}[{index}]")
        elif isinstance(node, str):
            check_string_for_patterns(node, current_path)

    walk_object(obj, path)
    return unresolved
70+
71+
72+
def check_comprehensive_readiness() -> tuple[bool, str]:
    """
    Comprehensive readiness check that validates configuration and initialization.

    Checks are performed in this order, returning on the first failure:
    1. Configuration loading (``configuration.is_loaded()``)
    2. Template placeholder resolution in the loaded configuration
    3. Application initialization state (``app_state.is_fully_initialized``)

    Any exception raised while checking is logged and reported as a
    not-ready result instead of propagating to the caller.

    Returns:
        tuple[bool, str]: (is_ready, detailed_reason)
    """

    def error_detail(error: str) -> str:
        """Return the text after the first ':' in error, or error itself."""
        _, separator, detail = error.partition(":")
        return detail.strip() if separator else error

    try:
        # Check 1: Configuration loading
        if not configuration.is_loaded():
            # Prefer the detailed error recorded in app_state, if there is one
            init_status = app_state.initialization_status
            for error in init_status['errors']:
                if 'configuration' in error.lower():
                    return False, f"Configuration loading failed: {error_detail(error)}"
            return False, "Configuration not loaded"

        # Check 2: Template placeholders (critical - causes pydantic errors)
        unresolved_placeholders = find_unresolved_template_placeholders(
            configuration.configuration
        )
        if unresolved_placeholders:
            # Only the first placeholder found is shown as an example
            example_path, example_value = unresolved_placeholders[0]
            count = len(unresolved_placeholders)
            if count == 1:
                return False, f"Unresolved template placeholder in {example_path}: {example_value}"
            return False, f"Found {count} unresolved template placeholders (e.g., {example_path}: {example_value})"

        # Check 3: Application initialization state
        if not app_state.is_fully_initialized:
            init_status = app_state.initialization_status

            # Return the first non-configuration error, if available
            # (configuration errors are reported by check 1 above)
            for error in init_status['errors']:
                if 'configuration' not in error.lower():
                    return False, f"Initialization failed: {error_detail(error)}"

            # Fallback to listing failed checks
            failed_checks = [
                name for name, passed in init_status['checks'].items() if not passed
            ]
            if failed_checks:
                failed_names = [check.replace('_', ' ').title() for check in failed_checks]
                return False, f"Incomplete initialization: {', '.join(failed_names)}"

            return False, "Application initialization not complete"

        return True, "Service ready"

    except Exception as e:  # probe boundary: report the failure, never raise
        logger.exception("Readiness check raised an unexpected error")
        return False, f"Readiness check error: {str(e)}"
29128

30129

31130
async def get_providers_health_statuses() -> list[ProviderHealthStatus]:
@@ -78,40 +177,55 @@ async def get_providers_health_statuses() -> list[ProviderHealthStatus]:
78177

79178

80179
@router.get("/readiness", responses=get_readiness_responses)
81-
@authorize(Action.INFO)
82180
async def readiness_probe_get_method(
83-
auth: Annotated[AuthTuple, Depends(auth_dependency)],
84181
response: Response,
85182
) -> ReadinessResponse:
86183
"""
87-
Handle the readiness probe endpoint, returning service readiness.
88-
89-
If any provider reports an error status, responds with HTTP 503
90-
and details of unhealthy providers; otherwise, indicates the
91-
service is ready.
184+
Enhanced readiness probe that validates complete application readiness.
185+
186+
This probe performs comprehensive checks including:
187+
1. Configuration loading and validation (detects unresolved template placeholders)
188+
2. Application initialization state (startup sequence completion)
189+
3. LLM provider health status (existing functionality)
190+
191+
The probe helps detect issues like:
192+
- Configuration loading failures (pydantic validation errors)
193+
- Unresolved environment variables (${VARIABLE} patterns)
194+
- Incomplete application startup (llama client, MCP servers, etc.)
195+
- Provider connectivity problems
196+
197+
Returns 200 when fully ready, 503 when any issues are detected.
198+
Each failure mode provides specific diagnostic information in the response.
92199
"""
93-
# Used only for authorization
94-
_ = auth
95-
96200
logger.info("Response to /v1/readiness endpoint")
97201

98-
provider_statuses = await get_providers_health_statuses()
202+
# Comprehensive configuration and initialization check
203+
config_and_init_ready, reason = check_comprehensive_readiness()
204+
if not config_and_init_ready:
205+
# Configuration/initialization issues are critical - return immediately
206+
response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
207+
return ReadinessResponse(ready=False, reason=reason, providers=[])
99208

100-
# Check if any provider is unhealthy (not counting not_implemented as unhealthy)
101-
unhealthy_providers = [
102-
p for p in provider_statuses if p.status == HealthStatus.ERROR.value
103-
]
209+
# Provider health check (only if configuration/initialization is ready)
210+
try:
211+
provider_statuses = await get_providers_health_statuses()
212+
unhealthy_providers = [
213+
p for p in provider_statuses if p.status == HealthStatus.ERROR.value
214+
]
215+
216+
if unhealthy_providers:
217+
unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
218+
reason = f"Unhealthy providers: {', '.join(unhealthy_provider_names)}"
219+
response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
220+
return ReadinessResponse(ready=False, reason=reason, providers=unhealthy_providers)
104221

105-
if unhealthy_providers:
106-
ready = False
107-
unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
108-
reason = f"Providers not healthy: {', '.join(unhealthy_provider_names)}"
222+
except Exception as e:
223+
reason = f"Provider health check failed: {str(e)}"
109224
response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
110-
else:
111-
ready = True
112-
reason = "All providers are healthy"
225+
return ReadinessResponse(ready=False, reason=reason, providers=[])
113226

114-
return ReadinessResponse(ready=ready, reason=reason, providers=unhealthy_providers)
227+
# All checks passed
228+
return ReadinessResponse(ready=True, reason="Application fully initialized and ready", providers=[])
115229

116230

117231
get_liveness_responses: dict[int | str, dict[str, Any]] = {
@@ -124,18 +238,13 @@ async def readiness_probe_get_method(
124238

125239

126240
@router.get("/liveness", responses=get_liveness_responses)
127-
@authorize(Action.INFO)
128-
async def liveness_probe_get_method(
129-
auth: Annotated[AuthTuple, Depends(auth_dependency)],
130-
) -> LivenessResponse:
241+
async def liveness_probe_get_method() -> LivenessResponse:
131242
"""
132243
Return the liveness status of the service.
133244
134245
Returns:
135246
LivenessResponse: Indicates that the service is alive.
136247
"""
137-
# Used only for authorization
138-
_ = auth
139248

140249
logger.info("Response to /v1/liveness endpoint")
141250

src/app/endpoints/info.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
logger = logging.getLogger("app.endpoints.handlers")
2020
router = APIRouter(tags=["info"])
21-
2221
auth_dependency = get_auth_dependency()
2322

2423

src/app/endpoints/metrics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
from metrics.utils import setup_model_metrics
1616

1717
router = APIRouter(tags=["metrics"])
18-
1918
auth_dependency = get_auth_dependency()
2019

2120

21+
2222
@router.get("/metrics", response_class=PlainTextResponse)
2323
@authorize(Action.GET_METRICS)
2424
async def metrics_endpoint_handler(

src/app/endpoints/models.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,10 @@
1818

1919
logger = logging.getLogger(__name__)
2020
router = APIRouter(tags=["models"])
21-
22-
2321
auth_dependency = get_auth_dependency()
2422

2523

24+
2625
models_responses: dict[int | str, dict[str, Any]] = {
2726
200: {
2827
"models": [

src/app/endpoints/root.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313

1414
logger = logging.getLogger("app.endpoints.handlers")
1515
router = APIRouter(tags=["root"])
16-
1716
auth_dependency = get_auth_dependency()
1817

1918

0 commit comments

Comments
 (0)