66"""
77
88import logging
9- from typing import Annotated , Any
9+ import re
10+ from typing import Any , Dict , List
1011
1112from llama_stack .providers .datatypes import HealthStatus
1213
13- from fastapi import APIRouter , status , Response , Depends
14+ from fastapi import APIRouter , status , Response
1415from client import AsyncLlamaStackClientHolder
15- from authentication .interface import AuthTuple
16- from authentication import get_auth_dependency
17- from authorization .middleware import authorize
18- from models .config import Action
1916from models .responses import (
2017 LivenessResponse ,
2118 ReadinessResponse ,
2219 ProviderHealthStatus ,
2320)
21+ from configuration import configuration
22+ from app .state import app_state
2423
2524logger = logging .getLogger ("app.endpoints.handlers" )
2625router = APIRouter (tags = ["health" ])
2726
28- auth_dependency = get_auth_dependency ()
def find_unresolved_template_placeholders(obj: Any, path: str = "") -> List[tuple[str, str]]:
    r"""
    Recursively search a configuration structure for unresolved template placeholders.

    Detects patterns like:
    - ${VARIABLE_NAME} (basic template format)
    - ${\{VARIABLE_NAME}} (malformed template)
    - ${env.VARIABLE_NAME} (llama-stack format)

    Only ``dict``, ``list`` and ``str`` values are inspected; any other value
    (numbers, ``None``, arbitrary objects) is ignored.

    Args:
        obj: The structure to scan.
        path: Path prefix used when reporting locations. Dict keys are appended
            as ``prefix.key`` and list indices as ``prefix[i]``.

    Returns:
        A list of ``(path, value)`` tuples, one per string that contains a
        placeholder. ``value`` is the first placeholder found in that string;
        further placeholders in the same string are not reported.
    """
    # NOTE(review): objects that are not dict/list/str (e.g. model instances)
    # are not traversed. Confirm callers pass a plain, already-dumped structure.

    # Order matters: the malformed form must be tried before the basic form,
    # otherwise "${{VAR}}" would be reported as the truncated match "${{VAR}".
    template_patterns = [
        re.compile(r'\$\{\\?\{[^}]+\}\\?\}'),  # Malformed: ${\{VARIABLE}} (check first)
        re.compile(r'\$\{env\.[^}]+\}'),       # llama-stack env: ${env.VARIABLE}
        re.compile(r'\$\{[^}]+\}'),            # Basic: ${VARIABLE} (check last)
    ]
    unresolved: List[tuple[str, str]] = []

    def check_string_for_patterns(value: str, current_path: str) -> None:
        """Record the first placeholder found in ``value``, if any."""
        for pattern in template_patterns:
            match = pattern.search(value)
            if match:
                unresolved.append((current_path, match.group(0)))
                break  # One entry per string is enough; avoids duplicates.

    def walk_object(node: Any, current_path: str = "") -> None:
        """Depth-first walk over dicts and lists, checking every string leaf."""
        if isinstance(node, dict):
            for key, value in node.items():
                # str(key) keeps the reported path a string even for
                # non-string keys, and stops a falsy key such as 0 from being
                # dropped from the path of nested entries.
                new_path = f"{current_path}.{key}" if current_path else str(key)
                walk_object(value, new_path)
        elif isinstance(node, list):
            for i, item in enumerate(node):
                walk_object(item, f"{current_path}[{i}]")
        elif isinstance(node, str):
            check_string_for_patterns(node, current_path)

    walk_object(obj, path)
    return unresolved
70+
71+
def check_comprehensive_readiness() -> tuple[bool, str]:
    """
    Validate configuration and initialization state for the readiness probe.

    Checks are performed in this order, returning on the first failure:
    1. Configuration loading (``configuration.is_loaded()``)
    2. Template placeholder resolution in ``configuration.configuration``
    3. Application initialization state (``app_state.is_fully_initialized``)

    Returns:
        tuple[bool, str]: ``(is_ready, detailed_reason)``. Any exception raised
        while checking is caught and reported as ``(False, "Readiness check
        error: ...")`` so the probe itself never raises.
    """

    def error_detail(error: str) -> str:
        """Return the text after the first ':' in ``error``, or ``error`` unchanged."""
        return error.split(":", 1)[1].strip() if ":" in error else error

    try:
        # Check 1: Configuration loading
        if not configuration.is_loaded():
            # Prefer a detailed error recorded in app_state, if there is one.
            init_status = app_state.initialization_status
            for error in init_status["errors"]:
                if "configuration" in error.lower():
                    # error_detail() tolerates messages without a ':' separator,
                    # which would otherwise raise IndexError here.
                    return False, f"Configuration loading failed: {error_detail(error)}"
            return False, "Configuration not loaded"

        # Check 2: Template placeholders (critical - causes pydantic errors)
        unresolved_placeholders = find_unresolved_template_placeholders(
            configuration.configuration
        )
        if unresolved_placeholders:
            # Report the first offender as a concrete example.
            example_path, example_value = unresolved_placeholders[0]
            count = len(unresolved_placeholders)
            if count == 1:
                return False, f"Unresolved template placeholder in {example_path}: {example_value}"
            return (
                False,
                f"Found {count} unresolved template placeholders "
                f"(e.g., {example_path}: {example_value})",
            )

        # Check 3: Application initialization state
        if not app_state.is_fully_initialized:
            init_status = app_state.initialization_status

            # Return the first non-configuration error, if any. Configuration
            # errors are skipped here because check 1 reports them.
            for error in init_status["errors"]:
                if "configuration" not in error.lower():
                    return False, f"Initialization failed: {error_detail(error)}"

            # Fall back to listing the checks that have not passed yet.
            failed_checks = [
                name for name, passed in init_status["checks"].items() if not passed
            ]
            if failed_checks:
                failed_names = [name.replace("_", " ").title() for name in failed_checks]
                return False, f"Incomplete initialization: {', '.join(failed_names)}"

            return False, "Application initialization not complete"

        return True, "Service ready"

    except Exception as e:  # probe boundary: report the failure instead of raising
        return False, f"Readiness check error: {str(e)}"
29128
30129
31130async def get_providers_health_statuses () -> list [ProviderHealthStatus ]:
@@ -78,40 +177,55 @@ async def get_providers_health_statuses() -> list[ProviderHealthStatus]:
78177
79178
@router.get("/readiness", responses=get_readiness_responses)
async def readiness_probe_get_method(
    response: Response,
) -> ReadinessResponse:
    """
    Readiness probe that validates complete application readiness.

    The probe runs two stages:
    1. ``check_comprehensive_readiness()`` for configuration and initialization.
    2. ``get_providers_health_statuses()`` for provider health, only when
       stage 1 passes.

    Args:
        response: The outgoing response; its status code is set to 503 when
            the service is not ready and left untouched otherwise.

    Returns:
        ReadinessResponse: ``ready=True`` with an empty provider list when all
        checks pass. Otherwise ``ready=False`` with a diagnostic ``reason``;
        ``providers`` lists the providers in error state when that is the
        cause, and is empty for every other failure.
    """
    logger.info("Response to /v1/readiness endpoint")

    # Configuration/initialization issues are critical - return immediately
    # without querying providers.
    config_and_init_ready, reason = check_comprehensive_readiness()
    if not config_and_init_ready:
        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
        return ReadinessResponse(ready=False, reason=reason, providers=[])

    # Only the provider query is guarded; the filtering below is plain
    # attribute access on the returned statuses.
    try:
        provider_statuses = await get_providers_health_statuses()
    except Exception as e:  # probe boundary: report as not ready instead of raising
        logger.exception("Provider health check failed")
        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
        return ReadinessResponse(
            ready=False,
            reason=f"Provider health check failed: {str(e)}",
            providers=[],
        )

    # Only the ERROR status counts as unhealthy; other statuses do not block
    # readiness.
    unhealthy_providers = [
        p for p in provider_statuses if p.status == HealthStatus.ERROR.value
    ]
    if unhealthy_providers:
        unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
        return ReadinessResponse(
            ready=False,
            reason=f"Unhealthy providers: {', '.join(unhealthy_provider_names)}",
            providers=unhealthy_providers,
        )

    # All checks passed
    return ReadinessResponse(
        ready=True,
        reason="Application fully initialized and ready",
        providers=[],
    )
115229
116230
117231get_liveness_responses : dict [int | str , dict [str , Any ]] = {
@@ -124,18 +238,13 @@ async def readiness_probe_get_method(
124238
125239
126240@router .get ("/liveness" , responses = get_liveness_responses )
127- @authorize (Action .INFO )
128- async def liveness_probe_get_method (
129- auth : Annotated [AuthTuple , Depends (auth_dependency )],
130- ) -> LivenessResponse :
241+ async def liveness_probe_get_method () -> LivenessResponse :
131242 """
132243 Return the liveness status of the service.
133244
134245 Returns:
135246 LivenessResponse: Indicates that the service is alive.
136247 """
137- # Used only for authorization
138- _ = auth
139248
140249 logger .info ("Response to /v1/liveness endpoint" )
141250
0 commit comments