From 3591b993d03f1b8bc69541bebd45c526989f28e5 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Mon, 9 Mar 2026 13:45:36 +0100 Subject: [PATCH 1/4] feat: add CFO cost optimization service with anomaly detection, reports, and approval decisions (#46) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement CostOptimizer and ReportGenerator domain services backing the CFO role (DESIGN_SPEC §10.3). CostOptimizer provides spending anomaly detection (Z-score + spike factor), cost efficiency analysis per agent, model downgrade recommendations via ModelResolver, and operation approval/denial based on budget utilization. ReportGenerator produces multi-dimensional spending reports with task/provider/model breakdowns and period-over-period comparison. Adds get_records() to CostTracker for raw record access. 80 new tests, 96% budget module coverage. --- src/ai_company/budget/__init__.py | 40 + src/ai_company/budget/optimizer.py | 716 ++++++++++++++++++ src/ai_company/budget/optimizer_models.py | 348 +++++++++ src/ai_company/budget/reports.py | 457 +++++++++++ src/ai_company/budget/tracker.py | 43 ++ src/ai_company/observability/events/cfo.py | 13 + tests/unit/budget/conftest.py | 42 + tests/unit/budget/test_optimizer.py | 591 +++++++++++++++ tests/unit/budget/test_optimizer_models.py | 335 ++++++++ tests/unit/budget/test_reports.py | 337 +++++++++ tests/unit/budget/test_tracker_get_records.py | 122 +++ tests/unit/observability/test_events.py | 1 + 12 files changed, 3045 insertions(+) create mode 100644 src/ai_company/budget/optimizer.py create mode 100644 src/ai_company/budget/optimizer_models.py create mode 100644 src/ai_company/budget/reports.py create mode 100644 src/ai_company/observability/events/cfo.py create mode 100644 tests/unit/budget/test_optimizer.py create mode 100644 tests/unit/budget/test_optimizer_models.py create mode 100644 tests/unit/budget/test_reports.py create mode 100644 tests/unit/budget/test_tracker_get_records.py diff --git a/src/ai_company/budget/__init__.py b/src/ai_company/budget/__init__.py index c8f7e006e1..eac48a7d2c 100644 --- a/src/ai_company/budget/__init__.py +++ b/src/ai_company/budget/__init__.py @@ -43,6 +43,20 @@ DepartmentBudget, TeamBudget, ) +from ai_company.budget.optimizer import CostOptimizer +from ai_company.budget.optimizer_models import ( + AgentEfficiency, + AnomalyDetectionResult, + AnomalySeverity, + AnomalyType, + ApprovalDecision, + CostOptimizerConfig, + DowngradeAnalysis, + DowngradeRecommendation, + EfficiencyAnalysis, + EfficiencyRating, + SpendingAnomaly, +) from ai_company.budget.quota import ( DegradationAction, DegradationConfig, @@ -55,6 +69,14 @@ effective_cost_per_1k, ) from ai_company.budget.quota_tracker import QuotaTracker +from ai_company.budget.reports import ( + ModelDistribution, + PeriodComparison, + ProviderDistribution, + ReportGenerator, + SpendingReport, + TaskSpending, +) from ai_company.budget.spending_summary import ( AgentSpending, DepartmentSpending, @@ -65,7 +87,12 @@ __all__ = [ "BUILTIN_TIERS", + "AgentEfficiency", "AgentSpending", + "AnomalyDetectionResult", + "AnomalySeverity", + "AnomalyType", + "ApprovalDecision", "AutoDowngradeConfig", "BudgetAlertConfig", "BudgetAlertLevel", @@ -78,6 +105,8 @@ "CoordinationMetrics", "CoordinationMetricsConfig", "CoordinationOverhead", + "CostOptimizer", + "CostOptimizerConfig", "CostRecord", "CostTierDefinition", "CostTiersConfig", @@ -86,24 +115,35 @@ "DegradationConfig", "DepartmentBudget", "DepartmentSpending", + "DowngradeAnalysis", + "DowngradeRecommendation", + "EfficiencyAnalysis", + "EfficiencyRating", "ErrorAmplification", "ErrorCategory", "ErrorTaxonomyConfig", "LLMCallCategory", "MessageDensity", + "ModelDistribution", "OrchestrationAlertLevel", "OrchestrationAlertThresholds", "OrchestrationRatio", + "PeriodComparison", "PeriodSpending", "ProviderCostModel", + "ProviderDistribution", "QuotaCheckResult", "QuotaLimit", "QuotaSnapshot", "QuotaTracker", "QuotaWindow", "RedundancyRate", + "ReportGenerator", + "SpendingAnomaly", + "SpendingReport", "SpendingSummary", "SubscriptionConfig", + "TaskSpending", "TeamBudget", "billing_period_start", "classify_model_tier", diff --git a/src/ai_company/budget/optimizer.py b/src/ai_company/budget/optimizer.py new file mode 100644 index 0000000000..8f020d06e8 --- /dev/null +++ b/src/ai_company/budget/optimizer.py @@ -0,0 +1,716 @@ +"""CFO cost optimization service. + +Provides spending anomaly detection, cost efficiency analysis, model +downgrade recommendations, and operation approval decisions. Composes +:class:`~ai_company.budget.tracker.CostTracker` and +:class:`~ai_company.budget.config.BudgetConfig` for read-only analytical +queries — the advisory complement to +:class:`~ai_company.budget.enforcer.BudgetEnforcer`. + +Service layer backing the CFO role (DESIGN_SPEC Section 10.3). +""" + +import math +import statistics +from collections import defaultdict +from datetime import UTC, datetime, timedelta +from typing import TYPE_CHECKING + +from ai_company.budget.billing import billing_period_start +from ai_company.budget.enums import BudgetAlertLevel +from ai_company.budget.optimizer_models import ( + AgentEfficiency, + AnomalyDetectionResult, + AnomalySeverity, + AnomalyType, + ApprovalDecision, + CostOptimizerConfig, + DowngradeAnalysis, + DowngradeRecommendation, + EfficiencyAnalysis, + EfficiencyRating, + SpendingAnomaly, +) +from ai_company.constants import BUDGET_ROUNDING_PRECISION +from ai_company.observability import get_logger +from ai_company.observability.events.cfo import ( + CFO_ANOMALY_DETECTED, + CFO_ANOMALY_SCAN_COMPLETE, + CFO_APPROVAL_EVALUATED, + CFO_DOWNGRADE_RECOMMENDED, + CFO_EFFICIENCY_ANALYSIS_COMPLETE, + CFO_OPERATION_DENIED, + CFO_OPTIMIZER_CREATED, +) + +if TYPE_CHECKING: + from collections.abc import Sequence + + from ai_company.budget.config import BudgetConfig + from ai_company.budget.cost_record import CostRecord + from ai_company.budget.tracker import CostTracker + from ai_company.providers.routing.models import ResolvedModel + from ai_company.providers.routing.resolver import ModelResolver + +logger = get_logger(__name__) + +# ── Alert level ordering (reused from enforcer pattern) ────────── + +_ALERT_LEVEL_ORDER: dict[BudgetAlertLevel, int] = { + BudgetAlertLevel.NORMAL: 0, + BudgetAlertLevel.WARNING: 1, + BudgetAlertLevel.CRITICAL: 2, + BudgetAlertLevel.HARD_STOP: 3, +} + + +class CostOptimizer: + """CFO analytical service for cost optimization. + + Composes CostTracker and BudgetConfig for read-only analysis: + anomaly detection, efficiency analysis, downgrade recommendations, + and operation approval evaluation. + + Args: + cost_tracker: Cost tracking service for querying spend. + budget_config: Budget configuration for limits and thresholds. + config: Optimizer-specific configuration. Defaults to + ``CostOptimizerConfig()`` when ``None``. + model_resolver: Optional model resolver for downgrade + recommendations. + """ + + def __init__( + self, + *, + cost_tracker: CostTracker, + budget_config: BudgetConfig, + config: CostOptimizerConfig | None = None, + model_resolver: ModelResolver | None = None, + ) -> None: + self._cost_tracker = cost_tracker + self._budget_config = budget_config + self._config = config or CostOptimizerConfig() + self._model_resolver = model_resolver + logger.debug( + CFO_OPTIMIZER_CREATED, + has_model_resolver=model_resolver is not None, + anomaly_sigma=self._config.anomaly_sigma_threshold, + ) + + async def detect_anomalies( + self, + *, + start: datetime, + end: datetime, + window_count: int = 5, + ) -> AnomalyDetectionResult: + """Detect spending anomalies in the given period. + + Divides ``[start, end)`` into ``window_count`` equal windows, + groups records by agent, and flags agents whose last-window + spending deviates significantly from their historical mean. + + Args: + start: Inclusive period start. + end: Exclusive period end. + window_count: Number of time windows to divide the period + into. Must be >= 2. + + Returns: + Anomaly detection result with any detected anomalies. + + Raises: + ValueError: If ``start >= end`` or ``window_count < 2``. + """ + if window_count < 2: # noqa: PLR2004 + msg = f"window_count must be >= 2, got {window_count}" + raise ValueError(msg) + + now = datetime.now(UTC) + records = await self._cost_tracker.get_records( + start=start, + end=end, + ) + + total_duration = end - start + window_duration = total_duration / window_count + window_starts = tuple(start + window_duration * i for i in range(window_count)) + + agent_ids = sorted({r.agent_id for r in records}) + anomalies: list[SpendingAnomaly] = [] + + for agent_id in agent_ids: + window_costs = _compute_window_costs( + records, + agent_id, + window_starts, + window_duration, + ) + anomaly = _detect_spike_anomaly( + agent_id, + window_costs, + now, + window_starts, + window_duration, + self._config, + ) + if anomaly is not None: + logger.warning( + CFO_ANOMALY_DETECTED, + agent_id=agent_id, + anomaly_type=anomaly.anomaly_type.value, + severity=anomaly.severity.value, + deviation_factor=anomaly.deviation_factor, + ) + anomalies.append(anomaly) + + result = AnomalyDetectionResult( + anomalies=tuple(anomalies), + scan_period_start=start, + scan_period_end=end, + agents_scanned=len(agent_ids), + scan_timestamp=now, + ) + + logger.info( + CFO_ANOMALY_SCAN_COMPLETE, + anomaly_count=len(anomalies), + agents_scanned=len(agent_ids), + ) + + return result + + async def analyze_efficiency( + self, + *, + start: datetime, + end: datetime, + ) -> EfficiencyAnalysis: + """Analyze cost efficiency of all agents in the period. + + Computes cost-per-1k-tokens for each agent and rates them + relative to the global average. + + Args: + start: Inclusive period start. + end: Exclusive period end. + + Returns: + Efficiency analysis with per-agent ratings. + + Raises: + ValueError: If ``start >= end``. + """ + records = await self._cost_tracker.get_records( + start=start, + end=end, + ) + + by_agent: dict[str, list[CostRecord]] = defaultdict(list) + for r in records: + by_agent[r.agent_id].append(r) + + global_avg = _compute_global_avg_cost_per_1k(records) + threshold_factor = self._config.inefficiency_threshold_factor + + agent_efficiencies: list[AgentEfficiency] = [] + inefficient_count = 0 + + for agent_id in sorted(by_agent): + agent_records = by_agent[agent_id] + total_cost = round( + math.fsum(r.cost_usd for r in agent_records), + BUDGET_ROUNDING_PRECISION, + ) + total_tokens = sum(r.input_tokens + r.output_tokens for r in agent_records) + cost_per_1k = _compute_cost_per_1k(total_cost, total_tokens) + rating = _rate_efficiency(cost_per_1k, global_avg, threshold_factor) + + if rating == EfficiencyRating.INEFFICIENT: + inefficient_count += 1 + + agent_efficiencies.append( + AgentEfficiency( + agent_id=agent_id, + total_cost_usd=total_cost, + total_tokens=total_tokens, + cost_per_1k_tokens=cost_per_1k, + record_count=len(agent_records), + efficiency_rating=rating, + ), + ) + + # Sort by cost_per_1k descending (most expensive first) + agent_efficiencies.sort( + key=lambda a: a.cost_per_1k_tokens, + reverse=True, + ) + + result = EfficiencyAnalysis( + agents=tuple(agent_efficiencies), + global_avg_cost_per_1k=global_avg, + analysis_period_start=start, + analysis_period_end=end, + inefficient_agent_count=inefficient_count, + ) + + logger.info( + CFO_EFFICIENCY_ANALYSIS_COMPLETE, + agent_count=len(agent_efficiencies), + inefficient_count=inefficient_count, + global_avg_cost_per_1k=global_avg, + ) + + return result + + async def recommend_downgrades( + self, + *, + start: datetime, + end: datetime, + ) -> DowngradeAnalysis: + """Recommend model downgrades for inefficient agents. + + Runs efficiency analysis and uses the model resolver and + downgrade map to find cheaper alternatives. + + Args: + start: Inclusive period start. + end: Exclusive period end. + + Returns: + Downgrade analysis with recommendations. Empty when no + model_resolver is configured. + + Raises: + ValueError: If ``start >= end``. + """ + if self._model_resolver is None: + return DowngradeAnalysis( + recommendations=(), + total_estimated_monthly_savings=0.0, + budget_pressure_percent=0.0, + ) + + efficiency = await self.analyze_efficiency(start=start, end=end) + records = await self._cost_tracker.get_records( + start=start, + end=end, + ) + + downgrade_map = dict(self._budget_config.auto_downgrade.downgrade_map) + budget_pressure = await self._compute_budget_pressure() + + recommendations: list[DowngradeRecommendation] = [] + total_savings = 0.0 + + for agent in efficiency.agents: + if agent.efficiency_rating != EfficiencyRating.INEFFICIENT: + continue + + most_used_model = _find_most_used_model(records, agent.agent_id) + if most_used_model is None: + continue + + recommendation = _build_downgrade_recommendation( + agent_id=agent.agent_id, + current_model=most_used_model, + downgrade_map=downgrade_map, + resolver=self._model_resolver, + ) + if recommendation is not None: + recommendations.append(recommendation) + total_savings += recommendation.estimated_savings_per_1k + logger.info( + CFO_DOWNGRADE_RECOMMENDED, + agent_id=agent.agent_id, + current_model=most_used_model, + recommended_model=recommendation.recommended_model, + estimated_savings=recommendation.estimated_savings_per_1k, + ) + + return DowngradeAnalysis( + recommendations=tuple(recommendations), + total_estimated_monthly_savings=round( + total_savings, + BUDGET_ROUNDING_PRECISION, + ), + budget_pressure_percent=budget_pressure, + ) + + async def evaluate_operation( + self, + *, + agent_id: str, + estimated_cost_usd: float, + now: datetime | None = None, + ) -> ApprovalDecision: + """Evaluate whether an operation should proceed. + + Checks current budget utilization and determines if the + estimated cost is acceptable. + + Args: + agent_id: Agent requesting the operation. + estimated_cost_usd: Estimated cost of the operation. + now: Reference timestamp for billing period computation. + Defaults to ``datetime.now(UTC)``. + + Returns: + Approval decision with reasoning. + """ + cfg = self._budget_config + + if cfg.total_monthly <= 0: + return ApprovalDecision( + approved=True, + reason="Budget enforcement disabled (no monthly budget)", + budget_remaining_usd=0.0, + budget_used_percent=0.0, + alert_level=BudgetAlertLevel.NORMAL, + conditions=(), + ) + + period_start = billing_period_start(cfg.reset_day, now=now) + monthly_cost = await self._cost_tracker.get_total_cost( + start=period_start, + ) + remaining = round( + cfg.total_monthly - monthly_cost, + BUDGET_ROUNDING_PRECISION, + ) + used_pct = round( + monthly_cost / cfg.total_monthly * 100, + BUDGET_ROUNDING_PRECISION, + ) + alert_level = _compute_alert_level(used_pct, cfg) + + auto_deny_level = self._config.approval_auto_deny_alert_level + + # Auto-deny if at or above auto-deny alert level + if _ALERT_LEVEL_ORDER[alert_level] >= _ALERT_LEVEL_ORDER[auto_deny_level]: + logger.warning( + CFO_OPERATION_DENIED, + agent_id=agent_id, + estimated_cost=estimated_cost_usd, + alert_level=alert_level.value, + reason="alert_level_exceeded", + ) + return ApprovalDecision( + approved=False, + reason=( + f"Denied: alert level {alert_level.value} " + f"meets or exceeds auto-deny threshold " + f"{auto_deny_level.value}" + ), + budget_remaining_usd=remaining, + budget_used_percent=used_pct, + alert_level=alert_level, + conditions=(), + ) + + # Auto-deny if estimated cost would push past hard stop + hard_stop_limit = round( + cfg.total_monthly * cfg.alerts.hard_stop_at / 100, + BUDGET_ROUNDING_PRECISION, + ) + projected_cost = round( + monthly_cost + estimated_cost_usd, + BUDGET_ROUNDING_PRECISION, + ) + if projected_cost >= hard_stop_limit: + logger.warning( + CFO_OPERATION_DENIED, + agent_id=agent_id, + estimated_cost=estimated_cost_usd, + projected_cost=projected_cost, + hard_stop_limit=hard_stop_limit, + reason="would_exceed_hard_stop", + ) + return ApprovalDecision( + approved=False, + reason=( + f"Denied: projected cost ${projected_cost:.2f} " + f"would exceed hard stop ${hard_stop_limit:.2f}" + ), + budget_remaining_usd=remaining, + budget_used_percent=used_pct, + alert_level=alert_level, + conditions=(), + ) + + # Approve with conditions if cost is high + conditions: list[str] = [] + warn_threshold = self._config.approval_warn_threshold_usd + if estimated_cost_usd >= warn_threshold: + conditions.append( + f"High-cost operation: ${estimated_cost_usd:.2f} " + f"(threshold: ${warn_threshold:.2f})" + ) + + if alert_level in (BudgetAlertLevel.WARNING, BudgetAlertLevel.CRITICAL): + conditions.append( + f"Budget alert level is {alert_level.value} ({used_pct:.1f}% used)" + ) + + logger.info( + CFO_APPROVAL_EVALUATED, + agent_id=agent_id, + approved=True, + estimated_cost=estimated_cost_usd, + alert_level=alert_level.value, + conditions_count=len(conditions), + ) + + return ApprovalDecision( + approved=True, + reason="Approved", + budget_remaining_usd=remaining, + budget_used_percent=used_pct, + alert_level=alert_level, + conditions=tuple(conditions), + ) + + # ── Private helpers ────────────────────────────────────────── + + async def _compute_budget_pressure(self) -> float: + """Compute current budget utilization percentage.""" + cfg = self._budget_config + if cfg.total_monthly <= 0: + return 0.0 + period_start = billing_period_start(cfg.reset_day) + monthly_cost = await self._cost_tracker.get_total_cost( + start=period_start, + ) + return round( + monthly_cost / cfg.total_monthly * 100, + BUDGET_ROUNDING_PRECISION, + ) + + +# ── Module-level pure helpers ──────────────────────────────────── + + +def _compute_window_costs( + records: Sequence[CostRecord], + agent_id: str, + window_starts: tuple[datetime, ...], + window_duration: timedelta, +) -> tuple[float, ...]: + """Compute per-window cost for a single agent.""" + costs: list[float] = [] + for ws in window_starts: + window_end = ws + window_duration + window_cost = math.fsum( + r.cost_usd + for r in records + if r.agent_id == agent_id and r.timestamp >= ws and r.timestamp < window_end + ) + costs.append(round(window_cost, BUDGET_ROUNDING_PRECISION)) + return tuple(costs) + + +def _detect_spike_anomaly( # noqa: PLR0913 + agent_id: str, + window_costs: tuple[float, ...], + now: datetime, + window_starts: tuple[datetime, ...], + window_duration: timedelta, + config: CostOptimizerConfig, +) -> SpendingAnomaly | None: + """Detect a spike anomaly for a single agent. + + Returns ``None`` if no anomaly is detected or insufficient data. + """ + if len(window_costs) < config.min_anomaly_windows: + return None + + historical = window_costs[:-1] + current = window_costs[-1] + + if current == 0.0: + return None + + mean = statistics.mean(historical) + + if mean == 0.0: + # No historical spending — a spike from zero is always flagged + if current > 0: + return SpendingAnomaly( + agent_id=agent_id, + anomaly_type=AnomalyType.SPIKE, + severity=AnomalySeverity.HIGH, + description=( + f"Agent {agent_id!r} went from $0.00 baseline " + f"to ${current:.2f} in the latest window" + ), + current_value=current, + baseline_value=0.0, + deviation_factor=0.0, + detected_at=now, + period_start=window_starts[-1], + period_end=window_starts[-1] + window_duration, + ) + return None + + # Check spike factor (independent of stddev) + is_spike = current > config.anomaly_spike_factor * mean + + # Check sigma threshold + stddev = statistics.stdev(historical) if len(historical) > 1 else 0.0 + deviation = (current - mean) / stddev if stddev > 0 else 0.0 + is_sigma_anomaly = stddev > 0 and deviation > config.anomaly_sigma_threshold + + if not is_spike and not is_sigma_anomaly: + return None + + severity = _classify_severity(deviation) + + return SpendingAnomaly( + agent_id=agent_id, + anomaly_type=AnomalyType.SPIKE, + severity=severity, + description=( + f"Agent {agent_id!r} spent ${current:.2f} vs " + f"${mean:.2f} baseline ({deviation:.1f} sigma)" + ), + current_value=current, + baseline_value=round(mean, BUDGET_ROUNDING_PRECISION), + deviation_factor=round(deviation, BUDGET_ROUNDING_PRECISION), + detected_at=now, + period_start=window_starts[-1], + period_end=window_starts[-1] + window_duration, + ) + + +def _classify_severity(deviation: float) -> AnomalySeverity: + """Classify anomaly severity from deviation factor.""" + if deviation >= 3.0: # noqa: PLR2004 + return AnomalySeverity.HIGH + if deviation >= 2.0: # noqa: PLR2004 + return AnomalySeverity.MEDIUM + return AnomalySeverity.LOW + + +def _compute_cost_per_1k(total_cost: float, total_tokens: int) -> float: + """Compute cost per 1000 tokens, returning 0 for zero tokens.""" + if total_tokens == 0: + return 0.0 + return round(total_cost / total_tokens * 1000, BUDGET_ROUNDING_PRECISION) + + +def _rate_efficiency( + cost_per_1k: float, + global_avg: float, + threshold_factor: float, +) -> EfficiencyRating: + """Rate an agent's cost efficiency relative to global average.""" + if global_avg == 0.0: + return EfficiencyRating.NORMAL + if cost_per_1k > threshold_factor * global_avg: + return EfficiencyRating.INEFFICIENT + if cost_per_1k < 0.8 * global_avg: + return EfficiencyRating.EFFICIENT + return EfficiencyRating.NORMAL + + +def _compute_global_avg_cost_per_1k( + records: Sequence[CostRecord], +) -> float: + """Compute global average cost per 1000 tokens across all records.""" + total_cost = math.fsum(r.cost_usd for r in records) + total_tokens = sum(r.input_tokens + r.output_tokens for r in records) + return _compute_cost_per_1k(total_cost, total_tokens) + + +def _find_most_used_model( + records: Sequence[CostRecord], + agent_id: str, +) -> str | None: + """Find the model most frequently used by an agent.""" + model_counts: dict[str, int] = defaultdict(int) + for r in records: + if r.agent_id == agent_id: + model_counts[r.model] += 1 + if not model_counts: + return None + return max(model_counts, key=lambda m: model_counts[m]) + + +def _build_downgrade_recommendation( + *, + agent_id: str, + current_model: str, + downgrade_map: dict[str, str], + resolver: ModelResolver, +) -> DowngradeRecommendation | None: + """Build a downgrade recommendation for a single agent.""" + current_resolved = resolver.resolve_safe(current_model) + if current_resolved is None: + return None + + # Check downgrade map for known path + source_alias = current_resolved.alias + target_ref: str | None = None + + if source_alias is not None: + target_ref = downgrade_map.get(source_alias) + + if target_ref is None: + # Try to find any cheaper model + cheaper = _find_cheaper_model(current_resolved.total_cost_per_1k, resolver) + if cheaper is None: + return None + target_ref = cheaper.model_id + + target_resolved = resolver.resolve_safe(target_ref) + if target_resolved is None: + return None + + savings = round( + current_resolved.total_cost_per_1k - target_resolved.total_cost_per_1k, + BUDGET_ROUNDING_PRECISION, + ) + if savings <= 0: + return None + + return DowngradeRecommendation( + agent_id=agent_id, + current_model=current_model, + recommended_model=target_resolved.model_id, + estimated_savings_per_1k=savings, + reason=( + f"Switch from {current_model!r} " + f"(${current_resolved.total_cost_per_1k:.4f}/1k) to " + f"{target_resolved.model_id!r} " + f"(${target_resolved.total_cost_per_1k:.4f}/1k)" + ), + ) + + +def _find_cheaper_model( + current_cost_per_1k: float, + resolver: ModelResolver, +) -> ResolvedModel | None: + """Find the cheapest model that costs less than the current one.""" + all_models = resolver.all_models_sorted_by_cost() + for model in all_models: + if model.total_cost_per_1k < current_cost_per_1k: + return model + return None + + +def _compute_alert_level( + used_pct: float, + cfg: BudgetConfig, +) -> BudgetAlertLevel: + """Compute alert level from budget usage percentage.""" + alerts = cfg.alerts + if used_pct >= alerts.hard_stop_at: + return BudgetAlertLevel.HARD_STOP + if used_pct >= alerts.critical_at: + return BudgetAlertLevel.CRITICAL + if used_pct >= alerts.warn_at: + return BudgetAlertLevel.WARNING + return BudgetAlertLevel.NORMAL diff --git a/src/ai_company/budget/optimizer_models.py b/src/ai_company/budget/optimizer_models.py new file mode 100644 index 0000000000..0ce01ba672 --- /dev/null +++ b/src/ai_company/budget/optimizer_models.py @@ -0,0 +1,348 @@ +"""CFO / CostOptimizer domain models. + +Frozen Pydantic models for anomaly detection, cost efficiency analysis, +downgrade recommendations, and approval decisions. Used by +:class:`~ai_company.budget.optimizer.CostOptimizer` and +:class:`~ai_company.budget.reports.ReportGenerator`. +""" + +from datetime import datetime # noqa: TC003 — required at runtime by Pydantic +from enum import StrEnum +from typing import Self + +from pydantic import BaseModel, ConfigDict, Field, model_validator + +from ai_company.budget.enums import BudgetAlertLevel +from ai_company.core.types import NotBlankStr # noqa: TC001 + +# ── Enums ───────────────────────────────────────────────────────── + + +class AnomalyType(StrEnum): + """Type of spending anomaly detected.""" + + SPIKE = "spike" + SUSTAINED_HIGH = "sustained_high" + RATE_INCREASE = "rate_increase" + + +class AnomalySeverity(StrEnum): + """Severity of a detected spending anomaly.""" + + LOW = "low" + MEDIUM = "medium" + HIGH = "high" + + +class EfficiencyRating(StrEnum): + """Cost efficiency rating for an agent.""" + + EFFICIENT = "efficient" + NORMAL = "normal" + INEFFICIENT = "inefficient" + + +# ── Anomaly Detection ───────────────────────────────────────────── + + +class SpendingAnomaly(BaseModel): + """A detected spending anomaly for a single agent. + + Attributes: + agent_id: Agent exhibiting the anomaly. + anomaly_type: Classification of the anomaly. + severity: Severity level of the anomaly. + description: Human-readable explanation. + current_value: Spending in the most recent window. + baseline_value: Mean spending across historical windows. + deviation_factor: How many standard deviations above baseline. + detected_at: Timestamp when the anomaly was detected. + period_start: Start of the window that triggered the anomaly. + period_end: End of the window that triggered the anomaly. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + agent_id: NotBlankStr = Field(description="Agent identifier") + anomaly_type: AnomalyType = Field(description="Anomaly classification") + severity: AnomalySeverity = Field(description="Severity level") + description: NotBlankStr = Field(description="Human-readable explanation") + current_value: float = Field( + ge=0.0, + description="Spending in the most recent window", + ) + baseline_value: float = Field( + ge=0.0, + description="Mean spending across historical windows", + ) + deviation_factor: float = Field( + ge=0.0, + description="Standard deviations above baseline", + ) + detected_at: datetime = Field(description="When the anomaly was detected") + period_start: datetime = Field(description="Anomalous window start") + period_end: datetime = Field(description="Anomalous window end") + + @model_validator(mode="after") + def _validate_period_ordering(self) -> Self: + """Ensure period_start is strictly before period_end.""" + if self.period_start >= self.period_end: + msg = ( + f"period_start ({self.period_start.isoformat()}) " + f"must be before period_end ({self.period_end.isoformat()})" + ) + raise ValueError(msg) + return self + + +class AnomalyDetectionResult(BaseModel): + """Result of an anomaly detection scan. + + Attributes: + anomalies: Detected anomalies (may be empty). + scan_period_start: Start of the scanned period. + scan_period_end: End of the scanned period. + agents_scanned: Number of unique agents in the data. + scan_timestamp: When the scan was performed. + """ + + model_config = ConfigDict(frozen=True) + + anomalies: tuple[SpendingAnomaly, ...] = Field( + default=(), + description="Detected anomalies", + ) + scan_period_start: datetime = Field(description="Scanned period start") + scan_period_end: datetime = Field(description="Scanned period end") + agents_scanned: int = Field(ge=0, description="Unique agents in data") + scan_timestamp: datetime = Field(description="When the scan ran") + + @model_validator(mode="after") + def _validate_period_ordering(self) -> Self: + """Ensure scan_period_start is strictly before scan_period_end.""" + if self.scan_period_start >= self.scan_period_end: + msg = ( + f"scan_period_start ({self.scan_period_start.isoformat()}) " + f"must be before scan_period_end " + f"({self.scan_period_end.isoformat()})" + ) + raise ValueError(msg) + return self + + +# ── Cost Efficiency ─────────────────────────────────────────────── + + +class AgentEfficiency(BaseModel): + """Cost efficiency metrics for a single agent. + + Attributes: + agent_id: Agent identifier. + total_cost_usd: Total cost in the analysis period. + total_tokens: Total tokens consumed (input + output). + cost_per_1k_tokens: Cost per 1000 tokens. + record_count: Number of cost records. + efficiency_rating: Efficiency classification. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + agent_id: NotBlankStr = Field(description="Agent identifier") + total_cost_usd: float = Field( + ge=0.0, + description="Total cost in the analysis period", + ) + total_tokens: int = Field(ge=0, description="Total tokens consumed") + cost_per_1k_tokens: float = Field( + ge=0.0, + description="Cost per 1000 tokens", + ) + record_count: int = Field(ge=0, description="Number of cost records") + efficiency_rating: EfficiencyRating = Field( + description="Efficiency classification", + ) + + +class EfficiencyAnalysis(BaseModel): + """Result of a cost efficiency analysis. + + Attributes: + agents: Per-agent efficiency metrics (sorted by cost_per_1k desc). + global_avg_cost_per_1k: Global average cost per 1000 tokens. + analysis_period_start: Start of the analysis period. + analysis_period_end: End of the analysis period. + inefficient_agent_count: Number of agents rated INEFFICIENT. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + agents: tuple[AgentEfficiency, ...] = Field( + default=(), + description="Per-agent efficiency metrics", + ) + global_avg_cost_per_1k: float = Field( + ge=0.0, + description="Global average cost per 1000 tokens", + ) + analysis_period_start: datetime = Field(description="Analysis period start") + analysis_period_end: datetime = Field(description="Analysis period end") + inefficient_agent_count: int = Field( + ge=0, + description="Number of inefficient agents", + ) + + @model_validator(mode="after") + def _validate_period_ordering(self) -> Self: + """Ensure analysis_period_start is before analysis_period_end.""" + if self.analysis_period_start >= self.analysis_period_end: + msg = ( + f"analysis_period_start " + f"({self.analysis_period_start.isoformat()}) " + f"must be before analysis_period_end " + f"({self.analysis_period_end.isoformat()})" + ) + raise ValueError(msg) + return self + + +# ── Downgrade Recommendations ───────────────────────────────────── + + +class DowngradeRecommendation(BaseModel): + """A model downgrade recommendation for a single agent. + + Attributes: + agent_id: Agent identifier. + current_model: Currently used model identifier. + recommended_model: Recommended cheaper model. + estimated_savings_per_1k: Estimated savings per 1000 tokens. + reason: Human-readable explanation. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + agent_id: NotBlankStr = Field(description="Agent identifier") + current_model: NotBlankStr = Field(description="Current model identifier") + recommended_model: NotBlankStr = Field( + description="Recommended cheaper model", + ) + estimated_savings_per_1k: float = Field( + ge=0.0, + description="Estimated savings per 1000 tokens", + ) + reason: NotBlankStr = Field(description="Human-readable explanation") + + +class DowngradeAnalysis(BaseModel): + """Result of a downgrade recommendation analysis. + + Attributes: + recommendations: Per-agent downgrade recommendations. + total_estimated_monthly_savings: Aggregate estimated monthly savings. + budget_pressure_percent: Current budget utilization percentage. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + recommendations: tuple[DowngradeRecommendation, ...] = Field( + default=(), + description="Per-agent downgrade recommendations", + ) + total_estimated_monthly_savings: float = Field( + ge=0.0, + description="Aggregate estimated monthly savings", + ) + budget_pressure_percent: float = Field( + ge=0.0, + description="Current budget utilization percentage", + ) + + +# ── Approval Decision ───────────────────────────────────────────── + + +class ApprovalDecision(BaseModel): + """Result of evaluating whether an operation should proceed. + + Attributes: + approved: Whether the operation is approved. + reason: Explanation for the decision. + budget_remaining_usd: Remaining budget in USD. + budget_used_percent: Percentage of budget consumed. + alert_level: Current budget alert level. + conditions: Any conditions attached to approval. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + approved: bool = Field(description="Whether the operation is approved") + reason: NotBlankStr = Field(description="Explanation for the decision") + budget_remaining_usd: float = Field( + description="Remaining budget in USD", + ) + budget_used_percent: float = Field( + ge=0.0, + description="Percentage of budget consumed", + ) + alert_level: BudgetAlertLevel = Field( + description="Current budget alert level", + ) + conditions: tuple[str, ...] = Field( + default=(), + description="Conditions attached to approval", + ) + + +# ── Configuration ───────────────────────────────────────────────── + + +class CostOptimizerConfig(BaseModel): + """Configuration for the CostOptimizer service. + + Attributes: + anomaly_sigma_threshold: Number of standard deviations above mean + to flag as anomalous. + anomaly_spike_factor: Multiplier above mean to flag as spike + (independent of stddev). + inefficiency_threshold_factor: Factor above global average + cost_per_1k to flag as inefficient. + approval_auto_deny_alert_level: Alert level at or above which + operations are automatically denied. + approval_warn_threshold_usd: Cost threshold for adding a + warning condition to approval. + min_anomaly_windows: Minimum number of historical windows + required before anomaly detection activates. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + anomaly_sigma_threshold: float = Field( + default=2.0, + gt=0.0, + description="Sigma threshold for anomaly detection", + ) + anomaly_spike_factor: float = Field( + default=3.0, + gt=1.0, + description="Spike factor multiplier above mean", + ) + inefficiency_threshold_factor: float = Field( + default=1.5, + gt=1.0, + description="Factor above global avg for inefficiency", + ) + approval_auto_deny_alert_level: BudgetAlertLevel = Field( + default=BudgetAlertLevel.HARD_STOP, + description="Alert level triggering auto-deny", + ) + approval_warn_threshold_usd: float = Field( + default=1.0, + ge=0.0, + description="Cost threshold for warning condition", + ) + min_anomaly_windows: int = Field( + default=3, + ge=2, + strict=True, + description="Minimum historical windows for anomaly detection", + ) diff --git a/src/ai_company/budget/reports.py b/src/ai_company/budget/reports.py new file mode 100644 index 0000000000..43f3cd64de --- /dev/null +++ b/src/ai_company/budget/reports.py @@ -0,0 +1,457 @@ +"""CFO spending report generation. + +Provides multi-dimensional spending reports with breakdowns by task, +provider, model, and time-period comparison. Composes +:class:`~ai_company.budget.tracker.CostTracker` and +:class:`~ai_company.budget.config.BudgetConfig`. + +Service layer backing CFO reporting (DESIGN_SPEC Section 10.3). +""" + +import math +from collections import defaultdict +from datetime import datetime +from typing import TYPE_CHECKING, Self + +from pydantic import BaseModel, ConfigDict, Field, model_validator + +from ai_company.budget.spending_summary import SpendingSummary # noqa: TC001 +from ai_company.constants import BUDGET_ROUNDING_PRECISION +from ai_company.core.types import NotBlankStr # noqa: TC001 +from ai_company.observability import get_logger +from ai_company.observability.events.cfo import CFO_REPORT_GENERATED + +if TYPE_CHECKING: + from collections.abc import Sequence + + from ai_company.budget.config import BudgetConfig + from ai_company.budget.cost_record import CostRecord + from ai_company.budget.tracker import CostTracker + +logger = get_logger(__name__) + + +# ── Report Models ───────────────────────────────────────────────── + + +class TaskSpending(BaseModel): + """Spending aggregation for a single task. + + Attributes: + task_id: Task identifier. + total_cost_usd: Total cost for the task. + total_tokens: Total tokens consumed (input + output). + record_count: Number of cost records. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + task_id: NotBlankStr = Field(description="Task identifier") + total_cost_usd: float = Field(ge=0.0, description="Total cost") + total_tokens: int = Field(ge=0, description="Total tokens consumed") + record_count: int = Field(ge=0, description="Number of cost records") + + +class ProviderDistribution(BaseModel): + """Cost distribution for a single provider. + + Attributes: + provider: Provider name. + total_cost_usd: Total cost for the provider. + record_count: Number of cost records. + percentage_of_total: Percentage of total spending. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + provider: NotBlankStr = Field(description="Provider name") + total_cost_usd: float = Field(ge=0.0, description="Total cost") + record_count: int = Field(ge=0, description="Number of cost records") + percentage_of_total: float = Field( + ge=0.0, + le=100.0, + description="Percentage of total spending", + ) + + +class ModelDistribution(BaseModel): + """Cost distribution for a single model. + + Attributes: + model: Model identifier. + provider: Provider name. + total_cost_usd: Total cost for the model. + record_count: Number of cost records. + percentage_of_total: Percentage of total spending. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + model: NotBlankStr = Field(description="Model identifier") + provider: NotBlankStr = Field(description="Provider name") + total_cost_usd: float = Field(ge=0.0, description="Total cost") + record_count: int = Field(ge=0, description="Number of cost records") + percentage_of_total: float = Field( + ge=0.0, + le=100.0, + description="Percentage of total spending", + ) + + +class PeriodComparison(BaseModel): + """Comparison of spending between two consecutive periods. + + Attributes: + current_period_cost: Cost in the current period. + previous_period_cost: Cost in the previous period. + cost_change_usd: Absolute change in cost. + cost_change_percent: Percentage change in cost. None when + previous period cost is zero. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + current_period_cost: float = Field( + ge=0.0, + description="Current period cost", + ) + previous_period_cost: float = Field( + ge=0.0, + description="Previous period cost", + ) + cost_change_usd: float = Field(description="Absolute cost change") + cost_change_percent: float | None = Field( + default=None, + description="Percentage cost change", + ) + + +class SpendingReport(BaseModel): + """Multi-dimensional spending report. + + Attributes: + summary: Overall spending summary for the period. + by_task: Per-task spending breakdown. + by_provider: Per-provider cost distribution. + by_model: Per-model cost distribution. + period_comparison: Comparison with previous period (optional). + top_agents_by_cost: Top agents by cost (sorted descending). + top_tasks_by_cost: Top tasks by cost (sorted descending). + generated_at: When the report was generated. + """ + + model_config = ConfigDict(frozen=True) + + summary: SpendingSummary = Field(description="Overall spending summary") + by_task: tuple[TaskSpending, ...] = Field( + default=(), + description="Per-task spending breakdown", + ) + by_provider: tuple[ProviderDistribution, ...] = Field( + default=(), + description="Per-provider cost distribution", + ) + by_model: tuple[ModelDistribution, ...] = Field( + default=(), + description="Per-model cost distribution", + ) + period_comparison: PeriodComparison | None = Field( + default=None, + description="Comparison with previous period", + ) + top_agents_by_cost: tuple[tuple[str, float], ...] = Field( + default=(), + description="Top agents by cost (agent_id, cost_usd)", + ) + top_tasks_by_cost: tuple[tuple[str, float], ...] = Field( + default=(), + description="Top tasks by cost (task_id, cost_usd)", + ) + generated_at: datetime = Field(description="When the report was generated") + + @model_validator(mode="after") + def _validate_agent_ranking_order(self) -> Self: + """Ensure top_agents_by_cost is sorted descending.""" + costs = [c for _, c in self.top_agents_by_cost] + if costs != sorted(costs, reverse=True): + msg = "top_agents_by_cost must be sorted by cost descending" + raise ValueError(msg) + return self + + @model_validator(mode="after") + def _validate_task_ranking_order(self) -> Self: + """Ensure top_tasks_by_cost is sorted descending.""" + costs = [c for _, c in self.top_tasks_by_cost] + if costs != sorted(costs, reverse=True): + msg = "top_tasks_by_cost must be sorted by cost descending" + raise ValueError(msg) + return self + + +# ── ReportGenerator Service ─────────────────────────────────────── + + +class ReportGenerator: + """Generates multi-dimensional spending reports. + + Composes CostTracker and BudgetConfig to produce reports with + breakdowns by task, provider, model, and period comparison. + + Args: + cost_tracker: Cost tracking service for querying spend. + budget_config: Budget configuration for context. + """ + + def __init__( + self, + *, + cost_tracker: CostTracker, + budget_config: BudgetConfig, + ) -> None: + self._cost_tracker = cost_tracker + self._budget_config = budget_config + + async def generate_report( + self, + *, + start: datetime, + end: datetime, + top_n: int = 10, + include_period_comparison: bool = True, + ) -> SpendingReport: + """Generate a spending report for the given period. + + Args: + start: Inclusive period start. + end: Exclusive period end. + top_n: Maximum number of top agents/tasks to include. + include_period_comparison: Whether to compute a comparison + with the previous period of the same duration. + + Returns: + Multi-dimensional spending report. + + Raises: + ValueError: If ``start >= end`` or ``top_n < 1``. + """ + if top_n < 1: + msg = f"top_n must be >= 1, got {top_n}" + raise ValueError(msg) + + from datetime import UTC # noqa: PLC0415 + + now = datetime.now(UTC) + + summary = await self._cost_tracker.build_summary( + start=start, + end=end, + ) + records = await self._cost_tracker.get_records( + start=start, + end=end, + ) + + total_cost = summary.period.total_cost_usd + by_task = _build_task_spendings(records) + by_provider = _build_provider_distribution(records, total_cost) + by_model = _build_model_distribution(records, total_cost) + + top_agents = _build_top_agents(summary, top_n) + top_tasks = _build_top_tasks(by_task, top_n) + + period_comparison: PeriodComparison | None = None + if include_period_comparison: + period_comparison = await self._build_period_comparison( + start, + end, + total_cost, + ) + + report = SpendingReport( + summary=summary, + by_task=by_task, + by_provider=by_provider, + by_model=by_model, + period_comparison=period_comparison, + top_agents_by_cost=top_agents, + top_tasks_by_cost=top_tasks, + generated_at=now, + ) + + logger.info( + CFO_REPORT_GENERATED, + total_cost_usd=total_cost, + task_count=len(by_task), + provider_count=len(by_provider), + model_count=len(by_model), + has_comparison=period_comparison is not None, + ) + + return report + + async def _build_period_comparison( + self, + current_start: datetime, + current_end: datetime, + current_cost: float, + ) -> PeriodComparison | None: + """Build a period comparison with the previous period.""" + duration = current_end - current_start + prev_start = current_start - duration + prev_end = current_start + + prev_summary = await self._cost_tracker.build_summary( + start=prev_start, + end=prev_end, + ) + prev_cost = prev_summary.period.total_cost_usd + + if prev_cost == 0.0 and current_cost == 0.0: + return None + + return _compute_period_comparison(current_cost, prev_cost) + + +# ── Module-level pure helpers ──────────────────────────────────── + + +def _build_task_spendings( + records: Sequence[CostRecord], +) -> tuple[TaskSpending, ...]: + """Group records by task and aggregate.""" + by_task: dict[str, list[CostRecord]] = defaultdict(list) + for r in records: + by_task[r.task_id].append(r) + + spendings: list[TaskSpending] = [] + for task_id in sorted(by_task): + task_records = by_task[task_id] + total_cost = round( + math.fsum(r.cost_usd for r in task_records), + BUDGET_ROUNDING_PRECISION, + ) + total_tokens = sum(r.input_tokens + r.output_tokens for r in task_records) + spendings.append( + TaskSpending( + task_id=task_id, + total_cost_usd=total_cost, + total_tokens=total_tokens, + record_count=len(task_records), + ), + ) + return tuple(spendings) + + +def _build_provider_distribution( + records: Sequence[CostRecord], + total_cost: float, +) -> tuple[ProviderDistribution, ...]: + """Group records by provider and compute distribution.""" + by_provider: dict[str, list[CostRecord]] = defaultdict(list) + for r in records: + by_provider[r.provider].append(r) + + distributions: list[ProviderDistribution] = [] + for provider in sorted(by_provider): + provider_records = by_provider[provider] + provider_cost = round( + math.fsum(r.cost_usd for r in provider_records), + BUDGET_ROUNDING_PRECISION, + ) + pct = ( + round(provider_cost / total_cost * 100, BUDGET_ROUNDING_PRECISION) + if total_cost > 0 + else 0.0 + ) + distributions.append( + ProviderDistribution( + provider=provider, + total_cost_usd=provider_cost, + record_count=len(provider_records), + percentage_of_total=pct, + ), + ) + return tuple(distributions) + + +def _build_model_distribution( + records: Sequence[CostRecord], + total_cost: float, +) -> tuple[ModelDistribution, ...]: + """Group records by (model, provider) and compute distribution.""" + by_model: dict[tuple[str, str], list[CostRecord]] = defaultdict(list) + for r in records: + by_model[(r.model, r.provider)].append(r) + + distributions: list[ModelDistribution] = [] + for model, provider in sorted(by_model): + model_records = by_model[(model, provider)] + model_cost = round( + math.fsum(r.cost_usd for r in model_records), + BUDGET_ROUNDING_PRECISION, + ) + pct = ( + round(model_cost / total_cost * 100, BUDGET_ROUNDING_PRECISION) + if total_cost > 0 + else 0.0 + ) + distributions.append( + ModelDistribution( + model=model, + provider=provider, + total_cost_usd=model_cost, + record_count=len(model_records), + percentage_of_total=pct, + ), + ) + return tuple(distributions) + + +def _compute_period_comparison( + current_cost: float, + previous_cost: float, +) -> PeriodComparison: + """Compute the delta between current and previous period costs.""" + change_usd = round( + current_cost - previous_cost, + BUDGET_ROUNDING_PRECISION, + ) + change_pct: float | None = None + if previous_cost > 0: + change_pct = round( + change_usd / previous_cost * 100, + BUDGET_ROUNDING_PRECISION, + ) + + return PeriodComparison( + current_period_cost=current_cost, + previous_period_cost=previous_cost, + cost_change_usd=change_usd, + cost_change_percent=change_pct, + ) + + +def _build_top_agents( + summary: SpendingSummary, + top_n: int, +) -> tuple[tuple[str, float], ...]: + """Extract top-N agents by cost from a spending summary.""" + sorted_agents = sorted( + summary.by_agent, + key=lambda a: a.total_cost_usd, + reverse=True, + ) + return tuple((a.agent_id, a.total_cost_usd) for a in sorted_agents[:top_n]) + + +def _build_top_tasks( + task_spendings: tuple[TaskSpending, ...], + top_n: int, +) -> tuple[tuple[str, float], ...]: + """Extract top-N tasks by cost from task spendings.""" + sorted_tasks = sorted( + task_spendings, + key=lambda t: t.total_cost_usd, + reverse=True, + ) + return tuple((t.task_id, t.total_cost_usd) for t in sorted_tasks[:top_n]) diff --git a/src/ai_company/budget/tracker.py b/src/ai_company/budget/tracker.py index afae82a1d8..6b329159f9 100644 --- a/src/ai_company/budget/tracker.py +++ b/src/ai_company/budget/tracker.py @@ -41,6 +41,7 @@ BUDGET_TOTAL_COST_QUERIED, BUDGET_TRACKER_CREATED, ) +from ai_company.observability.events.cfo import CFO_RECORDS_QUERIED if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -181,6 +182,48 @@ async def get_record_count(self) -> int: async with self._lock: return len(self._records) + async def get_records( + self, + *, + agent_id: str | None = None, + task_id: str | None = None, + start: datetime | None = None, + end: datetime | None = None, + ) -> tuple[CostRecord, ...]: + """Return filtered cost records. + + Returns an immutable snapshot of records matching the filters. + + Args: + agent_id: Filter by agent. + task_id: Filter by task. + start: Inclusive lower bound on ``timestamp``. + end: Exclusive upper bound on ``timestamp``. + + Returns: + Immutable tuple of matching cost records. + + Raises: + ValueError: If both *start* and *end* are given and + ``start >= end``. + """ + _validate_time_range(start, end) + logger.debug( + CFO_RECORDS_QUERIED, + agent_id=agent_id, + task_id=task_id, + start=start, + end=end, + ) + snapshot = await self._snapshot() + return _filter_records( + snapshot, + agent_id=agent_id, + task_id=task_id, + start=start, + end=end, + ) + async def build_summary( self, *, diff --git a/src/ai_company/observability/events/cfo.py b/src/ai_company/observability/events/cfo.py new file mode 100644 index 0000000000..1c3912a2a1 --- /dev/null +++ b/src/ai_company/observability/events/cfo.py @@ -0,0 +1,13 @@ +"""CFO / CostOptimizer event constants.""" + +from typing import Final + +CFO_OPTIMIZER_CREATED: Final[str] = "cfo.optimizer.created" +CFO_ANOMALY_DETECTED: Final[str] = "cfo.anomaly.detected" +CFO_ANOMALY_SCAN_COMPLETE: Final[str] = "cfo.anomaly.scan_complete" +CFO_EFFICIENCY_ANALYSIS_COMPLETE: Final[str] = "cfo.efficiency.analysis_complete" +CFO_DOWNGRADE_RECOMMENDED: Final[str] = "cfo.downgrade.recommended" +CFO_APPROVAL_EVALUATED: Final[str] = "cfo.approval.evaluated" +CFO_OPERATION_DENIED: Final[str] = "cfo.operation.denied" +CFO_REPORT_GENERATED: Final[str] = "cfo.report.generated" +CFO_RECORDS_QUERIED: Final[str] = "cfo.records.queried" diff --git a/tests/unit/budget/conftest.py b/tests/unit/budget/conftest.py index e8fe67f2fd..2a7a8fce0f 100644 --- a/tests/unit/budget/conftest.py +++ b/tests/unit/budget/conftest.py @@ -19,12 +19,15 @@ DepartmentBudget, TeamBudget, ) +from ai_company.budget.optimizer import CostOptimizer +from ai_company.budget.optimizer_models import CostOptimizerConfig from ai_company.budget.quota import ( QuotaLimit, QuotaWindow, SubscriptionConfig, ) from ai_company.budget.quota_tracker import QuotaTracker +from ai_company.budget.reports import ReportGenerator from ai_company.budget.spending_summary import ( AgentSpending, DepartmentSpending, @@ -105,6 +108,10 @@ class SpendingSummaryFactory(ModelFactory[SpendingSummary]): by_department = () +class CostOptimizerConfigFactory(ModelFactory[CostOptimizerConfig]): + __model__ = CostOptimizerConfig + + class CostTierDefinitionFactory(ModelFactory[CostTierDefinition]): __model__ = CostTierDefinition sort_order = 0 @@ -298,3 +305,38 @@ def make_cost_record( # noqa: PLR0913 cost_usd=cost_usd, timestamp=timestamp or datetime(2026, 2, 15, 12, 0, 0, tzinfo=UTC), ) + + +# ── CFO / CostOptimizer fixtures ───────────────────────────────── + + +@pytest.fixture +def cost_optimizer_config() -> CostOptimizerConfig: + """Default CostOptimizerConfig for tests.""" + return CostOptimizerConfig() + + +@pytest.fixture +def cost_optimizer( + budget_config_for_tracker: BudgetConfig, + cost_tracker: CostTracker, + cost_optimizer_config: CostOptimizerConfig, +) -> CostOptimizer: + """CostOptimizer wired with tracker and config.""" + return CostOptimizer( + cost_tracker=cost_tracker, + budget_config=budget_config_for_tracker, + config=cost_optimizer_config, + ) + + +@pytest.fixture +def report_generator( + budget_config_for_tracker: BudgetConfig, + cost_tracker: CostTracker, +) -> ReportGenerator: + """ReportGenerator wired with tracker and config.""" + return ReportGenerator( + cost_tracker=cost_tracker, + budget_config=budget_config_for_tracker, + ) diff --git a/tests/unit/budget/test_optimizer.py b/tests/unit/budget/test_optimizer.py new file mode 100644 index 0000000000..2af918bd9e --- /dev/null +++ b/tests/unit/budget/test_optimizer.py @@ -0,0 +1,591 @@ +"""Tests for CostOptimizer service.""" + +from datetime import UTC, datetime, timedelta + +import pytest + +from ai_company.budget.config import BudgetAlertConfig, BudgetConfig +from ai_company.budget.enums import BudgetAlertLevel +from ai_company.budget.optimizer import CostOptimizer +from ai_company.budget.optimizer_models import ( + AnomalySeverity, + AnomalyType, + CostOptimizerConfig, + EfficiencyRating, +) +from ai_company.budget.tracker import CostTracker +from ai_company.providers.routing.models import ResolvedModel +from ai_company.providers.routing.resolver import ModelResolver +from tests.unit.budget.conftest import make_cost_record + +# ── Helpers ─────────────────────────────────────────────────────── + +_START = datetime(2026, 2, 1, tzinfo=UTC) +_END = datetime(2026, 3, 1, tzinfo=UTC) + + +def _make_optimizer( + *, + budget_config: BudgetConfig | None = None, + config: CostOptimizerConfig | None = None, + model_resolver: ModelResolver | None = None, +) -> tuple[CostOptimizer, CostTracker]: + """Build a CostOptimizer with a fresh CostTracker.""" + bc = budget_config or BudgetConfig(total_monthly=100.0) + tracker = CostTracker(budget_config=bc) + optimizer = CostOptimizer( + cost_tracker=tracker, + budget_config=bc, + config=config, + model_resolver=model_resolver, + ) + return optimizer, tracker + + +def _make_resolver( + models: list[ResolvedModel] | None = None, +) -> ModelResolver: + """Build a ModelResolver from a list of ResolvedModel.""" + if models is None: + models = [ + ResolvedModel( + provider_name="test-provider", + model_id="test-large-001", + alias="large", + cost_per_1k_input=0.03, + cost_per_1k_output=0.06, + ), + ResolvedModel( + provider_name="test-provider", + model_id="test-medium-001", + alias="medium", + cost_per_1k_input=0.01, + cost_per_1k_output=0.02, + ), + ResolvedModel( + provider_name="test-provider", + model_id="test-small-001", + alias="small", + cost_per_1k_input=0.001, + cost_per_1k_output=0.002, + ), + ] + index: dict[str, ResolvedModel] = {} + for m in models: + index[m.model_id] = m + if m.alias is not None: + index[m.alias] = m + return ModelResolver(index) + + +# ── Init Tests ──────────────────────────────────────────────────── + + +@pytest.mark.unit +class TestInit: + async def test_defaults(self) -> None: + optimizer, _ = _make_optimizer() + assert optimizer._config == CostOptimizerConfig() + + async def test_custom_config(self) -> None: + cfg = CostOptimizerConfig(anomaly_sigma_threshold=3.0) + optimizer, _ = _make_optimizer(config=cfg) + assert optimizer._config.anomaly_sigma_threshold == 3.0 + + +# ── Anomaly Detection Tests ────────────────────────────────────── + + +@pytest.mark.unit +class TestDetectAnomalies: + async def test_no_records_empty_result(self) -> None: + optimizer, _ = _make_optimizer() + result = await optimizer.detect_anomalies(start=_START, end=_END) + assert result.anomalies == () + assert result.agents_scanned == 0 + + async def test_normal_spending_no_anomalies(self) -> None: + optimizer, tracker = _make_optimizer() + # Create uniform spending across 5 windows + window_duration = (_END - _START) / 5 + for i in range(5): + ts = _START + window_duration * i + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=1.0, timestamp=ts), + ) + + result = await optimizer.detect_anomalies(start=_START, end=_END) + assert result.anomalies == () + assert result.agents_scanned == 1 + + async def test_spike_detected(self) -> None: + optimizer, tracker = _make_optimizer() + window_duration = (_END - _START) / 5 + + # Normal spending in first 4 windows + for i in range(4): + ts = _START + window_duration * i + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=1.0, timestamp=ts), + ) + + # Spike in last window + ts = _START + window_duration * 4 + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=20.0, timestamp=ts), + ) + + result = await optimizer.detect_anomalies(start=_START, end=_END) + assert len(result.anomalies) == 1 + anomaly = result.anomalies[0] + assert anomaly.agent_id == "alice" + assert anomaly.anomaly_type == AnomalyType.SPIKE + assert anomaly.current_value == 20.0 + + async def test_insufficient_windows_no_false_positive(self) -> None: + config = CostOptimizerConfig(min_anomaly_windows=5) + optimizer, tracker = _make_optimizer(config=config) + + # Only 3 windows of data in a 3-window analysis + window_duration = (_END - _START) / 3 + for i in range(3): + ts = _START + window_duration * i + timedelta(hours=1) + cost = 1.0 if i < 2 else 50.0 + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=cost, timestamp=ts), + ) + + result = await optimizer.detect_anomalies( + start=_START, + end=_END, + window_count=3, + ) + assert result.anomalies == () + + async def test_multiple_agents_only_anomalous_flagged(self) -> None: + optimizer, tracker = _make_optimizer() + window_duration = (_END - _START) / 5 + + # Alice: uniform spending + for i in range(5): + ts = _START + window_duration * i + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=1.0, timestamp=ts), + ) + + # Bob: spike in last window + for i in range(4): + ts = _START + window_duration * i + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="bob", cost_usd=1.0, timestamp=ts), + ) + ts = _START + window_duration * 4 + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="bob", cost_usd=20.0, timestamp=ts), + ) + + result = await optimizer.detect_anomalies(start=_START, end=_END) + assert len(result.anomalies) == 1 + assert result.anomalies[0].agent_id == "bob" + assert result.agents_scanned == 2 + + async def test_window_count_validation(self) -> None: + optimizer, _ = _make_optimizer() + with pytest.raises(ValueError, match="window_count must be >= 2"): + await optimizer.detect_anomalies( + start=_START, + end=_END, + window_count=1, + ) + + async def test_spike_from_zero_baseline(self) -> None: + """Agent with no historical spending that suddenly appears.""" + optimizer, tracker = _make_optimizer( + config=CostOptimizerConfig(min_anomaly_windows=3), + ) + window_duration = (_END - _START) / 5 + + # No spending in first 4 windows, spending in window 5 + ts = _START + window_duration * 4 + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=5.0, timestamp=ts), + ) + + result = await optimizer.detect_anomalies(start=_START, end=_END) + assert len(result.anomalies) == 1 + anomaly = result.anomalies[0] + assert anomaly.severity == AnomalySeverity.HIGH + assert anomaly.baseline_value == 0.0 + + async def test_severity_classification(self) -> None: + """Verify severity levels based on deviation factor.""" + optimizer, tracker = _make_optimizer( + config=CostOptimizerConfig( + anomaly_sigma_threshold=1.5, + anomaly_spike_factor=10.0, + ), + ) + window_duration = (_END - _START) / 5 + + # Create varied baseline with small stddev=0.1 + baseline_costs = [1.0, 1.1, 0.9, 1.0] + for i, cost in enumerate(baseline_costs): + ts = _START + window_duration * i + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=cost, timestamp=ts), + ) + + # Medium spike (2-3 sigma range) + ts = _START + window_duration * 4 + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=1.25, timestamp=ts), + ) + + await optimizer.detect_anomalies(start=_START, end=_END) + # With such small deviations, this may or may not trigger + # depending on exact sigma; the key is the test runs without error + + +# ── Efficiency Analysis Tests ───────────────────────────────────── + + +@pytest.mark.unit +class TestAnalyzeEfficiency: + async def test_uniform_all_normal(self) -> None: + optimizer, tracker = _make_optimizer() + + # Same cost/token ratio for all agents + for agent in ("alice", "bob", "carol"): + await tracker.record( + make_cost_record( + agent_id=agent, + cost_usd=1.0, + input_tokens=1000, + output_tokens=0, + timestamp=_START + timedelta(hours=1), + ), + ) + + result = await optimizer.analyze_efficiency(start=_START, end=_END) + assert all( + a.efficiency_rating == EfficiencyRating.NORMAL for a in result.agents + ) + assert result.inefficient_agent_count == 0 + + async def test_one_inefficient(self) -> None: + optimizer, tracker = _make_optimizer() + + # Alice: cheap (1.0/1000 = 1.0 per 1k) + await tracker.record( + make_cost_record( + agent_id="alice", + cost_usd=1.0, + input_tokens=1000, + output_tokens=0, + timestamp=_START + timedelta(hours=1), + ), + ) + # Bob: expensive (10.0/1000 = 10.0 per 1k) + await tracker.record( + make_cost_record( + agent_id="bob", + cost_usd=10.0, + input_tokens=1000, + output_tokens=0, + timestamp=_START + timedelta(hours=1), + ), + ) + + result = await optimizer.analyze_efficiency(start=_START, end=_END) + assert result.inefficient_agent_count == 1 + # Sorted by cost_per_1k desc + assert result.agents[0].agent_id == "bob" + assert result.agents[0].efficiency_rating == EfficiencyRating.INEFFICIENT + + async def test_zero_tokens_handled(self) -> None: + optimizer, tracker = _make_optimizer() + + await tracker.record( + make_cost_record( + agent_id="alice", + cost_usd=0.0, + input_tokens=0, + output_tokens=0, + timestamp=_START + timedelta(hours=1), + ), + ) + + result = await optimizer.analyze_efficiency(start=_START, end=_END) + assert len(result.agents) == 1 + assert result.agents[0].cost_per_1k_tokens == 0.0 + assert result.agents[0].efficiency_rating == EfficiencyRating.NORMAL + + async def test_efficient_agent_flagged(self) -> None: + optimizer, tracker = _make_optimizer() + + # Alice: very cheap (0.1/10000 = 0.01 per 1k) + await tracker.record( + make_cost_record( + agent_id="alice", + cost_usd=0.1, + input_tokens=10000, + output_tokens=0, + timestamp=_START + timedelta(hours=1), + ), + ) + # Bob: normal (1.0/1000 = 1.0 per 1k) + await tracker.record( + make_cost_record( + agent_id="bob", + cost_usd=1.0, + input_tokens=1000, + output_tokens=0, + timestamp=_START + timedelta(hours=1), + ), + ) + # Carol: normal (1.0/1000 = 1.0 per 1k) + await tracker.record( + make_cost_record( + agent_id="carol", + cost_usd=1.0, + input_tokens=1000, + output_tokens=0, + timestamp=_START + timedelta(hours=1), + ), + ) + + result = await optimizer.analyze_efficiency(start=_START, end=_END) + alice = next(a for a in result.agents if a.agent_id == "alice") + assert alice.efficiency_rating == EfficiencyRating.EFFICIENT + + async def test_empty_records(self) -> None: + optimizer, _ = _make_optimizer() + result = await optimizer.analyze_efficiency(start=_START, end=_END) + assert result.agents == () + assert result.global_avg_cost_per_1k == 0.0 + + +# ── Downgrade Recommendation Tests ──────────────────────────────── + + +@pytest.mark.unit +class TestRecommendDowngrades: + async def test_no_resolver_empty_result(self) -> None: + optimizer, _ = _make_optimizer() + result = await optimizer.recommend_downgrades(start=_START, end=_END) + assert result.recommendations == () + + async def test_with_downgrade_path(self) -> None: + from ai_company.budget.config import AutoDowngradeConfig + + resolver = _make_resolver() + bc = BudgetConfig( + total_monthly=100.0, + auto_downgrade=AutoDowngradeConfig( + enabled=True, + threshold=80, + downgrade_map=(("large", "small"),), + ), + ) + tracker = CostTracker(budget_config=bc) + optimizer = CostOptimizer( + cost_tracker=tracker, + budget_config=bc, + model_resolver=resolver, + ) + + # Make alice inefficient using large model + await tracker.record( + make_cost_record( + agent_id="alice", + model="test-large-001", + cost_usd=10.0, + input_tokens=1000, + output_tokens=0, + timestamp=_START + timedelta(hours=1), + ), + ) + # Make bob efficient using small model + await tracker.record( + make_cost_record( + agent_id="bob", + model="test-small-001", + cost_usd=0.1, + input_tokens=1000, + output_tokens=0, + timestamp=_START + timedelta(hours=1), + ), + ) + + result = await optimizer.recommend_downgrades(start=_START, end=_END) + assert len(result.recommendations) == 1 + rec = result.recommendations[0] + assert rec.agent_id == "alice" + assert rec.current_model == "test-large-001" + assert rec.recommended_model == "test-small-001" + assert rec.estimated_savings_per_1k > 0 + + async def test_no_cheaper_model_empty(self) -> None: + """No recommendation when agent already uses cheapest model.""" + resolver = _make_resolver( + [ + ResolvedModel( + provider_name="test-provider", + model_id="test-only-001", + alias="only", + cost_per_1k_input=0.01, + cost_per_1k_output=0.02, + ), + ] + ) + bc = BudgetConfig(total_monthly=100.0) + tracker = CostTracker(budget_config=bc) + optimizer = CostOptimizer( + cost_tracker=tracker, + budget_config=bc, + model_resolver=resolver, + ) + + # Only agent, only model — inefficient by default since it's the only one + await tracker.record( + make_cost_record( + agent_id="alice", + model="test-only-001", + cost_usd=10.0, + input_tokens=1000, + output_tokens=0, + timestamp=_START + timedelta(hours=1), + ), + ) + + result = await optimizer.recommend_downgrades(start=_START, end=_END) + assert result.recommendations == () + + +# ── Evaluate Operation Tests ────────────────────────────────────── + + +@pytest.mark.unit +class TestEvaluateOperation: + async def test_healthy_budget_approved(self) -> None: + optimizer, tracker = _make_optimizer() + # Spend only 10% of budget + await tracker.record( + make_cost_record(cost_usd=10.0, timestamp=_START + timedelta(hours=1)), + ) + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=0.5, + now=_START + timedelta(days=15), + ) + assert decision.approved is True + assert decision.alert_level == BudgetAlertLevel.NORMAL + + async def test_hard_stop_denied(self) -> None: + bc = BudgetConfig( + total_monthly=100.0, + alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), + ) + optimizer, tracker = _make_optimizer(budget_config=bc) + + # Spend 100% of budget + await tracker.record( + make_cost_record(cost_usd=100.0, timestamp=_START + timedelta(hours=1)), + ) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=1.0, + now=_START + timedelta(days=15), + ) + assert decision.approved is False + assert decision.alert_level == BudgetAlertLevel.HARD_STOP + + async def test_would_exceed_budget_denied(self) -> None: + bc = BudgetConfig( + total_monthly=100.0, + alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), + ) + optimizer, tracker = _make_optimizer(budget_config=bc) + + # Spend 95% and request 10 more + await tracker.record( + make_cost_record(cost_usd=95.0, timestamp=_START + timedelta(hours=1)), + ) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=10.0, + now=_START + timedelta(days=15), + ) + assert decision.approved is False + assert "would exceed" in decision.reason + + async def test_warning_level_approved_with_conditions(self) -> None: + bc = BudgetConfig( + total_monthly=100.0, + alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), + ) + optimizer, tracker = _make_optimizer(budget_config=bc) + + # Spend 80% (warning level) + await tracker.record( + make_cost_record(cost_usd=80.0, timestamp=_START + timedelta(hours=1)), + ) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=2.0, + now=_START + timedelta(days=15), + ) + assert decision.approved is True + assert decision.alert_level == BudgetAlertLevel.WARNING + assert len(decision.conditions) > 0 + + async def test_budget_enforcement_disabled(self) -> None: + bc = BudgetConfig(total_monthly=0.0) + optimizer, _ = _make_optimizer(budget_config=bc) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=100.0, + ) + assert decision.approved is True + assert "disabled" in decision.reason.lower() + + async def test_critical_level_auto_deny_with_custom_config(self) -> None: + """Auto-deny at CRITICAL when configured.""" + bc = BudgetConfig( + total_monthly=100.0, + alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), + ) + config = CostOptimizerConfig( + approval_auto_deny_alert_level=BudgetAlertLevel.CRITICAL, + ) + optimizer, tracker = _make_optimizer(budget_config=bc, config=config) + + # Spend 92% (critical level) + await tracker.record( + make_cost_record(cost_usd=92.0, timestamp=_START + timedelta(hours=1)), + ) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=0.01, + now=_START + timedelta(days=15), + ) + assert decision.approved is False + assert decision.alert_level == BudgetAlertLevel.CRITICAL + + async def test_high_cost_condition(self) -> None: + """High-cost warning condition when estimated cost >= threshold.""" + config = CostOptimizerConfig(approval_warn_threshold_usd=0.5) + optimizer, _ = _make_optimizer(config=config) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=1.0, + now=_START + timedelta(days=15), + ) + assert decision.approved is True + assert any("High-cost" in c for c in decision.conditions) diff --git a/tests/unit/budget/test_optimizer_models.py b/tests/unit/budget/test_optimizer_models.py new file mode 100644 index 0000000000..ca1bca81e5 --- /dev/null +++ b/tests/unit/budget/test_optimizer_models.py @@ -0,0 +1,335 @@ +"""Tests for CFO optimizer domain models.""" + +from datetime import UTC, datetime + +import pytest + +from ai_company.budget.enums import BudgetAlertLevel +from ai_company.budget.optimizer_models import ( + AgentEfficiency, + AnomalyDetectionResult, + AnomalySeverity, + AnomalyType, + ApprovalDecision, + CostOptimizerConfig, + DowngradeAnalysis, + DowngradeRecommendation, + EfficiencyAnalysis, + EfficiencyRating, + SpendingAnomaly, +) + +# ── Enum Tests ──────────────────────────────────────────────────── + + +class TestAnomalyType: + @pytest.mark.unit + def test_values(self) -> None: + assert AnomalyType.SPIKE.value == "spike" + assert AnomalyType.SUSTAINED_HIGH.value == "sustained_high" + assert AnomalyType.RATE_INCREASE.value == "rate_increase" + + @pytest.mark.unit + def test_member_count(self) -> None: + assert len(AnomalyType) == 3 + + +class TestAnomalySeverity: + @pytest.mark.unit + def test_values(self) -> None: + assert AnomalySeverity.LOW.value == "low" + assert AnomalySeverity.MEDIUM.value == "medium" + assert AnomalySeverity.HIGH.value == "high" + + +class TestEfficiencyRating: + @pytest.mark.unit + def test_values(self) -> None: + assert EfficiencyRating.EFFICIENT.value == "efficient" + assert EfficiencyRating.NORMAL.value == "normal" + assert EfficiencyRating.INEFFICIENT.value == "inefficient" + + +# ── SpendingAnomaly Tests ───────────────────────────────────────── + + +class TestSpendingAnomaly: + @pytest.mark.unit + def test_construction(self) -> None: + anomaly = SpendingAnomaly( + agent_id="alice", + anomaly_type=AnomalyType.SPIKE, + severity=AnomalySeverity.HIGH, + description="Test spike", + current_value=10.0, + baseline_value=2.0, + deviation_factor=4.0, + detected_at=datetime(2026, 3, 1, 12, 0, tzinfo=UTC), + period_start=datetime(2026, 2, 28, tzinfo=UTC), + period_end=datetime(2026, 3, 1, tzinfo=UTC), + ) + assert anomaly.agent_id == "alice" + assert anomaly.anomaly_type == AnomalyType.SPIKE + assert anomaly.severity == AnomalySeverity.HIGH + assert anomaly.current_value == 10.0 + assert anomaly.baseline_value == 2.0 + + @pytest.mark.unit + def test_frozen(self) -> None: + anomaly = SpendingAnomaly( + agent_id="alice", + anomaly_type=AnomalyType.SPIKE, + severity=AnomalySeverity.LOW, + description="Test", + current_value=1.0, + baseline_value=0.5, + deviation_factor=1.5, + detected_at=datetime(2026, 3, 1, tzinfo=UTC), + period_start=datetime(2026, 2, 28, tzinfo=UTC), + period_end=datetime(2026, 3, 1, tzinfo=UTC), + ) + with pytest.raises(Exception): # noqa: B017, PT011 + anomaly.agent_id = "bob" # type: ignore[misc] + + @pytest.mark.unit + def test_period_ordering_invalid(self) -> None: + with pytest.raises(ValueError, match="period_start"): + SpendingAnomaly( + agent_id="alice", + anomaly_type=AnomalyType.SPIKE, + severity=AnomalySeverity.LOW, + description="Test", + current_value=1.0, + baseline_value=0.5, + deviation_factor=1.5, + detected_at=datetime(2026, 3, 1, tzinfo=UTC), + period_start=datetime(2026, 3, 2, tzinfo=UTC), + period_end=datetime(2026, 3, 1, tzinfo=UTC), + ) + + +# ── AnomalyDetectionResult Tests ───────────────────────────────── + + +class TestAnomalyDetectionResult: + @pytest.mark.unit + def test_empty_result(self) -> None: + result = AnomalyDetectionResult( + anomalies=(), + scan_period_start=datetime(2026, 2, 1, tzinfo=UTC), + scan_period_end=datetime(2026, 3, 1, tzinfo=UTC), + agents_scanned=0, + scan_timestamp=datetime(2026, 3, 1, tzinfo=UTC), + ) + assert result.anomalies == () + assert result.agents_scanned == 0 + + @pytest.mark.unit + def test_period_ordering_invalid(self) -> None: + with pytest.raises(ValueError, match="scan_period_start"): + AnomalyDetectionResult( + scan_period_start=datetime(2026, 3, 1, tzinfo=UTC), + scan_period_end=datetime(2026, 2, 1, tzinfo=UTC), + agents_scanned=0, + scan_timestamp=datetime(2026, 3, 1, tzinfo=UTC), + ) + + +# ── AgentEfficiency Tests ───────────────────────────────────────── + + +class TestAgentEfficiency: + @pytest.mark.unit + def test_construction(self) -> None: + eff = AgentEfficiency( + agent_id="alice", + total_cost_usd=5.0, + total_tokens=100000, + cost_per_1k_tokens=0.05, + record_count=50, + efficiency_rating=EfficiencyRating.NORMAL, + ) + assert eff.agent_id == "alice" + assert eff.total_cost_usd == 5.0 + assert eff.efficiency_rating == EfficiencyRating.NORMAL + + @pytest.mark.unit + def test_zero_tokens(self) -> None: + eff = AgentEfficiency( + agent_id="alice", + total_cost_usd=0.0, + total_tokens=0, + cost_per_1k_tokens=0.0, + record_count=0, + efficiency_rating=EfficiencyRating.NORMAL, + ) + assert eff.total_tokens == 0 + assert eff.cost_per_1k_tokens == 0.0 + + +# ── EfficiencyAnalysis Tests ───────────────────────────────────── + + +class TestEfficiencyAnalysis: + @pytest.mark.unit + def test_empty_analysis(self) -> None: + analysis = EfficiencyAnalysis( + agents=(), + global_avg_cost_per_1k=0.0, + analysis_period_start=datetime(2026, 2, 1, tzinfo=UTC), + analysis_period_end=datetime(2026, 3, 1, tzinfo=UTC), + inefficient_agent_count=0, + ) + assert analysis.agents == () + assert analysis.inefficient_agent_count == 0 + + @pytest.mark.unit + def test_period_ordering_invalid(self) -> None: + with pytest.raises(ValueError, match="analysis_period_start"): + EfficiencyAnalysis( + agents=(), + global_avg_cost_per_1k=0.0, + analysis_period_start=datetime(2026, 3, 1, tzinfo=UTC), + analysis_period_end=datetime(2026, 2, 1, tzinfo=UTC), + inefficient_agent_count=0, + ) + + +# ── DowngradeRecommendation Tests ───────────────────────────────── + + +class TestDowngradeRecommendation: + @pytest.mark.unit + def test_construction(self) -> None: + rec = DowngradeRecommendation( + agent_id="alice", + current_model="test-large-001", + recommended_model="test-small-001", + estimated_savings_per_1k=0.05, + reason="Switch to cheaper model", + ) + assert rec.agent_id == "alice" + assert rec.estimated_savings_per_1k == 0.05 + + @pytest.mark.unit + def test_frozen(self) -> None: + rec = DowngradeRecommendation( + agent_id="alice", + current_model="test-large-001", + recommended_model="test-small-001", + estimated_savings_per_1k=0.05, + reason="Switch to cheaper model", + ) + with pytest.raises(Exception): # noqa: B017, PT011 + rec.agent_id = "bob" # type: ignore[misc] + + +# ── DowngradeAnalysis Tests ─────────────────────────────────────── + + +class TestDowngradeAnalysis: + @pytest.mark.unit + def test_empty_analysis(self) -> None: + analysis = DowngradeAnalysis( + recommendations=(), + total_estimated_monthly_savings=0.0, + budget_pressure_percent=0.0, + ) + assert analysis.recommendations == () + assert analysis.total_estimated_monthly_savings == 0.0 + + +# ── ApprovalDecision Tests ──────────────────────────────────────── + + +class TestApprovalDecision: + @pytest.mark.unit + def test_approved(self) -> None: + decision = ApprovalDecision( + approved=True, + reason="Approved", + budget_remaining_usd=50.0, + budget_used_percent=50.0, + alert_level=BudgetAlertLevel.NORMAL, + conditions=(), + ) + assert decision.approved is True + assert decision.budget_remaining_usd == 50.0 + + @pytest.mark.unit + def test_denied(self) -> None: + decision = ApprovalDecision( + approved=False, + reason="Budget exhausted", + budget_remaining_usd=0.0, + budget_used_percent=100.0, + alert_level=BudgetAlertLevel.HARD_STOP, + ) + assert decision.approved is False + assert decision.alert_level == BudgetAlertLevel.HARD_STOP + + @pytest.mark.unit + def test_with_conditions(self) -> None: + decision = ApprovalDecision( + approved=True, + reason="Approved with conditions", + budget_remaining_usd=20.0, + budget_used_percent=80.0, + alert_level=BudgetAlertLevel.WARNING, + conditions=("High cost operation", "Budget is running low"), + ) + assert len(decision.conditions) == 2 + + +# ── CostOptimizerConfig Tests ──────────────────────────────────── + + +class TestCostOptimizerConfig: + @pytest.mark.unit + def test_defaults(self) -> None: + config = CostOptimizerConfig() + assert config.anomaly_sigma_threshold == 2.0 + assert config.anomaly_spike_factor == 3.0 + assert config.inefficiency_threshold_factor == 1.5 + assert config.approval_auto_deny_alert_level == BudgetAlertLevel.HARD_STOP + assert config.approval_warn_threshold_usd == 1.0 + assert config.min_anomaly_windows == 3 + + @pytest.mark.unit + def test_custom_values(self) -> None: + config = CostOptimizerConfig( + anomaly_sigma_threshold=3.0, + anomaly_spike_factor=5.0, + inefficiency_threshold_factor=2.0, + approval_auto_deny_alert_level=BudgetAlertLevel.CRITICAL, + approval_warn_threshold_usd=2.5, + min_anomaly_windows=4, + ) + assert config.anomaly_sigma_threshold == 3.0 + assert config.anomaly_spike_factor == 5.0 + + @pytest.mark.unit + def test_sigma_must_be_positive(self) -> None: + with pytest.raises(ValueError, match="greater than 0"): + CostOptimizerConfig(anomaly_sigma_threshold=0.0) + + @pytest.mark.unit + def test_spike_factor_must_exceed_one(self) -> None: + with pytest.raises(ValueError, match="greater than 1"): + CostOptimizerConfig(anomaly_spike_factor=1.0) + + @pytest.mark.unit + def test_inefficiency_factor_must_exceed_one(self) -> None: + with pytest.raises(ValueError, match="greater than 1"): + CostOptimizerConfig(inefficiency_threshold_factor=0.5) + + @pytest.mark.unit + def test_min_anomaly_windows_minimum(self) -> None: + with pytest.raises(ValueError, match="greater than or equal to 2"): + CostOptimizerConfig(min_anomaly_windows=1) + + @pytest.mark.unit + def test_frozen(self) -> None: + config = CostOptimizerConfig() + with pytest.raises(Exception): # noqa: B017, PT011 + config.anomaly_sigma_threshold = 5.0 # type: ignore[misc] diff --git a/tests/unit/budget/test_reports.py b/tests/unit/budget/test_reports.py new file mode 100644 index 0000000000..4c0172f719 --- /dev/null +++ b/tests/unit/budget/test_reports.py @@ -0,0 +1,337 @@ +"""Tests for ReportGenerator service and report models.""" + +from datetime import UTC, datetime, timedelta + +import pytest + +from ai_company.budget.config import BudgetConfig +from ai_company.budget.reports import ( + ModelDistribution, + PeriodComparison, + ProviderDistribution, + ReportGenerator, + TaskSpending, +) +from ai_company.budget.tracker import CostTracker +from tests.unit.budget.conftest import make_cost_record + +# ── Helpers ─────────────────────────────────────────────────────── + +_START = datetime(2026, 2, 1, tzinfo=UTC) +_END = datetime(2026, 3, 1, tzinfo=UTC) + + +def _make_report_generator( + *, + budget_config: BudgetConfig | None = None, +) -> tuple[ReportGenerator, CostTracker]: + """Build a ReportGenerator with a fresh CostTracker.""" + bc = budget_config or BudgetConfig(total_monthly=100.0) + tracker = CostTracker(budget_config=bc) + gen = ReportGenerator(cost_tracker=tracker, budget_config=bc) + return gen, tracker + + +# ── Report Model Tests ──────────────────────────────────────────── + + +@pytest.mark.unit +class TestTaskSpending: + def test_construction(self) -> None: + ts = TaskSpending( + task_id="task-001", + total_cost_usd=5.0, + total_tokens=10000, + record_count=10, + ) + assert ts.task_id == "task-001" + assert ts.total_cost_usd == 5.0 + + def test_frozen(self) -> None: + ts = TaskSpending( + task_id="task-001", + total_cost_usd=5.0, + total_tokens=10000, + record_count=10, + ) + with pytest.raises(Exception): # noqa: B017, PT011 + ts.task_id = "other" # type: ignore[misc] + + +@pytest.mark.unit +class TestProviderDistribution: + def test_construction(self) -> None: + pd = ProviderDistribution( + provider="test-provider", + total_cost_usd=50.0, + record_count=100, + percentage_of_total=100.0, + ) + assert pd.provider == "test-provider" + assert pd.percentage_of_total == 100.0 + + +@pytest.mark.unit +class TestModelDistribution: + def test_construction(self) -> None: + md = ModelDistribution( + model="test-model-001", + provider="test-provider", + total_cost_usd=50.0, + record_count=100, + percentage_of_total=50.0, + ) + assert md.model == "test-model-001" + + +@pytest.mark.unit +class TestPeriodComparison: + def test_construction(self) -> None: + pc = PeriodComparison( + current_period_cost=100.0, + previous_period_cost=80.0, + cost_change_usd=20.0, + cost_change_percent=25.0, + ) + assert pc.cost_change_usd == 20.0 + assert pc.cost_change_percent == 25.0 + + def test_negative_change(self) -> None: + pc = PeriodComparison( + current_period_cost=60.0, + previous_period_cost=80.0, + cost_change_usd=-20.0, + cost_change_percent=-25.0, + ) + assert pc.cost_change_usd == -20.0 + assert pc.cost_change_percent == -25.0 + + def test_no_previous_data(self) -> None: + pc = PeriodComparison( + current_period_cost=50.0, + previous_period_cost=0.0, + cost_change_usd=50.0, + cost_change_percent=None, + ) + assert pc.cost_change_percent is None + + +# ── ReportGenerator Tests ───────────────────────────────────────── + + +@pytest.mark.unit +class TestReportGenerator: + async def test_init(self) -> None: + gen, _ = _make_report_generator() + assert gen._cost_tracker is not None + assert gen._budget_config is not None + + async def test_generate_report_no_records(self) -> None: + gen, _ = _make_report_generator() + report = await gen.generate_report(start=_START, end=_END) + assert report.by_task == () + assert report.by_provider == () + assert report.by_model == () + assert report.summary.period.total_cost_usd == 0.0 + + async def test_generate_report_multiple_agents_tasks(self) -> None: + gen, tracker = _make_report_generator() + + await tracker.record( + make_cost_record( + agent_id="alice", + task_id="task-a", + cost_usd=3.0, + timestamp=_START + timedelta(hours=1), + ), + ) + await tracker.record( + make_cost_record( + agent_id="bob", + task_id="task-b", + cost_usd=5.0, + timestamp=_START + timedelta(hours=2), + ), + ) + await tracker.record( + make_cost_record( + agent_id="alice", + task_id="task-a", + cost_usd=2.0, + timestamp=_START + timedelta(hours=3), + ), + ) + + report = await gen.generate_report(start=_START, end=_END) + assert report.summary.period.total_cost_usd == 10.0 + assert len(report.by_task) == 2 + + # task-a has 5.0, task-b has 5.0 + task_a = next(t for t in report.by_task if t.task_id == "task-a") + assert task_a.total_cost_usd == 5.0 + assert task_a.record_count == 2 + + async def test_provider_distribution_percentages(self) -> None: + gen, tracker = _make_report_generator() + + await tracker.record( + make_cost_record( + provider="provider-a", + cost_usd=3.0, + timestamp=_START + timedelta(hours=1), + ), + ) + await tracker.record( + make_cost_record( + provider="provider-b", + cost_usd=7.0, + timestamp=_START + timedelta(hours=2), + ), + ) + + report = await gen.generate_report(start=_START, end=_END) + assert len(report.by_provider) == 2 + total_pct = sum(p.percentage_of_total for p in report.by_provider) + assert abs(total_pct - 100.0) < 0.01 + + async def test_model_distribution(self) -> None: + gen, tracker = _make_report_generator() + + await tracker.record( + make_cost_record( + model="model-a", + cost_usd=4.0, + timestamp=_START + timedelta(hours=1), + ), + ) + await tracker.record( + make_cost_record( + model="model-b", + cost_usd=6.0, + timestamp=_START + timedelta(hours=2), + ), + ) + + report = await gen.generate_report(start=_START, end=_END) + assert len(report.by_model) == 2 + model_a = next(m for m in report.by_model if m.model == "model-a") + assert model_a.total_cost_usd == 4.0 + + async def test_period_comparison_cost_increase(self) -> None: + gen, tracker = _make_report_generator() + + # Previous period data + prev_start = _START - (_END - _START) + await tracker.record( + make_cost_record( + cost_usd=5.0, + timestamp=prev_start + timedelta(hours=1), + ), + ) + # Current period data + await tracker.record( + make_cost_record( + cost_usd=8.0, + timestamp=_START + timedelta(hours=1), + ), + ) + + report = await gen.generate_report(start=_START, end=_END) + assert report.period_comparison is not None + assert report.period_comparison.current_period_cost == 8.0 + assert report.period_comparison.previous_period_cost == 5.0 + assert report.period_comparison.cost_change_usd == 3.0 + assert report.period_comparison.cost_change_percent == 60.0 + + async def test_no_prior_data_no_comparison(self) -> None: + gen, _ = _make_report_generator() + report = await gen.generate_report(start=_START, end=_END) + # Both periods have zero cost → no comparison + assert report.period_comparison is None + + async def test_top_n_agents(self) -> None: + gen, tracker = _make_report_generator() + + for i, agent in enumerate(["alice", "bob", "carol", "dave", "eve"]): + await tracker.record( + make_cost_record( + agent_id=agent, + cost_usd=float(i + 1), + timestamp=_START + timedelta(hours=i + 1), + ), + ) + + report = await gen.generate_report( + start=_START, + end=_END, + top_n=3, + ) + assert len(report.top_agents_by_cost) == 3 + # Sorted descending + assert report.top_agents_by_cost[0][0] == "eve" + assert report.top_agents_by_cost[0][1] == 5.0 + + async def test_top_n_tasks(self) -> None: + gen, tracker = _make_report_generator() + + for i, task in enumerate(["t1", "t2", "t3", "t4"]): + await tracker.record( + make_cost_record( + task_id=task, + cost_usd=float(i + 1) * 2, + timestamp=_START + timedelta(hours=i + 1), + ), + ) + + report = await gen.generate_report( + start=_START, + end=_END, + top_n=2, + ) + assert len(report.top_tasks_by_cost) == 2 + assert report.top_tasks_by_cost[0][0] == "t4" + + async def test_top_n_validation(self) -> None: + gen, _ = _make_report_generator() + with pytest.raises(ValueError, match="top_n must be >= 1"): + await gen.generate_report(start=_START, end=_END, top_n=0) + + async def test_period_comparison_cost_decrease(self) -> None: + gen, tracker = _make_report_generator() + + prev_start = _START - (_END - _START) + await tracker.record( + make_cost_record( + cost_usd=10.0, + timestamp=prev_start + timedelta(hours=1), + ), + ) + await tracker.record( + make_cost_record( + cost_usd=3.0, + timestamp=_START + timedelta(hours=1), + ), + ) + + report = await gen.generate_report(start=_START, end=_END) + assert report.period_comparison is not None + assert report.period_comparison.cost_change_usd == -7.0 + assert report.period_comparison.cost_change_percent is not None + assert report.period_comparison.cost_change_percent < 0 + + async def test_skip_period_comparison(self) -> None: + gen, tracker = _make_report_generator() + + await tracker.record( + make_cost_record( + cost_usd=5.0, + timestamp=_START + timedelta(hours=1), + ), + ) + + report = await gen.generate_report( + start=_START, + end=_END, + include_period_comparison=False, + ) + assert report.period_comparison is None diff --git a/tests/unit/budget/test_tracker_get_records.py b/tests/unit/budget/test_tracker_get_records.py new file mode 100644 index 0000000000..eef314f113 --- /dev/null +++ b/tests/unit/budget/test_tracker_get_records.py @@ -0,0 +1,122 @@ +"""Tests for CostTracker.get_records() method.""" + +from datetime import UTC, datetime, timedelta + +import pytest + +from ai_company.budget.tracker import CostTracker +from tests.unit.budget.conftest import make_cost_record + + +@pytest.mark.unit +class TestGetRecords: + """Tests for the get_records query method.""" + + async def test_empty_tracker_returns_empty_tuple(self) -> None: + tracker = CostTracker() + result = await tracker.get_records() + assert result == () + + async def test_returns_all_records_unfiltered(self) -> None: + tracker = CostTracker() + r1 = make_cost_record(agent_id="alice", task_id="t1") + r2 = make_cost_record(agent_id="bob", task_id="t2") + await tracker.record(r1) + await tracker.record(r2) + + result = await tracker.get_records() + assert len(result) == 2 + + async def test_filter_by_agent_id(self) -> None: + tracker = CostTracker() + await tracker.record(make_cost_record(agent_id="alice")) + await tracker.record(make_cost_record(agent_id="bob")) + + result = await tracker.get_records(agent_id="alice") + assert len(result) == 1 + assert result[0].agent_id == "alice" + + async def test_filter_by_task_id(self) -> None: + tracker = CostTracker() + await tracker.record(make_cost_record(task_id="task-a")) + await tracker.record(make_cost_record(task_id="task-b")) + + result = await tracker.get_records(task_id="task-a") + assert len(result) == 1 + assert result[0].task_id == "task-a" + + async def test_filter_by_time_range(self) -> None: + tracker = CostTracker() + t1 = datetime(2026, 2, 10, tzinfo=UTC) + t2 = datetime(2026, 2, 20, tzinfo=UTC) + t3 = datetime(2026, 2, 25, tzinfo=UTC) + await tracker.record(make_cost_record(timestamp=t1)) + await tracker.record(make_cost_record(timestamp=t2)) + await tracker.record(make_cost_record(timestamp=t3)) + + result = await tracker.get_records( + start=datetime(2026, 2, 15, tzinfo=UTC), + end=datetime(2026, 2, 22, tzinfo=UTC), + ) + assert len(result) == 1 + assert result[0].timestamp == t2 + + async def test_combined_filters(self) -> None: + tracker = CostTracker() + t1 = datetime(2026, 2, 10, tzinfo=UTC) + t2 = datetime(2026, 2, 20, tzinfo=UTC) + await tracker.record( + make_cost_record(agent_id="alice", task_id="t1", timestamp=t1), + ) + await tracker.record( + make_cost_record(agent_id="alice", task_id="t2", timestamp=t2), + ) + await tracker.record( + make_cost_record(agent_id="bob", task_id="t1", timestamp=t2), + ) + + result = await tracker.get_records( + agent_id="alice", + start=datetime(2026, 2, 15, tzinfo=UTC), + ) + assert len(result) == 1 + assert result[0].task_id == "t2" + + async def test_returns_immutable_tuple(self) -> None: + tracker = CostTracker() + await tracker.record(make_cost_record()) + result = await tracker.get_records() + assert isinstance(result, tuple) + + async def test_no_matches_returns_empty(self) -> None: + tracker = CostTracker() + await tracker.record(make_cost_record(agent_id="alice")) + result = await tracker.get_records(agent_id="nonexistent") + assert result == () + + async def test_invalid_time_range_raises(self) -> None: + tracker = CostTracker() + with pytest.raises(ValueError, match=r"start.*before.*end"): + await tracker.get_records( + start=datetime(2026, 3, 1, tzinfo=UTC), + end=datetime(2026, 2, 1, tzinfo=UTC), + ) + + async def test_start_inclusive_end_exclusive(self) -> None: + tracker = CostTracker() + boundary = datetime(2026, 2, 15, tzinfo=UTC) + before = boundary - timedelta(seconds=1) + after = boundary + timedelta(seconds=1) + + await tracker.record(make_cost_record(timestamp=before)) + await tracker.record(make_cost_record(timestamp=boundary)) + await tracker.record(make_cost_record(timestamp=after)) + + # start=boundary should include boundary + result = await tracker.get_records(start=boundary) + assert len(result) == 2 + + # end=boundary should exclude boundary + result = await tracker.get_records(end=boundary) + assert len(result) == 1 + assert result[0].timestamp == before diff --git a/tests/unit/observability/test_events.py b/tests/unit/observability/test_events.py index dde625f0b9..693fc127fe 100644 --- a/tests/unit/observability/test_events.py +++ b/tests/unit/observability/test_events.py @@ -176,6 +176,7 @@ def test_all_domain_modules_discovered(self) -> None: """Every expected domain module is found by pkgutil discovery.""" expected = { "budget", + "cfo", "classification", "communication", "company", From 9048bf83df8be15a3ff7c20fee1e37455596d017 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:20:32 +0100 Subject: [PATCH 2/4] fix: pre-PR review fixes for CFO optimizer, reports, and model improvements Pre-reviewed by 9 agents, 35 findings addressed. --- CLAUDE.md | 4 +- DESIGN_SPEC.md | 11 +- README.md | 2 +- src/ai_company/budget/optimizer.py | 249 ++++++++++++------ src/ai_company/budget/optimizer_models.py | 73 +++-- src/ai_company/budget/reports.py | 71 +++-- src/ai_company/budget/tracker.py | 8 +- src/ai_company/observability/events/budget.py | 2 + src/ai_company/observability/events/cfo.py | 4 +- tests/unit/budget/test_optimizer.py | 172 +++++++++++- tests/unit/budget/test_optimizer_models.py | 72 ++++- tests/unit/budget/test_reports.py | 79 +++++- 12 files changed, 568 insertions(+), 179 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 94b9ef1adb..bc2d9a223b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -44,7 +44,7 @@ uv run pre-commit run --all-files # all pre-commit hooks ```text src/ai_company/ api/ # FastAPI REST + WebSocket routes - budget/ # Cost tracking, budget enforcement (pre-flight/in-flight checks, auto-downgrade), billing periods, cost tiers, quota/subscription tracking + budget/ # Cost tracking, budget enforcement (pre-flight/in-flight checks, auto-downgrade), billing periods, cost tiers, quota/subscription tracking, CFO cost optimization (anomaly detection, efficiency analysis, downgrade recommendations, approval decisions), spending reports cli/ # Typer CLI commands communication/ # Message bus, dispatcher, messenger, channels, delegation, loop prevention, conflict resolution, meeting protocol config/ # YAML company config loading and validation @@ -83,7 +83,7 @@ src/ai_company/ - **Every module** with business logic MUST have: `from ai_company.observability import get_logger` then `logger = get_logger(__name__)` - **Never** use `import logging` / `logging.getLogger()` / `print()` in application code - **Variable name**: always `logger` (not `_logger`, not `log`) -- **Event names**: always use constants from the domain-specific module under `ai_company.observability.events` (e.g. `PROVIDER_CALL_START` from `events.provider`, `BUDGET_RECORD_ADDED` from `events.budget`, `CONFLICT_DETECTED` from `events.conflict`, `MEETING_STARTED` from `events.meeting`, `CLASSIFICATION_START` from `events.classification`). Import directly: `from ai_company.observability.events. import EVENT_CONSTANT` +- **Event names**: always use constants from the domain-specific module under `ai_company.observability.events` (e.g. `PROVIDER_CALL_START` from `events.provider`, `BUDGET_RECORD_ADDED` from `events.budget`, `CFO_ANOMALY_DETECTED` from `events.cfo`, `CONFLICT_DETECTED` from `events.conflict`, `MEETING_STARTED` from `events.meeting`, `CLASSIFICATION_START` from `events.classification`). Import directly: `from ai_company.observability.events. import EVENT_CONSTANT` - **Structured kwargs**: always `logger.info(EVENT, key=value)` — never `logger.info("msg %s", val)` - **All error paths** must log at WARNING or ERROR with context before raising - **All state transitions** must log at INFO diff --git a/DESIGN_SPEC.md b/DESIGN_SPEC.md index 8aa47adbbd..ea8e8eb8f7 100644 --- a/DESIGN_SPEC.md +++ b/DESIGN_SPEC.md @@ -1845,6 +1845,13 @@ The CFO agent (when enabled) acts as a cost management system: - Blocks tasks that would exceed remaining budget - Optimizes model routing for cost/quality balance +> **Implementation note (M5):** `CostOptimizer` service (`budget/optimizer.py`) +> implements anomaly detection (sigma + spike factor), per-agent efficiency +> analysis, model downgrade recommendations (via `ModelResolver`), and +> operation approval evaluation. `ReportGenerator` service +> (`budget/reports.py`) produces multi-dimensional spending reports with +> task/provider/model breakdowns and period-over-period comparison. + ### 10.4 Cost Controls > **Minimal config:** @@ -2839,6 +2846,7 @@ ai-company/ │ │ ├── events/ # Per-domain event constants │ │ │ ├── __init__.py # Package marker with usage docs; no re-exports │ │ │ ├── budget.py # BUDGET_* constants +│ │ │ ├── cfo.py # CFO_* constants │ │ │ ├── classification.py # CLASSIFICATION_* constants │ │ │ ├── company.py # COMPANY_* constants │ │ │ ├── communication.py # COMM_* constants @@ -2942,7 +2950,8 @@ ai-company/ │ │ ├── enums.py # Budget-related enums │ │ ├── billing.py # Billing period computation utilities │ │ ├── enforcer.py # BudgetEnforcer service (pre-flight, in-flight, auto-downgrade) -│ │ ├── optimizer.py # Cost optimization / CFO logic (M5) +│ │ ├── optimizer.py # CostOptimizer service — anomaly detection, efficiency analysis, downgrade recommendations, approval decisions (M5) +│ │ ├── optimizer_models.py # CostOptimizer domain models — anomaly, efficiency, downgrade, approval, config (M5) │ │ ├── quota.py # Quota/subscription models, degradation config, quota snapshots │ │ ├── quota_tracker.py # QuotaTracker service: per-provider request/token quota enforcement │ │ └── reports.py # Spending reports (M5) diff --git a/README.md b/README.md index 015654b377..7512edc68e 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ AI Company lets you spin up a virtual organization staffed entirely by AI agents - **Persistence Layer (M5)** - Pluggable `PersistenceBackend` protocol with SQLite backend (aiosqlite), repository protocols, schema migrations - **Memory Interface (M5)** - Pluggable `MemoryBackend` protocol with capability discovery, shared knowledge protocol, domain models, config, factory, and context injection retrieval pipeline (ranking, token-budget formatting) - **Coordination Error Taxonomy (M5)** - Post-execution classification pipeline detecting logical contradictions, numerical drift, context omissions, and coordination failures -- **Budget Enforcement (M5)** - `BudgetEnforcer` service with pre-flight checks, in-flight budget checking, auto-downgrade, configurable cost tiers, and quota/subscription tracking; CFO agent and advanced reporting pending +- **Budget Enforcement (M5)** - `BudgetEnforcer` service with pre-flight checks, in-flight budget checking, auto-downgrade, configurable cost tiers, and quota/subscription tracking; `CostOptimizer` CFO service with anomaly detection, efficiency analysis, downgrade recommendations, and approval decisions; `ReportGenerator` for multi-dimensional spending reports ### Not implemented yet (planned milestones) diff --git a/src/ai_company/budget/optimizer.py b/src/ai_company/budget/optimizer.py index 8f020d06e8..86f186fe0b 100644 --- a/src/ai_company/budget/optimizer.py +++ b/src/ai_company/budget/optimizer.py @@ -38,9 +38,12 @@ CFO_ANOMALY_SCAN_COMPLETE, CFO_APPROVAL_EVALUATED, CFO_DOWNGRADE_RECOMMENDED, + CFO_DOWNGRADE_SKIPPED, CFO_EFFICIENCY_ANALYSIS_COMPLETE, + CFO_INSUFFICIENT_WINDOWS, CFO_OPERATION_DENIED, CFO_OPTIMIZER_CREATED, + CFO_RESOLVER_MISSING, ) if TYPE_CHECKING: @@ -54,8 +57,10 @@ logger = get_logger(__name__) -# ── Alert level ordering (reused from enforcer pattern) ────────── +# Agents spending below this fraction of global average are rated EFFICIENT +_EFFICIENCY_LOWER_BOUND = 0.8 +# Same ordering as BudgetEnforcer._ALERT_ORDER _ALERT_LEVEL_ORDER: dict[BudgetAlertLevel, int] = { BudgetAlertLevel.NORMAL: 0, BudgetAlertLevel.WARNING: 1, @@ -123,6 +128,9 @@ async def detect_anomalies( Raises: ValueError: If ``start >= end`` or ``window_count < 2``. """ + if start >= end: + msg = f"start ({start.isoformat()}) must be before end ({end.isoformat()})" + raise ValueError(msg) if window_count < 2: # noqa: PLR2004 msg = f"window_count must be >= 2, got {window_count}" raise ValueError(msg) @@ -202,64 +210,27 @@ async def analyze_efficiency( Raises: ValueError: If ``start >= end``. """ + if start >= end: + msg = f"start ({start.isoformat()}) must be before end ({end.isoformat()})" + raise ValueError(msg) + records = await self._cost_tracker.get_records( start=start, end=end, ) - by_agent: dict[str, list[CostRecord]] = defaultdict(list) - for r in records: - by_agent[r.agent_id].append(r) - - global_avg = _compute_global_avg_cost_per_1k(records) - threshold_factor = self._config.inefficiency_threshold_factor - - agent_efficiencies: list[AgentEfficiency] = [] - inefficient_count = 0 - - for agent_id in sorted(by_agent): - agent_records = by_agent[agent_id] - total_cost = round( - math.fsum(r.cost_usd for r in agent_records), - BUDGET_ROUNDING_PRECISION, - ) - total_tokens = sum(r.input_tokens + r.output_tokens for r in agent_records) - cost_per_1k = _compute_cost_per_1k(total_cost, total_tokens) - rating = _rate_efficiency(cost_per_1k, global_avg, threshold_factor) - - if rating == EfficiencyRating.INEFFICIENT: - inefficient_count += 1 - - agent_efficiencies.append( - AgentEfficiency( - agent_id=agent_id, - total_cost_usd=total_cost, - total_tokens=total_tokens, - cost_per_1k_tokens=cost_per_1k, - record_count=len(agent_records), - efficiency_rating=rating, - ), - ) - - # Sort by cost_per_1k descending (most expensive first) - agent_efficiencies.sort( - key=lambda a: a.cost_per_1k_tokens, - reverse=True, - ) - - result = EfficiencyAnalysis( - agents=tuple(agent_efficiencies), - global_avg_cost_per_1k=global_avg, - analysis_period_start=start, - analysis_period_end=end, - inefficient_agent_count=inefficient_count, + result = _build_efficiency_from_records( + records, + start=start, + end=end, + threshold_factor=self._config.inefficiency_threshold_factor, ) logger.info( CFO_EFFICIENCY_ANALYSIS_COMPLETE, - agent_count=len(agent_efficiencies), - inefficient_count=inefficient_count, - global_avg_cost_per_1k=global_avg, + agent_count=len(result.agents), + inefficient_count=result.inefficient_agent_count, + global_avg_cost_per_1k=result.global_avg_cost_per_1k, ) return result @@ -286,18 +257,38 @@ async def recommend_downgrades( Raises: ValueError: If ``start >= end``. """ + if start >= end: + msg = f"start ({start.isoformat()}) must be before end ({end.isoformat()})" + raise ValueError(msg) + if self._model_resolver is None: + logger.warning( + CFO_RESOLVER_MISSING, + reason="no_model_resolver_configured", + ) return DowngradeAnalysis( recommendations=(), - total_estimated_monthly_savings=0.0, + total_estimated_savings_per_1k=0.0, budget_pressure_percent=0.0, ) - efficiency = await self.analyze_efficiency(start=start, end=end) records = await self._cost_tracker.get_records( start=start, end=end, ) + efficiency = _build_efficiency_from_records( + records, + start=start, + end=end, + threshold_factor=self._config.inefficiency_threshold_factor, + ) + + logger.info( + CFO_EFFICIENCY_ANALYSIS_COMPLETE, + agent_count=len(efficiency.agents), + inefficient_count=efficiency.inefficient_agent_count, + global_avg_cost_per_1k=efficiency.global_avg_cost_per_1k, + ) downgrade_map = dict(self._budget_config.auto_downgrade.downgrade_map) budget_pressure = await self._compute_budget_pressure() @@ -332,7 +323,7 @@ async def recommend_downgrades( return DowngradeAnalysis( recommendations=tuple(recommendations), - total_estimated_monthly_savings=round( + total_estimated_savings_per_1k=round( total_savings, BUDGET_ROUNDING_PRECISION, ), @@ -348,8 +339,14 @@ async def evaluate_operation( ) -> ApprovalDecision: """Evaluate whether an operation should proceed. - Checks current budget utilization and determines if the - estimated cost is acceptable. + Evaluates three criteria in order: + + 1. Denies if the current alert level meets or exceeds the + auto-deny threshold (configurable). + 2. Denies if the projected cost would exceed the hard-stop + limit. + 3. Approves with optional warning conditions for high-cost + operations or elevated alert levels. Args: agent_id: Agent requesting the operation. @@ -388,7 +385,6 @@ async def evaluate_operation( auto_deny_level = self._config.approval_auto_deny_alert_level - # Auto-deny if at or above auto-deny alert level if _ALERT_LEVEL_ORDER[alert_level] >= _ALERT_LEVEL_ORDER[auto_deny_level]: logger.warning( CFO_OPERATION_DENIED, @@ -410,7 +406,6 @@ async def evaluate_operation( conditions=(), ) - # Auto-deny if estimated cost would push past hard stop hard_stop_limit = round( cfg.total_monthly * cfg.alerts.hard_stop_at / 100, BUDGET_ROUNDING_PRECISION, @@ -440,7 +435,6 @@ async def evaluate_operation( conditions=(), ) - # Approve with conditions if cost is high conditions: list[str] = [] warn_threshold = self._config.approval_warn_threshold_usd if estimated_cost_usd >= warn_threshold: @@ -492,6 +486,54 @@ async def _compute_budget_pressure(self) -> float: # ── Module-level pure helpers ──────────────────────────────────── +def _build_efficiency_from_records( + records: Sequence[CostRecord], + *, + start: datetime, + end: datetime, + threshold_factor: float, +) -> EfficiencyAnalysis: + """Build an EfficiencyAnalysis from pre-fetched records.""" + by_agent: dict[str, list[CostRecord]] = defaultdict(list) + for r in records: + by_agent[r.agent_id].append(r) + + global_avg = _compute_global_avg_cost_per_1k(records) + + agent_efficiencies: list[AgentEfficiency] = [] + for agent_id in sorted(by_agent): + agent_records = by_agent[agent_id] + total_cost = round( + math.fsum(r.cost_usd for r in agent_records), + BUDGET_ROUNDING_PRECISION, + ) + total_tokens = sum(r.input_tokens + r.output_tokens for r in agent_records) + cost_per_1k = _compute_cost_per_1k(total_cost, total_tokens) + rating = _rate_efficiency(cost_per_1k, global_avg, threshold_factor) + + agent_efficiencies.append( + AgentEfficiency( + agent_id=agent_id, + total_cost_usd=total_cost, + total_tokens=total_tokens, + record_count=len(agent_records), + efficiency_rating=rating, + ), + ) + + agent_efficiencies.sort( + key=lambda a: a.cost_per_1k_tokens, + reverse=True, + ) + + return EfficiencyAnalysis( + agents=tuple(agent_efficiencies), + global_avg_cost_per_1k=global_avg, + analysis_period_start=start, + analysis_period_end=end, + ) + + def _compute_window_costs( records: Sequence[CostRecord], agent_id: str, @@ -505,7 +547,7 @@ def _compute_window_costs( window_cost = math.fsum( r.cost_usd for r in records - if r.agent_id == agent_id and r.timestamp >= ws and r.timestamp < window_end + if r.agent_id == agent_id and ws <= r.timestamp < window_end ) costs.append(round(window_cost, BUDGET_ROUNDING_PRECISION)) return tuple(costs) @@ -524,6 +566,12 @@ def _detect_spike_anomaly( # noqa: PLR0913 Returns ``None`` if no anomaly is detected or insufficient data. """ if len(window_costs) < config.min_anomaly_windows: + logger.debug( + CFO_INSUFFICIENT_WINDOWS, + agent_id=agent_id, + window_count=len(window_costs), + min_required=config.min_anomaly_windows, + ) return None historical = window_costs[:-1] @@ -535,27 +583,26 @@ def _detect_spike_anomaly( # noqa: PLR0913 mean = statistics.mean(historical) if mean == 0.0: - # No historical spending — a spike from zero is always flagged - if current > 0: - return SpendingAnomaly( - agent_id=agent_id, - anomaly_type=AnomalyType.SPIKE, - severity=AnomalySeverity.HIGH, - description=( - f"Agent {agent_id!r} went from $0.00 baseline " - f"to ${current:.2f} in the latest window" - ), - current_value=current, - baseline_value=0.0, - deviation_factor=0.0, - detected_at=now, - period_start=window_starts[-1], - period_end=window_starts[-1] + window_duration, - ) - return None + # No historical spending — spike from zero (current > 0 per guard above) + return SpendingAnomaly( + agent_id=agent_id, + anomaly_type=AnomalyType.SPIKE, + severity=AnomalySeverity.HIGH, + description=( + f"Agent {agent_id!r} went from $0.00 baseline " + f"to ${current:.2f} in the latest window" + ), + current_value=current, + baseline_value=0.0, + deviation_factor=0.0, + detected_at=now, + period_start=window_starts[-1], + period_end=window_starts[-1] + window_duration, + ) # Check spike factor (independent of stddev) - is_spike = current > config.anomaly_spike_factor * mean + spike_ratio = current / mean + is_spike = spike_ratio > config.anomaly_spike_factor # Check sigma threshold stddev = statistics.stdev(historical) if len(historical) > 1 else 0.0 @@ -565,7 +612,12 @@ def _detect_spike_anomaly( # noqa: PLR0913 if not is_spike and not is_sigma_anomaly: return None - severity = _classify_severity(deviation) + # When stddev is zero, use the spike ratio for severity classification + severity = ( + _classify_severity(spike_ratio) + if is_spike and stddev == 0.0 + else _classify_severity(deviation) + ) return SpendingAnomaly( agent_id=agent_id, @@ -610,7 +662,7 @@ def _rate_efficiency( return EfficiencyRating.NORMAL if cost_per_1k > threshold_factor * global_avg: return EfficiencyRating.INEFFICIENT - if cost_per_1k < 0.8 * global_avg: + if cost_per_1k < _EFFICIENCY_LOWER_BOUND * global_avg: return EfficiencyRating.EFFICIENT return EfficiencyRating.NORMAL @@ -648,24 +700,48 @@ def _build_downgrade_recommendation( """Build a downgrade recommendation for a single agent.""" current_resolved = resolver.resolve_safe(current_model) if current_resolved is None: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent_id, + reason="current_model_not_resolved", + model=current_model, + ) return None - # Check downgrade map for known path + # Check downgrade map for known path (alias-based lookup) source_alias = current_resolved.alias target_ref: str | None = None if source_alias is not None: target_ref = downgrade_map.get(source_alias) + else: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent_id, + reason="no_alias_for_downgrade_map", + model=current_model, + ) if target_ref is None: - # Try to find any cheaper model cheaper = _find_cheaper_model(current_resolved.total_cost_per_1k, resolver) if cheaper is None: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent_id, + reason="no_cheaper_model_available", + model=current_model, + ) return None target_ref = cheaper.model_id target_resolved = resolver.resolve_safe(target_ref) if target_resolved is None: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent_id, + reason="target_model_not_resolved", + target=target_ref, + ) return None savings = round( @@ -673,6 +749,13 @@ def _build_downgrade_recommendation( BUDGET_ROUNDING_PRECISION, ) if savings <= 0: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent_id, + reason="no_savings", + current_cost=current_resolved.total_cost_per_1k, + target_cost=target_resolved.total_cost_per_1k, + ) return None return DowngradeRecommendation( @@ -693,7 +776,7 @@ def _find_cheaper_model( current_cost_per_1k: float, resolver: ModelResolver, ) -> ResolvedModel | None: - """Find the cheapest model that costs less than the current one.""" + """Find the overall cheapest available model below current cost.""" all_models = resolver.all_models_sorted_by_cost() for model in all_models: if model.total_cost_per_1k < current_cost_per_1k: diff --git a/src/ai_company/budget/optimizer_models.py b/src/ai_company/budget/optimizer_models.py index 0ce01ba672..81d5fcd32c 100644 --- a/src/ai_company/budget/optimizer_models.py +++ b/src/ai_company/budget/optimizer_models.py @@ -10,16 +10,21 @@ from enum import StrEnum from typing import Self -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator from ai_company.budget.enums import BudgetAlertLevel +from ai_company.constants import BUDGET_ROUNDING_PRECISION from ai_company.core.types import NotBlankStr # noqa: TC001 # ── Enums ───────────────────────────────────────────────────────── class AnomalyType(StrEnum): - """Type of spending anomaly detected.""" + """Type of spending anomaly detected. + + ``SUSTAINED_HIGH`` and ``RATE_INCREASE`` are reserved for future + detection algorithms; only ``SPIKE`` is currently produced. + """ SPIKE = "spike" SUSTAINED_HIGH = "sustained_high" @@ -56,6 +61,7 @@ class SpendingAnomaly(BaseModel): current_value: Spending in the most recent window. baseline_value: Mean spending across historical windows. deviation_factor: How many standard deviations above baseline. + Set to 0.0 when the baseline is zero (no historical spending). detected_at: Timestamp when the anomaly was detected. period_start: Start of the window that triggered the anomaly. period_end: End of the window that triggered the anomaly. @@ -106,7 +112,7 @@ class AnomalyDetectionResult(BaseModel): scan_timestamp: When the scan was performed. """ - model_config = ConfigDict(frozen=True) + model_config = ConfigDict(frozen=True, allow_inf_nan=False) anomalies: tuple[SpendingAnomaly, ...] = Field( default=(), @@ -140,7 +146,7 @@ class AgentEfficiency(BaseModel): agent_id: Agent identifier. total_cost_usd: Total cost in the analysis period. total_tokens: Total tokens consumed (input + output). - cost_per_1k_tokens: Cost per 1000 tokens. + cost_per_1k_tokens: Cost per 1000 tokens (computed). record_count: Number of cost records. efficiency_rating: Efficiency classification. """ @@ -153,15 +159,22 @@ class AgentEfficiency(BaseModel): description="Total cost in the analysis period", ) total_tokens: int = Field(ge=0, description="Total tokens consumed") - cost_per_1k_tokens: float = Field( - ge=0.0, - description="Cost per 1000 tokens", - ) record_count: int = Field(ge=0, description="Number of cost records") efficiency_rating: EfficiencyRating = Field( description="Efficiency classification", ) + @computed_field # type: ignore[prop-decorator] + @property + def cost_per_1k_tokens(self) -> float: + """Cost per 1000 tokens, derived from total_cost and total_tokens.""" + if self.total_tokens == 0: + return 0.0 + return round( + self.total_cost_usd / self.total_tokens * 1000, + BUDGET_ROUNDING_PRECISION, + ) + class EfficiencyAnalysis(BaseModel): """Result of a cost efficiency analysis. @@ -171,7 +184,8 @@ class EfficiencyAnalysis(BaseModel): global_avg_cost_per_1k: Global average cost per 1000 tokens. analysis_period_start: Start of the analysis period. analysis_period_end: End of the analysis period. - inefficient_agent_count: Number of agents rated INEFFICIENT. + inefficient_agent_count: Number of agents rated INEFFICIENT + (computed). """ model_config = ConfigDict(frozen=True, allow_inf_nan=False) @@ -186,10 +200,16 @@ class EfficiencyAnalysis(BaseModel): ) analysis_period_start: datetime = Field(description="Analysis period start") analysis_period_end: datetime = Field(description="Analysis period end") - inefficient_agent_count: int = Field( - ge=0, - description="Number of inefficient agents", - ) + + @computed_field # type: ignore[prop-decorator] + @property + def inefficient_agent_count(self) -> int: + """Number of agents rated INEFFICIENT.""" + return sum( + 1 + for a in self.agents + if a.efficiency_rating == EfficiencyRating.INEFFICIENT + ) @model_validator(mode="after") def _validate_period_ordering(self) -> Self: @@ -227,18 +247,30 @@ class DowngradeRecommendation(BaseModel): description="Recommended cheaper model", ) estimated_savings_per_1k: float = Field( - ge=0.0, + gt=0.0, description="Estimated savings per 1000 tokens", ) reason: NotBlankStr = Field(description="Human-readable explanation") + @model_validator(mode="after") + def _validate_different_models(self) -> Self: + """Ensure current and recommended models differ.""" + if self.current_model == self.recommended_model: + msg = ( + f"current_model and recommended_model must differ, " + f"both are {self.current_model!r}" + ) + raise ValueError(msg) + return self + class DowngradeAnalysis(BaseModel): """Result of a downgrade recommendation analysis. Attributes: recommendations: Per-agent downgrade recommendations. - total_estimated_monthly_savings: Aggregate estimated monthly savings. + total_estimated_savings_per_1k: Aggregate estimated savings per 1000 + tokens across all recommendations. budget_pressure_percent: Current budget utilization percentage. """ @@ -248,9 +280,9 @@ class DowngradeAnalysis(BaseModel): default=(), description="Per-agent downgrade recommendations", ) - total_estimated_monthly_savings: float = Field( + total_estimated_savings_per_1k: float = Field( ge=0.0, - description="Aggregate estimated monthly savings", + description="Aggregate estimated savings per 1000 tokens", ) budget_pressure_percent: float = Field( ge=0.0, @@ -267,7 +299,8 @@ class ApprovalDecision(BaseModel): Attributes: approved: Whether the operation is approved. reason: Explanation for the decision. - budget_remaining_usd: Remaining budget in USD. + budget_remaining_usd: Remaining budget in USD (may be negative + if over budget). budget_used_percent: Percentage of budget consumed. alert_level: Current budget alert level. conditions: Any conditions attached to approval. @@ -278,7 +311,7 @@ class ApprovalDecision(BaseModel): approved: bool = Field(description="Whether the operation is approved") reason: NotBlankStr = Field(description="Explanation for the decision") budget_remaining_usd: float = Field( - description="Remaining budget in USD", + description="Remaining budget in USD (negative when over budget)", ) budget_used_percent: float = Field( ge=0.0, @@ -287,7 +320,7 @@ class ApprovalDecision(BaseModel): alert_level: BudgetAlertLevel = Field( description="Current budget alert level", ) - conditions: tuple[str, ...] = Field( + conditions: tuple[NotBlankStr, ...] = Field( default=(), description="Conditions attached to approval", ) diff --git a/src/ai_company/budget/reports.py b/src/ai_company/budget/reports.py index 43f3cd64de..4f8b25d276 100644 --- a/src/ai_company/budget/reports.py +++ b/src/ai_company/budget/reports.py @@ -10,10 +10,10 @@ import math from collections import defaultdict -from datetime import datetime +from datetime import UTC, datetime from typing import TYPE_CHECKING, Self -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator from ai_company.budget.spending_summary import SpendingSummary # noqa: TC001 from ai_company.constants import BUDGET_ROUNDING_PRECISION @@ -104,9 +104,9 @@ class PeriodComparison(BaseModel): Attributes: current_period_cost: Cost in the current period. previous_period_cost: Cost in the previous period. - cost_change_usd: Absolute change in cost. - cost_change_percent: Percentage change in cost. None when - previous period cost is zero. + cost_change_usd: Absolute change in cost (computed). + cost_change_percent: Percentage change in cost (computed). + None when previous period cost is zero. """ model_config = ConfigDict(frozen=True, allow_inf_nan=False) @@ -119,11 +119,26 @@ class PeriodComparison(BaseModel): ge=0.0, description="Previous period cost", ) - cost_change_usd: float = Field(description="Absolute cost change") - cost_change_percent: float | None = Field( - default=None, - description="Percentage cost change", - ) + + @computed_field # type: ignore[prop-decorator] + @property + def cost_change_usd(self) -> float: + """Absolute cost change (current - previous).""" + return round( + self.current_period_cost - self.previous_period_cost, + BUDGET_ROUNDING_PRECISION, + ) + + @computed_field # type: ignore[prop-decorator] + @property + def cost_change_percent(self) -> float | None: + """Percentage cost change. None when previous period cost is zero.""" + if self.previous_period_cost <= 0: + return None + return round( + self.cost_change_usd / self.previous_period_cost * 100, + BUDGET_ROUNDING_PRECISION, + ) class SpendingReport(BaseModel): @@ -140,7 +155,7 @@ class SpendingReport(BaseModel): generated_at: When the report was generated. """ - model_config = ConfigDict(frozen=True) + model_config = ConfigDict(frozen=True, allow_inf_nan=False) summary: SpendingSummary = Field(description="Overall spending summary") by_task: tuple[TaskSpending, ...] = Field( @@ -234,12 +249,13 @@ async def generate_report( Raises: ValueError: If ``start >= end`` or ``top_n < 1``. """ + if start >= end: + msg = f"start ({start.isoformat()}) must be before end ({end.isoformat()})" + raise ValueError(msg) if top_n < 1: msg = f"top_n must be >= 1, got {top_n}" raise ValueError(msg) - from datetime import UTC # noqa: PLC0415 - now = datetime.now(UTC) summary = await self._cost_tracker.build_summary( @@ -309,7 +325,10 @@ async def _build_period_comparison( if prev_cost == 0.0 and current_cost == 0.0: return None - return _compute_period_comparison(current_cost, prev_cost) + return PeriodComparison( + current_period_cost=current_cost, + previous_period_cost=prev_cost, + ) # ── Module-level pure helpers ──────────────────────────────────── @@ -407,30 +426,6 @@ def _build_model_distribution( return tuple(distributions) -def _compute_period_comparison( - current_cost: float, - previous_cost: float, -) -> PeriodComparison: - """Compute the delta between current and previous period costs.""" - change_usd = round( - current_cost - previous_cost, - BUDGET_ROUNDING_PRECISION, - ) - change_pct: float | None = None - if previous_cost > 0: - change_pct = round( - change_usd / previous_cost * 100, - BUDGET_ROUNDING_PRECISION, - ) - - return PeriodComparison( - current_period_cost=current_cost, - previous_period_cost=previous_cost, - cost_change_usd=change_usd, - cost_change_percent=change_pct, - ) - - def _build_top_agents( summary: SpendingSummary, top_n: int, diff --git a/src/ai_company/budget/tracker.py b/src/ai_company/budget/tracker.py index 6b329159f9..8b467e6716 100644 --- a/src/ai_company/budget/tracker.py +++ b/src/ai_company/budget/tracker.py @@ -4,8 +4,8 @@ aggregation queries consumed by the CFO agent and budget monitoring. Service layer for the cost tracking schema defined in DESIGN_SPEC Section 10.2. -Persistence (SQLite) is deferred to M5; the current implementation is purely -in-memory. +The current implementation is purely in-memory; persistence integration is +planned as part of M5. """ import asyncio @@ -36,12 +36,12 @@ BUDGET_ORCHESTRATION_RATIO_ALERT, BUDGET_ORCHESTRATION_RATIO_QUERIED, BUDGET_RECORD_ADDED, + BUDGET_RECORDS_QUERIED, BUDGET_SUMMARY_BUILT, BUDGET_TIME_RANGE_INVALID, BUDGET_TOTAL_COST_QUERIED, BUDGET_TRACKER_CREATED, ) -from ai_company.observability.events.cfo import CFO_RECORDS_QUERIED if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -209,7 +209,7 @@ async def get_records( """ _validate_time_range(start, end) logger.debug( - CFO_RECORDS_QUERIED, + BUDGET_RECORDS_QUERIED, agent_id=agent_id, task_id=task_id, start=start, diff --git a/src/ai_company/observability/events/budget.py b/src/ai_company/observability/events/budget.py index 389e87bc81..6cfad2a1f3 100644 --- a/src/ai_company/observability/events/budget.py +++ b/src/ai_company/observability/events/budget.py @@ -29,3 +29,5 @@ BUDGET_TIER_RESOLVED: Final[str] = "budget.tier.resolved" BUDGET_TIER_CLASSIFY_MISS: Final[str] = "budget.tier.classify_miss" + +BUDGET_RECORDS_QUERIED: Final[str] = "budget.records.queried" diff --git a/src/ai_company/observability/events/cfo.py b/src/ai_company/observability/events/cfo.py index 1c3912a2a1..6e2bdc4fee 100644 --- a/src/ai_company/observability/events/cfo.py +++ b/src/ai_company/observability/events/cfo.py @@ -7,7 +7,9 @@ CFO_ANOMALY_SCAN_COMPLETE: Final[str] = "cfo.anomaly.scan_complete" CFO_EFFICIENCY_ANALYSIS_COMPLETE: Final[str] = "cfo.efficiency.analysis_complete" CFO_DOWNGRADE_RECOMMENDED: Final[str] = "cfo.downgrade.recommended" +CFO_DOWNGRADE_SKIPPED: Final[str] = "cfo.downgrade.skipped" CFO_APPROVAL_EVALUATED: Final[str] = "cfo.approval.evaluated" CFO_OPERATION_DENIED: Final[str] = "cfo.operation.denied" CFO_REPORT_GENERATED: Final[str] = "cfo.report.generated" -CFO_RECORDS_QUERIED: Final[str] = "cfo.records.queried" +CFO_RESOLVER_MISSING: Final[str] = "cfo.resolver.missing" +CFO_INSUFFICIENT_WINDOWS: Final[str] = "cfo.anomaly.insufficient_windows" diff --git a/tests/unit/budget/test_optimizer.py b/tests/unit/budget/test_optimizer.py index 2af918bd9e..4b219ab624 100644 --- a/tests/unit/budget/test_optimizer.py +++ b/tests/unit/budget/test_optimizer.py @@ -6,7 +6,7 @@ from ai_company.budget.config import BudgetAlertConfig, BudgetConfig from ai_company.budget.enums import BudgetAlertLevel -from ai_company.budget.optimizer import CostOptimizer +from ai_company.budget.optimizer import CostOptimizer, _classify_severity from ai_company.budget.optimizer_models import ( AnomalySeverity, AnomalyType, @@ -217,33 +217,33 @@ async def test_spike_from_zero_baseline(self) -> None: assert anomaly.severity == AnomalySeverity.HIGH assert anomaly.baseline_value == 0.0 - async def test_severity_classification(self) -> None: - """Verify severity levels based on deviation factor.""" + async def test_spike_severity_with_zero_stddev(self) -> None: + """Spike severity uses spike_ratio when stddev is 0.""" optimizer, tracker = _make_optimizer( config=CostOptimizerConfig( - anomaly_sigma_threshold=1.5, - anomaly_spike_factor=10.0, + anomaly_sigma_threshold=2.0, + anomaly_spike_factor=2.0, + min_anomaly_windows=3, ), ) window_duration = (_END - _START) / 5 - # Create varied baseline with small stddev=0.1 - baseline_costs = [1.0, 1.1, 0.9, 1.0] - for i, cost in enumerate(baseline_costs): + # Identical baseline → stddev=0 + for i in range(4): ts = _START + window_duration * i + timedelta(hours=1) await tracker.record( - make_cost_record(agent_id="alice", cost_usd=cost, timestamp=ts), + make_cost_record(agent_id="alice", cost_usd=1.0, timestamp=ts), ) - # Medium spike (2-3 sigma range) + # Spike: 4x baseline → spike_ratio=4.0 → HIGH (>=3.0) ts = _START + window_duration * 4 + timedelta(hours=1) await tracker.record( - make_cost_record(agent_id="alice", cost_usd=1.25, timestamp=ts), + make_cost_record(agent_id="alice", cost_usd=4.0, timestamp=ts), ) - await optimizer.detect_anomalies(start=_START, end=_END) - # With such small deviations, this may or may not trigger - # depending on exact sigma; the key is the test runs without error + result = await optimizer.detect_anomalies(start=_START, end=_END) + assert len(result.anomalies) == 1 + assert result.anomalies[0].severity == AnomalySeverity.HIGH # ── Efficiency Analysis Tests ───────────────────────────────────── @@ -589,3 +589,147 @@ async def test_high_cost_condition(self) -> None: ) assert decision.approved is True assert any("High-cost" in c for c in decision.conditions) + + +# ── _classify_severity Tests ───────────────────────────────────── + + +@pytest.mark.unit +class TestClassifySeverity: + @pytest.mark.parametrize( + ("deviation", "expected"), + [ + (0.0, AnomalySeverity.LOW), + (1.5, AnomalySeverity.LOW), + (1.99, AnomalySeverity.LOW), + (2.0, AnomalySeverity.MEDIUM), + (2.5, AnomalySeverity.MEDIUM), + (2.99, AnomalySeverity.MEDIUM), + (3.0, AnomalySeverity.HIGH), + (5.0, AnomalySeverity.HIGH), + (100.0, AnomalySeverity.HIGH), + ], + ) + def test_thresholds(self, deviation: float, expected: AnomalySeverity) -> None: + assert _classify_severity(deviation) == expected + + +# ── Input Validation Tests ─────────────────────────────────────── + + +@pytest.mark.unit +class TestInputValidation: + async def test_detect_anomalies_start_after_end(self) -> None: + optimizer, _ = _make_optimizer() + with pytest.raises(ValueError, match=r"start .* must be before end"): + await optimizer.detect_anomalies(start=_END, end=_START) + + async def test_analyze_efficiency_start_after_end(self) -> None: + optimizer, _ = _make_optimizer() + with pytest.raises(ValueError, match=r"start .* must be before end"): + await optimizer.analyze_efficiency(start=_END, end=_START) + + async def test_recommend_downgrades_start_after_end(self) -> None: + optimizer, _ = _make_optimizer() + with pytest.raises(ValueError, match=r"start .* must be before end"): + await optimizer.recommend_downgrades(start=_END, end=_START) + + +# ── Edge Case Tests ────────────────────────────────────────────── + + +@pytest.mark.unit +class TestEdgeCases: + async def test_find_cheaper_model_picks_cheapest(self) -> None: + """_find_cheaper_model selects the overall cheapest below current.""" + resolver = _make_resolver() + result = await _make_optimizer(model_resolver=resolver)[0].recommend_downgrades( + start=_START, end=_END + ) + # No records → no recommendations, but validates the path + assert result.recommendations == () + + async def test_budget_pressure_percent_reflects_spending(self) -> None: + """budget_pressure_percent reflects actual spend vs budget.""" + from ai_company.budget.billing import billing_period_start + + resolver = _make_resolver() + bc = BudgetConfig(total_monthly=100.0) + tracker = CostTracker(budget_config=bc) + optimizer = CostOptimizer( + cost_tracker=tracker, + budget_config=bc, + model_resolver=resolver, + ) + # Record in the current billing period so pressure reflects it + now = datetime.now(UTC) + period_start = billing_period_start(bc.reset_day, now=now) + await tracker.record( + make_cost_record( + cost_usd=60.0, + timestamp=period_start + timedelta(hours=1), + ), + ) + # Use a period that covers the data for the efficiency analysis + analysis_start = period_start + analysis_end = now + timedelta(days=1) + result = await optimizer.recommend_downgrades( + start=analysis_start, end=analysis_end + ) + assert result.budget_pressure_percent == 60.0 + + async def test_downgrade_target_not_resolved(self) -> None: + """No recommendation when downgrade target doesn't resolve.""" + from ai_company.budget.config import AutoDowngradeConfig + + resolver = _make_resolver( + [ + ResolvedModel( + provider_name="test-provider", + model_id="test-large-001", + alias="large", + cost_per_1k_input=0.03, + cost_per_1k_output=0.06, + ), + ] + ) + bc = BudgetConfig( + total_monthly=100.0, + auto_downgrade=AutoDowngradeConfig( + enabled=True, + threshold=80, + downgrade_map=(("large", "nonexistent"),), + ), + ) + tracker = CostTracker(budget_config=bc) + optimizer = CostOptimizer( + cost_tracker=tracker, + budget_config=bc, + model_resolver=resolver, + ) + + # Make alice inefficient (only agent, but needs another to set avg) + await tracker.record( + make_cost_record( + agent_id="alice", + model="test-large-001", + cost_usd=10.0, + input_tokens=1000, + output_tokens=0, + timestamp=_START + timedelta(hours=1), + ), + ) + await tracker.record( + make_cost_record( + agent_id="bob", + model="test-large-001", + cost_usd=0.1, + input_tokens=1000, + output_tokens=0, + timestamp=_START + timedelta(hours=1), + ), + ) + + result = await optimizer.recommend_downgrades(start=_START, end=_END) + # Target "nonexistent" can't be resolved → no recommendation + assert result.recommendations == () diff --git a/tests/unit/budget/test_optimizer_models.py b/tests/unit/budget/test_optimizer_models.py index ca1bca81e5..f2489e01e5 100644 --- a/tests/unit/budget/test_optimizer_models.py +++ b/tests/unit/budget/test_optimizer_models.py @@ -145,13 +145,13 @@ def test_construction(self) -> None: agent_id="alice", total_cost_usd=5.0, total_tokens=100000, - cost_per_1k_tokens=0.05, record_count=50, efficiency_rating=EfficiencyRating.NORMAL, ) assert eff.agent_id == "alice" assert eff.total_cost_usd == 5.0 assert eff.efficiency_rating == EfficiencyRating.NORMAL + assert eff.cost_per_1k_tokens == 0.05 @pytest.mark.unit def test_zero_tokens(self) -> None: @@ -159,13 +159,23 @@ def test_zero_tokens(self) -> None: agent_id="alice", total_cost_usd=0.0, total_tokens=0, - cost_per_1k_tokens=0.0, record_count=0, efficiency_rating=EfficiencyRating.NORMAL, ) assert eff.total_tokens == 0 assert eff.cost_per_1k_tokens == 0.0 + @pytest.mark.unit + def test_cost_per_1k_is_computed(self) -> None: + eff = AgentEfficiency( + agent_id="alice", + total_cost_usd=10.0, + total_tokens=5000, + record_count=10, + efficiency_rating=EfficiencyRating.NORMAL, + ) + assert eff.cost_per_1k_tokens == 2.0 + # ── EfficiencyAnalysis Tests ───────────────────────────────────── @@ -178,11 +188,35 @@ def test_empty_analysis(self) -> None: global_avg_cost_per_1k=0.0, analysis_period_start=datetime(2026, 2, 1, tzinfo=UTC), analysis_period_end=datetime(2026, 3, 1, tzinfo=UTC), - inefficient_agent_count=0, ) assert analysis.agents == () assert analysis.inefficient_agent_count == 0 + @pytest.mark.unit + def test_inefficient_count_is_computed(self) -> None: + analysis = EfficiencyAnalysis( + agents=( + AgentEfficiency( + agent_id="alice", + total_cost_usd=10.0, + total_tokens=1000, + record_count=5, + efficiency_rating=EfficiencyRating.INEFFICIENT, + ), + AgentEfficiency( + agent_id="bob", + total_cost_usd=1.0, + total_tokens=1000, + record_count=5, + efficiency_rating=EfficiencyRating.NORMAL, + ), + ), + global_avg_cost_per_1k=5.0, + analysis_period_start=datetime(2026, 2, 1, tzinfo=UTC), + analysis_period_end=datetime(2026, 3, 1, tzinfo=UTC), + ) + assert analysis.inefficient_agent_count == 1 + @pytest.mark.unit def test_period_ordering_invalid(self) -> None: with pytest.raises(ValueError, match="analysis_period_start"): @@ -191,7 +225,6 @@ def test_period_ordering_invalid(self) -> None: global_avg_cost_per_1k=0.0, analysis_period_start=datetime(2026, 3, 1, tzinfo=UTC), analysis_period_end=datetime(2026, 2, 1, tzinfo=UTC), - inefficient_agent_count=0, ) @@ -232,11 +265,11 @@ class TestDowngradeAnalysis: def test_empty_analysis(self) -> None: analysis = DowngradeAnalysis( recommendations=(), - total_estimated_monthly_savings=0.0, + total_estimated_savings_per_1k=0.0, budget_pressure_percent=0.0, ) assert analysis.recommendations == () - assert analysis.total_estimated_monthly_savings == 0.0 + assert analysis.total_estimated_savings_per_1k == 0.0 # ── ApprovalDecision Tests ──────────────────────────────────────── @@ -333,3 +366,30 @@ def test_frozen(self) -> None: config = CostOptimizerConfig() with pytest.raises(Exception): # noqa: B017, PT011 config.anomaly_sigma_threshold = 5.0 # type: ignore[misc] + + +# ── DowngradeRecommendation validator tests ───────────────────── + + +class TestDowngradeRecommendationValidator: + @pytest.mark.unit + def test_same_model_rejected(self) -> None: + with pytest.raises(ValueError, match="must differ"): + DowngradeRecommendation( + agent_id="alice", + current_model="test-large-001", + recommended_model="test-large-001", + estimated_savings_per_1k=0.05, + reason="No actual downgrade", + ) + + @pytest.mark.unit + def test_zero_savings_rejected(self) -> None: + with pytest.raises(ValueError, match="greater than 0"): + DowngradeRecommendation( + agent_id="alice", + current_model="test-large-001", + recommended_model="test-small-001", + estimated_savings_per_1k=0.0, + reason="Zero savings", + ) diff --git a/tests/unit/budget/test_reports.py b/tests/unit/budget/test_reports.py index 4c0172f719..a3ed157273 100644 --- a/tests/unit/budget/test_reports.py +++ b/tests/unit/budget/test_reports.py @@ -10,8 +10,10 @@ PeriodComparison, ProviderDistribution, ReportGenerator, + SpendingReport, TaskSpending, ) +from ai_company.budget.spending_summary import SpendingSummary from ai_company.budget.tracker import CostTracker from tests.unit.budget.conftest import make_cost_record @@ -86,35 +88,38 @@ def test_construction(self) -> None: @pytest.mark.unit class TestPeriodComparison: - def test_construction(self) -> None: + def test_cost_increase(self) -> None: pc = PeriodComparison( current_period_cost=100.0, previous_period_cost=80.0, - cost_change_usd=20.0, - cost_change_percent=25.0, ) assert pc.cost_change_usd == 20.0 assert pc.cost_change_percent == 25.0 - def test_negative_change(self) -> None: + def test_cost_decrease(self) -> None: pc = PeriodComparison( current_period_cost=60.0, previous_period_cost=80.0, - cost_change_usd=-20.0, - cost_change_percent=-25.0, ) assert pc.cost_change_usd == -20.0 assert pc.cost_change_percent == -25.0 - def test_no_previous_data(self) -> None: + def test_no_previous_data_percent_is_none(self) -> None: pc = PeriodComparison( current_period_cost=50.0, previous_period_cost=0.0, - cost_change_usd=50.0, - cost_change_percent=None, ) + assert pc.cost_change_usd == 50.0 assert pc.cost_change_percent is None + def test_equal_periods(self) -> None: + pc = PeriodComparison( + current_period_cost=50.0, + previous_period_cost=50.0, + ) + assert pc.cost_change_usd == 0.0 + assert pc.cost_change_percent == 0.0 + # ── ReportGenerator Tests ───────────────────────────────────────── @@ -335,3 +340,59 @@ async def test_skip_period_comparison(self) -> None: include_period_comparison=False, ) assert report.period_comparison is None + + async def test_start_after_end_rejected(self) -> None: + gen, _ = _make_report_generator() + with pytest.raises(ValueError, match=r"start .* must be before end"): + await gen.generate_report(start=_END, end=_START) + + +# ── SpendingReport Validator Tests ─────────────────────────────── + + +def _make_summary() -> SpendingSummary: + """Build a minimal SpendingSummary for validator tests.""" + from ai_company.budget.spending_summary import PeriodSpending + + return SpendingSummary( + period=PeriodSpending( + start=_START, + end=_END, + total_cost_usd=0.0, + ), + ) + + +@pytest.mark.unit +class TestSpendingReportValidators: + def test_agents_sorted_descending_accepted(self) -> None: + report = SpendingReport( + summary=_make_summary(), + top_agents_by_cost=(("eve", 5.0), ("bob", 3.0), ("alice", 1.0)), + generated_at=_START, + ) + assert len(report.top_agents_by_cost) == 3 + + def test_agents_unsorted_rejected(self) -> None: + with pytest.raises(ValueError, match="top_agents_by_cost must be sorted"): + SpendingReport( + summary=_make_summary(), + top_agents_by_cost=(("alice", 1.0), ("bob", 5.0)), + generated_at=_START, + ) + + def test_tasks_sorted_descending_accepted(self) -> None: + report = SpendingReport( + summary=_make_summary(), + top_tasks_by_cost=(("t2", 8.0), ("t1", 2.0)), + generated_at=_START, + ) + assert len(report.top_tasks_by_cost) == 2 + + def test_tasks_unsorted_rejected(self) -> None: + with pytest.raises(ValueError, match="top_tasks_by_cost must be sorted"): + SpendingReport( + summary=_make_summary(), + top_tasks_by_cost=(("t1", 2.0), ("t2", 8.0)), + generated_at=_START, + ) From 69f06c1b3bc0cc33affd000c79529bb460fef260 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:53:38 +0100 Subject: [PATCH 3/4] fix: address 21 PR review items for CFO optimizer, reports, and models - Add routing optimization feature (#1): new suggest_routing_optimizations() method, RoutingSuggestion and RoutingOptimizationAnalysis models - Add negative estimated_cost_usd validation (#2) - Fix double snapshot in generate_report (#3) - Fix deviation_factor to use spike_ratio when stddev=0 (#4) - Convert DowngradeAnalysis.total_estimated_savings_per_1k to @computed_field (#5) - Change str to NotBlankStr in SpendingReport tuple fields (#6) - Add window_count upper bound validation (#7) - Pre-group records by agent for O(N+M) complexity (#8) - Update DESIGN_SPEC.md implementation snapshot (#9) - Use projected alert level for auto-deny check (#11) - Move approval log after ApprovalDecision construction (#12) - Add ReportGenerator.__init__ debug log + event constant (#13) - Fix _ALERT_LEVEL_ORDER comment (#14) - Fix _classify_severity docstring for dual-use (#15) - Add WARNING logs before ValueError raises (#16) - Update evaluate_operation docstring (#17) - Add sort-order validator to EfficiencyAnalysis.agents (#18) - Add debug log when _find_most_used_model returns None (#19) - Remove redundant stddev > 0 check in is_sigma_anomaly (#20) - Document approval_warn_threshold_usd=0.0 behavior (#21) - Extract helpers to _optimizer_helpers.py to stay under 800-line limit --- DESIGN_SPEC.md | 2 +- src/ai_company/budget/__init__.py | 4 + src/ai_company/budget/_optimizer_helpers.py | 374 ++++++++++++ src/ai_company/budget/optimizer.py | 635 ++++++++------------ src/ai_company/budget/optimizer_models.py | 140 ++++- src/ai_company/budget/reports.py | 22 +- src/ai_company/observability/events/cfo.py | 2 + tests/unit/budget/test_optimizer.py | 171 +++++- tests/unit/budget/test_optimizer_models.py | 189 +++++- 9 files changed, 1141 insertions(+), 398 deletions(-) create mode 100644 src/ai_company/budget/_optimizer_helpers.py diff --git a/DESIGN_SPEC.md b/DESIGN_SPEC.md index ea8e8eb8f7..6ad5968283 100644 --- a/DESIGN_SPEC.md +++ b/DESIGN_SPEC.md @@ -81,7 +81,7 @@ The MVP validates the core hypothesis: **a single agent can complete a real task > **Implementation snapshot (2026-03-09):** > - **Done:** M0–M4 (tooling, config/core, providers, single-agent engine, multi-agent orchestration). Memory layer backend selected ([ADR-001](docs/decisions/ADR-001-memory-layer.md)). Persistence backend (§7.6) completed. -> - **In progress:** M5 — memory interface protocol complete (MemoryBackend, MemoryCapabilities, SharedKnowledgeStore protocols, models, config, factory), budget enforcement complete (BudgetEnforcer + configurable cost tiers + quota/subscription tracking). Memory retrieval pipeline (#41: ranking, token-budget formatting, context injection) in progress. Mem0 adapter backend pending. +> - **In progress:** M5 — memory interface protocol complete (MemoryBackend, MemoryCapabilities, SharedKnowledgeStore protocols, models, config, factory), budget enforcement complete (BudgetEnforcer + configurable cost tiers + quota/subscription tracking), CFO cost optimization complete (CostOptimizer: anomaly detection, efficiency analysis, downgrade recommendations, routing optimization, approval decisions; ReportGenerator: multi-dimensional spending reports). Memory retrieval pipeline (#41: ranking, token-budget formatting, context injection) in progress. Mem0 adapter backend pending. > - **Not started (mostly placeholders):** M6 API/CLI surface, M7 security + approval system. ### 1.5 Configuration Philosophy diff --git a/src/ai_company/budget/__init__.py b/src/ai_company/budget/__init__.py index eac48a7d2c..e73eb24303 100644 --- a/src/ai_company/budget/__init__.py +++ b/src/ai_company/budget/__init__.py @@ -55,6 +55,8 @@ DowngradeRecommendation, EfficiencyAnalysis, EfficiencyRating, + RoutingOptimizationAnalysis, + RoutingSuggestion, SpendingAnomaly, ) from ai_company.budget.quota import ( @@ -139,6 +141,8 @@ "QuotaWindow", "RedundancyRate", "ReportGenerator", + "RoutingOptimizationAnalysis", + "RoutingSuggestion", "SpendingAnomaly", "SpendingReport", "SpendingSummary", diff --git a/src/ai_company/budget/_optimizer_helpers.py b/src/ai_company/budget/_optimizer_helpers.py new file mode 100644 index 0000000000..210d6c72fe --- /dev/null +++ b/src/ai_company/budget/_optimizer_helpers.py @@ -0,0 +1,374 @@ +"""Pure helper functions for the CostOptimizer service. + +Extracted from ``optimizer.py`` to keep both modules under the 800-line +project limit. All functions are module-private (prefixed with ``_``) +and stateless. +""" + +import math +import statistics +from collections import defaultdict +from typing import TYPE_CHECKING + +from ai_company.budget.enums import BudgetAlertLevel +from ai_company.budget.optimizer_models import ( + AgentEfficiency, + AnomalySeverity, + AnomalyType, + CostOptimizerConfig, + DowngradeRecommendation, + EfficiencyAnalysis, + EfficiencyRating, + SpendingAnomaly, +) +from ai_company.constants import BUDGET_ROUNDING_PRECISION +from ai_company.observability import get_logger +from ai_company.observability.events.cfo import ( + CFO_DOWNGRADE_SKIPPED, + CFO_INSUFFICIENT_WINDOWS, +) + +if TYPE_CHECKING: + from collections.abc import Sequence + from datetime import datetime, timedelta + + from ai_company.budget.config import BudgetConfig + from ai_company.budget.cost_record import CostRecord + from ai_company.providers.routing.models import ResolvedModel + from ai_company.providers.routing.resolver import ModelResolver + +logger = get_logger(__name__) + +# Agents spending below this fraction of global average are rated EFFICIENT +_EFFICIENCY_LOWER_BOUND = 0.8 + + +def _build_efficiency_from_records( + records: Sequence[CostRecord], + *, + start: datetime, + end: datetime, + threshold_factor: float, +) -> EfficiencyAnalysis: + """Build an EfficiencyAnalysis from pre-fetched records.""" + by_agent: dict[str, list[CostRecord]] = defaultdict(list) + for r in records: + by_agent[r.agent_id].append(r) + + global_avg = _compute_global_avg_cost_per_1k(records) + + agent_efficiencies: list[AgentEfficiency] = [] + for agent_id in sorted(by_agent): + agent_records = by_agent[agent_id] + total_cost = round( + math.fsum(r.cost_usd for r in agent_records), + BUDGET_ROUNDING_PRECISION, + ) + total_tokens = sum(r.input_tokens + r.output_tokens for r in agent_records) + cost_per_1k = _compute_cost_per_1k(total_cost, total_tokens) + rating = _rate_efficiency(cost_per_1k, global_avg, threshold_factor) + + agent_efficiencies.append( + AgentEfficiency( + agent_id=agent_id, + total_cost_usd=total_cost, + total_tokens=total_tokens, + record_count=len(agent_records), + efficiency_rating=rating, + ), + ) + + agent_efficiencies.sort( + key=lambda a: a.cost_per_1k_tokens, + reverse=True, + ) + + return EfficiencyAnalysis( + agents=tuple(agent_efficiencies), + global_avg_cost_per_1k=global_avg, + analysis_period_start=start, + analysis_period_end=end, + ) + + +def _compute_window_costs( + agent_records: Sequence[CostRecord], + window_starts: tuple[datetime, ...], + window_duration: timedelta, +) -> tuple[float, ...]: + """Compute per-window cost for a single agent's pre-filtered records.""" + costs: list[float] = [] + for ws in window_starts: + window_end = ws + window_duration + window_cost = math.fsum( + r.cost_usd for r in agent_records if ws <= r.timestamp < window_end + ) + costs.append(round(window_cost, BUDGET_ROUNDING_PRECISION)) + return tuple(costs) + + +def _detect_spike_anomaly( # noqa: PLR0913 + agent_id: str, + window_costs: tuple[float, ...], + now: datetime, + window_starts: tuple[datetime, ...], + window_duration: timedelta, + config: CostOptimizerConfig, +) -> SpendingAnomaly | None: + """Detect a spike anomaly for a single agent. + + Returns ``None`` if no anomaly is detected or insufficient data. + """ + if len(window_costs) < config.min_anomaly_windows: + logger.debug( + CFO_INSUFFICIENT_WINDOWS, + agent_id=agent_id, + window_count=len(window_costs), + min_required=config.min_anomaly_windows, + ) + return None + + historical = window_costs[:-1] + current = window_costs[-1] + + if current == 0.0: + return None + + mean = statistics.mean(historical) + + if mean == 0.0: + # No historical spending -- spike from zero (current > 0 per guard) + return SpendingAnomaly( + agent_id=agent_id, + anomaly_type=AnomalyType.SPIKE, + severity=AnomalySeverity.HIGH, + description=( + f"Agent {agent_id!r} went from $0.00 baseline " + f"to ${current:.2f} in the latest window" + ), + current_value=current, + baseline_value=0.0, + deviation_factor=0.0, + detected_at=now, + period_start=window_starts[-1], + period_end=window_starts[-1] + window_duration, + ) + + # Check spike factor (independent of stddev) + spike_ratio = current / mean + is_spike = spike_ratio > config.anomaly_spike_factor + + # Check sigma threshold + stddev = statistics.stdev(historical) if len(historical) > 1 else 0.0 + deviation = (current - mean) / stddev if stddev > 0 else 0.0 + is_sigma_anomaly = deviation > config.anomaly_sigma_threshold + + if not is_spike and not is_sigma_anomaly: + return None + + # When stddev is zero, use the spike ratio for severity classification + classification_value = spike_ratio if is_spike and stddev == 0.0 else deviation + severity = _classify_severity(classification_value) + + # Use spike_ratio as deviation_factor when stddev is zero + effective_deviation = spike_ratio if stddev == 0.0 else deviation + + return SpendingAnomaly( + agent_id=agent_id, + anomaly_type=AnomalyType.SPIKE, + severity=severity, + description=( + f"Agent {agent_id!r} spent ${current:.2f} vs " + f"${mean:.2f} baseline ({effective_deviation:.1f}x)" + ), + current_value=current, + baseline_value=round(mean, BUDGET_ROUNDING_PRECISION), + deviation_factor=round(effective_deviation, BUDGET_ROUNDING_PRECISION), + detected_at=now, + period_start=window_starts[-1], + period_end=window_starts[-1] + window_duration, + ) + + +def _classify_severity(value: float) -> AnomalySeverity: + """Classify anomaly severity from a deviation factor or spike ratio. + + Args: + value: A deviation factor (sigma) or spike ratio used to + determine severity. Thresholds: >= 3.0 -> HIGH, + >= 2.0 -> MEDIUM, else LOW. + """ + if value >= 3.0: # noqa: PLR2004 + return AnomalySeverity.HIGH + if value >= 2.0: # noqa: PLR2004 + return AnomalySeverity.MEDIUM + return AnomalySeverity.LOW + + +def _compute_cost_per_1k(total_cost: float, total_tokens: int) -> float: + """Compute cost per 1000 tokens, returning 0 for zero tokens.""" + if total_tokens == 0: + return 0.0 + return round(total_cost / total_tokens * 1000, BUDGET_ROUNDING_PRECISION) + + +def _rate_efficiency( + cost_per_1k: float, + global_avg: float, + threshold_factor: float, +) -> EfficiencyRating: + """Rate an agent's cost efficiency relative to global average.""" + if global_avg == 0.0: + return EfficiencyRating.NORMAL + if cost_per_1k > threshold_factor * global_avg: + return EfficiencyRating.INEFFICIENT + if cost_per_1k < _EFFICIENCY_LOWER_BOUND * global_avg: + return EfficiencyRating.EFFICIENT + return EfficiencyRating.NORMAL + + +def _compute_global_avg_cost_per_1k( + records: Sequence[CostRecord], +) -> float: + """Compute global average cost per 1000 tokens across all records.""" + total_cost = math.fsum(r.cost_usd for r in records) + total_tokens = sum(r.input_tokens + r.output_tokens for r in records) + return _compute_cost_per_1k(total_cost, total_tokens) + + +def _find_most_used_model( + records: Sequence[CostRecord], + agent_id: str, +) -> str | None: + """Find the model most frequently used by an agent.""" + model_counts: dict[str, int] = defaultdict(int) + for r in records: + if r.agent_id == agent_id: + model_counts[r.model] += 1 + if not model_counts: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent_id, + reason="no_records_for_agent", + ) + return None + return max(model_counts, key=lambda m: model_counts[m]) + + +def _build_downgrade_recommendation( + *, + agent_id: str, + current_model: str, + downgrade_map: dict[str, str], + resolver: ModelResolver, +) -> DowngradeRecommendation | None: + """Build a downgrade recommendation for a single agent.""" + current_resolved = resolver.resolve_safe(current_model) + if current_resolved is None: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent_id, + reason="current_model_not_resolved", + model=current_model, + ) + return None + + # Check downgrade map for known path (alias-based lookup) + source_alias = current_resolved.alias + target_ref: str | None = None + + if source_alias is not None: + target_ref = downgrade_map.get(source_alias) + else: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent_id, + reason="no_alias_for_downgrade_map", + model=current_model, + ) + + if target_ref is None: + cheaper = _find_cheaper_model(current_resolved.total_cost_per_1k, resolver) + if cheaper is None: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent_id, + reason="no_cheaper_model_available", + model=current_model, + ) + return None + target_ref = cheaper.model_id + + target_resolved = resolver.resolve_safe(target_ref) + if target_resolved is None: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent_id, + reason="target_model_not_resolved", + target=target_ref, + ) + return None + + savings = round( + current_resolved.total_cost_per_1k - target_resolved.total_cost_per_1k, + BUDGET_ROUNDING_PRECISION, + ) + if savings <= 0: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent_id, + reason="no_savings", + current_cost=current_resolved.total_cost_per_1k, + target_cost=target_resolved.total_cost_per_1k, + ) + return None + + return DowngradeRecommendation( + agent_id=agent_id, + current_model=current_model, + recommended_model=target_resolved.model_id, + estimated_savings_per_1k=savings, + reason=( + f"Switch from {current_model!r} " + f"(${current_resolved.total_cost_per_1k:.4f}/1k) to " + f"{target_resolved.model_id!r} " + f"(${target_resolved.total_cost_per_1k:.4f}/1k)" + ), + ) + + +def _find_cheaper_model( + current_cost_per_1k: float, + resolver: ModelResolver, +) -> ResolvedModel | None: + """Find the overall cheapest available model below current cost.""" + all_models = resolver.all_models_sorted_by_cost() + for model in all_models: + if model.total_cost_per_1k < current_cost_per_1k: + return model + return None + + +def _compute_alert_level( + used_pct: float, + cfg: BudgetConfig, +) -> BudgetAlertLevel: + """Compute alert level from budget usage percentage.""" + alerts = cfg.alerts + if used_pct >= alerts.hard_stop_at: + return BudgetAlertLevel.HARD_STOP + if used_pct >= alerts.critical_at: + return BudgetAlertLevel.CRITICAL + if used_pct >= alerts.warn_at: + return BudgetAlertLevel.WARNING + return BudgetAlertLevel.NORMAL + + +def _group_records_by_agent( + records: Sequence[CostRecord], +) -> dict[str, list[CostRecord]]: + """Group records by agent_id for efficient per-agent iteration.""" + by_agent: dict[str, list[CostRecord]] = defaultdict(list) + for r in records: + by_agent[r.agent_id].append(r) + return by_agent diff --git a/src/ai_company/budget/optimizer.py b/src/ai_company/budget/optimizer.py index 86f186fe0b..bf7fc68cf5 100644 --- a/src/ai_company/budget/optimizer.py +++ b/src/ai_company/budget/optimizer.py @@ -1,7 +1,8 @@ """CFO cost optimization service. Provides spending anomaly detection, cost efficiency analysis, model -downgrade recommendations, and operation approval decisions. Composes +downgrade recommendations, routing optimization suggestions, and +operation approval decisions. Composes :class:`~ai_company.budget.tracker.CostTracker` and :class:`~ai_company.budget.config.BudgetConfig` for read-only analytical queries — the advisory complement to @@ -10,25 +11,31 @@ Service layer backing the CFO role (DESIGN_SPEC Section 10.3). """ -import math -import statistics -from collections import defaultdict -from datetime import UTC, datetime, timedelta +from datetime import UTC, datetime from typing import TYPE_CHECKING +from ai_company.budget._optimizer_helpers import ( + _build_downgrade_recommendation, + _build_efficiency_from_records, + _classify_severity, + _compute_alert_level, + _compute_window_costs, + _detect_spike_anomaly, + _find_most_used_model, + _group_records_by_agent, +) from ai_company.budget.billing import billing_period_start from ai_company.budget.enums import BudgetAlertLevel from ai_company.budget.optimizer_models import ( - AgentEfficiency, AnomalyDetectionResult, - AnomalySeverity, - AnomalyType, ApprovalDecision, CostOptimizerConfig, DowngradeAnalysis, DowngradeRecommendation, EfficiencyAnalysis, EfficiencyRating, + RoutingOptimizationAnalysis, + RoutingSuggestion, SpendingAnomaly, ) from ai_company.constants import BUDGET_ROUNDING_PRECISION @@ -40,27 +47,20 @@ CFO_DOWNGRADE_RECOMMENDED, CFO_DOWNGRADE_SKIPPED, CFO_EFFICIENCY_ANALYSIS_COMPLETE, - CFO_INSUFFICIENT_WINDOWS, CFO_OPERATION_DENIED, CFO_OPTIMIZER_CREATED, CFO_RESOLVER_MISSING, + CFO_ROUTING_OPTIMIZATION_COMPLETE, ) if TYPE_CHECKING: - from collections.abc import Sequence - from ai_company.budget.config import BudgetConfig - from ai_company.budget.cost_record import CostRecord from ai_company.budget.tracker import CostTracker - from ai_company.providers.routing.models import ResolvedModel from ai_company.providers.routing.resolver import ModelResolver logger = get_logger(__name__) -# Agents spending below this fraction of global average are rated EFFICIENT -_EFFICIENCY_LOWER_BOUND = 0.8 - -# Same ordering as BudgetEnforcer._ALERT_ORDER +# Same ordering as BudgetEnforcer._ALERT_LEVEL_ORDER _ALERT_LEVEL_ORDER: dict[BudgetAlertLevel, int] = { BudgetAlertLevel.NORMAL: 0, BudgetAlertLevel.WARNING: 1, @@ -68,21 +68,25 @@ BudgetAlertLevel.HARD_STOP: 3, } +# Maximum number of time windows for anomaly detection to avoid +# excessive memory/compute from pathological inputs. +_MAX_WINDOW_COUNT = 1000 + class CostOptimizer: """CFO analytical service for cost optimization. Composes CostTracker and BudgetConfig for read-only analysis: anomaly detection, efficiency analysis, downgrade recommendations, - and operation approval evaluation. + routing optimization suggestions, and operation approval evaluation. Args: cost_tracker: Cost tracking service for querying spend. budget_config: Budget configuration for limits and thresholds. config: Optimizer-specific configuration. Defaults to ``CostOptimizerConfig()`` when ``None``. - model_resolver: Optional model resolver for downgrade - recommendations. + model_resolver: Optional model resolver for downgrade and + routing optimization recommendations. """ def __init__( @@ -120,20 +124,40 @@ async def detect_anomalies( start: Inclusive period start. end: Exclusive period end. window_count: Number of time windows to divide the period - into. Must be >= 2. + into. Must be >= 2 and <= 1000. Returns: Anomaly detection result with any detected anomalies. Raises: - ValueError: If ``start >= end`` or ``window_count < 2``. + ValueError: If ``start >= end``, ``window_count < 2``, or + ``window_count > 1000``. """ if start >= end: + logger.warning( + CFO_ANOMALY_SCAN_COMPLETE, + error="start_after_end", + start=start.isoformat(), + end=end.isoformat(), + ) msg = f"start ({start.isoformat()}) must be before end ({end.isoformat()})" raise ValueError(msg) if window_count < 2: # noqa: PLR2004 + logger.warning( + CFO_ANOMALY_SCAN_COMPLETE, + error="window_count_below_minimum", + window_count=window_count, + ) msg = f"window_count must be >= 2, got {window_count}" raise ValueError(msg) + if window_count > _MAX_WINDOW_COUNT: + logger.warning( + CFO_ANOMALY_SCAN_COMPLETE, + error="window_count_above_maximum", + window_count=window_count, + ) + msg = f"window_count must be <= {_MAX_WINDOW_COUNT}, got {window_count}" + raise ValueError(msg) now = datetime.now(UTC) records = await self._cost_tracker.get_records( @@ -145,13 +169,14 @@ async def detect_anomalies( window_duration = total_duration / window_count window_starts = tuple(start + window_duration * i for i in range(window_count)) - agent_ids = sorted({r.agent_id for r in records}) + # Pre-group records by agent for O(N+M) complexity (#8) + by_agent = _group_records_by_agent(records) + agent_ids = sorted(by_agent) anomalies: list[SpendingAnomaly] = [] for agent_id in agent_ids: window_costs = _compute_window_costs( - records, - agent_id, + by_agent[agent_id], window_starts, window_duration, ) @@ -211,6 +236,12 @@ async def analyze_efficiency( ValueError: If ``start >= end``. """ if start >= end: + logger.warning( + CFO_EFFICIENCY_ANALYSIS_COMPLETE, + error="start_after_end", + start=start.isoformat(), + end=end.isoformat(), + ) msg = f"start ({start.isoformat()}) must be before end ({end.isoformat()})" raise ValueError(msg) @@ -258,6 +289,12 @@ async def recommend_downgrades( ValueError: If ``start >= end``. """ if start >= end: + logger.warning( + CFO_DOWNGRADE_RECOMMENDED, + error="start_after_end", + start=start.isoformat(), + end=end.isoformat(), + ) msg = f"start ({start.isoformat()}) must be before end ({end.isoformat()})" raise ValueError(msg) @@ -268,7 +305,6 @@ async def recommend_downgrades( ) return DowngradeAnalysis( recommendations=(), - total_estimated_savings_per_1k=0.0, budget_pressure_percent=0.0, ) @@ -294,7 +330,6 @@ async def recommend_downgrades( budget_pressure = await self._compute_budget_pressure() recommendations: list[DowngradeRecommendation] = [] - total_savings = 0.0 for agent in efficiency.agents: if agent.efficiency_rating != EfficiencyRating.INEFFICIENT: @@ -302,6 +337,11 @@ async def recommend_downgrades( most_used_model = _find_most_used_model(records, agent.agent_id) if most_used_model is None: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent.agent_id, + reason="no_most_used_model", + ) continue recommendation = _build_downgrade_recommendation( @@ -312,7 +352,6 @@ async def recommend_downgrades( ) if recommendation is not None: recommendations.append(recommendation) - total_savings += recommendation.estimated_savings_per_1k logger.info( CFO_DOWNGRADE_RECOMMENDED, agent_id=agent.agent_id, @@ -323,13 +362,126 @@ async def recommend_downgrades( return DowngradeAnalysis( recommendations=tuple(recommendations), - total_estimated_savings_per_1k=round( - total_savings, - BUDGET_ROUNDING_PRECISION, - ), budget_pressure_percent=budget_pressure, ) + async def suggest_routing_optimizations( + self, + *, + start: datetime, + end: datetime, + ) -> RoutingOptimizationAnalysis: + """Suggest routing optimizations based on actual usage patterns. + + Analyzes each agent's most-used model and suggests cheaper + alternatives available through the model resolver, comparing by + cost, context window, and latency. + + Unlike ``recommend_downgrades`` which only targets INEFFICIENT + agents, this method analyzes all agents and suggests cheaper + alternatives regardless of efficiency rating — any agent that + could use a cheaper model is a candidate. + + Args: + start: Inclusive period start. + end: Exclusive period end. + + Returns: + Routing optimization analysis with per-agent suggestions. + Empty when no model_resolver is configured. + + Raises: + ValueError: If ``start >= end``. + """ + if start >= end: + logger.warning( + CFO_ROUTING_OPTIMIZATION_COMPLETE, + error="start_after_end", + start=start.isoformat(), + end=end.isoformat(), + ) + msg = f"start ({start.isoformat()}) must be before end ({end.isoformat()})" + raise ValueError(msg) + + if self._model_resolver is None: + logger.warning( + CFO_RESOLVER_MISSING, + reason="no_model_resolver_configured", + ) + return RoutingOptimizationAnalysis( + suggestions=(), + analysis_period_start=start, + analysis_period_end=end, + agents_analyzed=0, + ) + + records = await self._cost_tracker.get_records( + start=start, + end=end, + ) + + by_agent = _group_records_by_agent(records) + all_models = self._model_resolver.all_models_sorted_by_cost() + suggestions: list[RoutingSuggestion] = [] + + for agent_id in sorted(by_agent): + most_used = _find_most_used_model(records, agent_id) + if most_used is None: + continue + + current_resolved = self._model_resolver.resolve_safe(most_used) + if current_resolved is None: + continue + + # Find cheapest model with sufficient context window + for candidate in all_models: + if candidate.model_id == current_resolved.model_id: + continue + if candidate.total_cost_per_1k >= current_resolved.total_cost_per_1k: + continue + if candidate.max_context < current_resolved.max_context: + continue + + suggestions.append( + RoutingSuggestion( + agent_id=agent_id, + current_model=most_used, + suggested_model=candidate.model_id, + current_cost_per_1k=round( + current_resolved.total_cost_per_1k, + BUDGET_ROUNDING_PRECISION, + ), + suggested_cost_per_1k=round( + candidate.total_cost_per_1k, + BUDGET_ROUNDING_PRECISION, + ), + reason=( + f"Switch from {most_used!r} " + f"(${current_resolved.total_cost_per_1k:.4f}/1k) " + f"to {candidate.model_id!r} " + f"(${candidate.total_cost_per_1k:.4f}/1k) " + f"— same context window, lower cost" + ), + ), + ) + break # Take first (cheapest) match per agent + + result = RoutingOptimizationAnalysis( + suggestions=tuple(suggestions), + analysis_period_start=start, + analysis_period_end=end, + agents_analyzed=len(by_agent), + ) + + logger.info( + CFO_ROUTING_OPTIMIZATION_COMPLETE, + suggestion_count=len(suggestions), + agents_analyzed=len(by_agent), + total_savings_per_1k=result.total_estimated_savings_per_1k, + ) + + return result + async def evaluate_operation( self, *, @@ -341,22 +493,40 @@ async def evaluate_operation( Evaluates three criteria in order: - 1. Denies if the current alert level meets or exceeds the - auto-deny threshold (configurable). - 2. Denies if the projected cost would exceed the hard-stop + 1. Rejects negative ``estimated_cost_usd`` immediately. + 2. Denies if the *projected* alert level (after adding the + estimated cost) meets or exceeds the auto-deny threshold. + 3. Denies if the projected cost would exceed the hard-stop limit. - 3. Approves with optional warning conditions for high-cost + 4. Approves with optional warning conditions for high-cost operations or elevated alert levels. + When ``total_monthly <= 0`` budget enforcement is disabled and + the operation is always approved with no conditions. + Args: agent_id: Agent requesting the operation. - estimated_cost_usd: Estimated cost of the operation. + estimated_cost_usd: Estimated cost of the operation. Must + be >= 0. now: Reference timestamp for billing period computation. Defaults to ``datetime.now(UTC)``. Returns: Approval decision with reasoning. + + Raises: + ValueError: If ``estimated_cost_usd`` is negative. """ + if estimated_cost_usd < 0: + logger.warning( + CFO_OPERATION_DENIED, + agent_id=agent_id, + estimated_cost=estimated_cost_usd, + reason="negative_estimated_cost", + ) + msg = f"estimated_cost_usd must be >= 0, got {estimated_cost_usd}" + raise ValueError(msg) + cfg = self._budget_config if cfg.total_monthly <= 0: @@ -383,20 +553,24 @@ async def evaluate_operation( ) alert_level = _compute_alert_level(used_pct, cfg) + # Use projected alert level (after cost) for auto-deny check (#11) + projected_cost = round( + monthly_cost + estimated_cost_usd, + BUDGET_ROUNDING_PRECISION, + ) + projected_pct = round( + projected_cost / cfg.total_monthly * 100, + BUDGET_ROUNDING_PRECISION, + ) + projected_alert = _compute_alert_level(projected_pct, cfg) + auto_deny_level = self._config.approval_auto_deny_alert_level - if _ALERT_LEVEL_ORDER[alert_level] >= _ALERT_LEVEL_ORDER[auto_deny_level]: - logger.warning( - CFO_OPERATION_DENIED, - agent_id=agent_id, - estimated_cost=estimated_cost_usd, - alert_level=alert_level.value, - reason="alert_level_exceeded", - ) - return ApprovalDecision( + if _ALERT_LEVEL_ORDER[projected_alert] >= _ALERT_LEVEL_ORDER[auto_deny_level]: + decision = ApprovalDecision( approved=False, reason=( - f"Denied: alert level {alert_level.value} " + f"Denied: projected alert level {projected_alert.value} " f"meets or exceeds auto-deny threshold " f"{auto_deny_level.value}" ), @@ -405,25 +579,22 @@ async def evaluate_operation( alert_level=alert_level, conditions=(), ) + logger.warning( + CFO_OPERATION_DENIED, + agent_id=agent_id, + estimated_cost=estimated_cost_usd, + alert_level=alert_level.value, + projected_alert_level=projected_alert.value, + reason="alert_level_exceeded", + ) + return decision hard_stop_limit = round( cfg.total_monthly * cfg.alerts.hard_stop_at / 100, BUDGET_ROUNDING_PRECISION, ) - projected_cost = round( - monthly_cost + estimated_cost_usd, - BUDGET_ROUNDING_PRECISION, - ) if projected_cost >= hard_stop_limit: - logger.warning( - CFO_OPERATION_DENIED, - agent_id=agent_id, - estimated_cost=estimated_cost_usd, - projected_cost=projected_cost, - hard_stop_limit=hard_stop_limit, - reason="would_exceed_hard_stop", - ) - return ApprovalDecision( + decision = ApprovalDecision( approved=False, reason=( f"Denied: projected cost ${projected_cost:.2f} " @@ -434,6 +605,15 @@ async def evaluate_operation( alert_level=alert_level, conditions=(), ) + logger.warning( + CFO_OPERATION_DENIED, + agent_id=agent_id, + estimated_cost=estimated_cost_usd, + projected_cost=projected_cost, + hard_stop_limit=hard_stop_limit, + reason="would_exceed_hard_stop", + ) + return decision conditions: list[str] = [] warn_threshold = self._config.approval_warn_threshold_usd @@ -448,6 +628,15 @@ async def evaluate_operation( f"Budget alert level is {alert_level.value} ({used_pct:.1f}% used)" ) + decision = ApprovalDecision( + approved=True, + reason="Approved", + budget_remaining_usd=remaining, + budget_used_percent=used_pct, + alert_level=alert_level, + conditions=tuple(conditions), + ) + logger.info( CFO_APPROVAL_EVALUATED, agent_id=agent_id, @@ -457,14 +646,7 @@ async def evaluate_operation( conditions_count=len(conditions), ) - return ApprovalDecision( - approved=True, - reason="Approved", - budget_remaining_usd=remaining, - budget_used_percent=used_pct, - alert_level=alert_level, - conditions=tuple(conditions), - ) + return decision # ── Private helpers ────────────────────────────────────────── @@ -483,317 +665,6 @@ async def _compute_budget_pressure(self) -> float: ) -# ── Module-level pure helpers ──────────────────────────────────── - - -def _build_efficiency_from_records( - records: Sequence[CostRecord], - *, - start: datetime, - end: datetime, - threshold_factor: float, -) -> EfficiencyAnalysis: - """Build an EfficiencyAnalysis from pre-fetched records.""" - by_agent: dict[str, list[CostRecord]] = defaultdict(list) - for r in records: - by_agent[r.agent_id].append(r) - - global_avg = _compute_global_avg_cost_per_1k(records) - - agent_efficiencies: list[AgentEfficiency] = [] - for agent_id in sorted(by_agent): - agent_records = by_agent[agent_id] - total_cost = round( - math.fsum(r.cost_usd for r in agent_records), - BUDGET_ROUNDING_PRECISION, - ) - total_tokens = sum(r.input_tokens + r.output_tokens for r in agent_records) - cost_per_1k = _compute_cost_per_1k(total_cost, total_tokens) - rating = _rate_efficiency(cost_per_1k, global_avg, threshold_factor) - - agent_efficiencies.append( - AgentEfficiency( - agent_id=agent_id, - total_cost_usd=total_cost, - total_tokens=total_tokens, - record_count=len(agent_records), - efficiency_rating=rating, - ), - ) - - agent_efficiencies.sort( - key=lambda a: a.cost_per_1k_tokens, - reverse=True, - ) - - return EfficiencyAnalysis( - agents=tuple(agent_efficiencies), - global_avg_cost_per_1k=global_avg, - analysis_period_start=start, - analysis_period_end=end, - ) - - -def _compute_window_costs( - records: Sequence[CostRecord], - agent_id: str, - window_starts: tuple[datetime, ...], - window_duration: timedelta, -) -> tuple[float, ...]: - """Compute per-window cost for a single agent.""" - costs: list[float] = [] - for ws in window_starts: - window_end = ws + window_duration - window_cost = math.fsum( - r.cost_usd - for r in records - if r.agent_id == agent_id and ws <= r.timestamp < window_end - ) - costs.append(round(window_cost, BUDGET_ROUNDING_PRECISION)) - return tuple(costs) - - -def _detect_spike_anomaly( # noqa: PLR0913 - agent_id: str, - window_costs: tuple[float, ...], - now: datetime, - window_starts: tuple[datetime, ...], - window_duration: timedelta, - config: CostOptimizerConfig, -) -> SpendingAnomaly | None: - """Detect a spike anomaly for a single agent. - - Returns ``None`` if no anomaly is detected or insufficient data. - """ - if len(window_costs) < config.min_anomaly_windows: - logger.debug( - CFO_INSUFFICIENT_WINDOWS, - agent_id=agent_id, - window_count=len(window_costs), - min_required=config.min_anomaly_windows, - ) - return None - - historical = window_costs[:-1] - current = window_costs[-1] - - if current == 0.0: - return None - - mean = statistics.mean(historical) - - if mean == 0.0: - # No historical spending — spike from zero (current > 0 per guard above) - return SpendingAnomaly( - agent_id=agent_id, - anomaly_type=AnomalyType.SPIKE, - severity=AnomalySeverity.HIGH, - description=( - f"Agent {agent_id!r} went from $0.00 baseline " - f"to ${current:.2f} in the latest window" - ), - current_value=current, - baseline_value=0.0, - deviation_factor=0.0, - detected_at=now, - period_start=window_starts[-1], - period_end=window_starts[-1] + window_duration, - ) - - # Check spike factor (independent of stddev) - spike_ratio = current / mean - is_spike = spike_ratio > config.anomaly_spike_factor - - # Check sigma threshold - stddev = statistics.stdev(historical) if len(historical) > 1 else 0.0 - deviation = (current - mean) / stddev if stddev > 0 else 0.0 - is_sigma_anomaly = stddev > 0 and deviation > config.anomaly_sigma_threshold - - if not is_spike and not is_sigma_anomaly: - return None - - # When stddev is zero, use the spike ratio for severity classification - severity = ( - _classify_severity(spike_ratio) - if is_spike and stddev == 0.0 - else _classify_severity(deviation) - ) - - return SpendingAnomaly( - agent_id=agent_id, - anomaly_type=AnomalyType.SPIKE, - severity=severity, - description=( - f"Agent {agent_id!r} spent ${current:.2f} vs " - f"${mean:.2f} baseline ({deviation:.1f} sigma)" - ), - current_value=current, - baseline_value=round(mean, BUDGET_ROUNDING_PRECISION), - deviation_factor=round(deviation, BUDGET_ROUNDING_PRECISION), - detected_at=now, - period_start=window_starts[-1], - period_end=window_starts[-1] + window_duration, - ) - - -def _classify_severity(deviation: float) -> AnomalySeverity: - """Classify anomaly severity from deviation factor.""" - if deviation >= 3.0: # noqa: PLR2004 - return AnomalySeverity.HIGH - if deviation >= 2.0: # noqa: PLR2004 - return AnomalySeverity.MEDIUM - return AnomalySeverity.LOW - - -def _compute_cost_per_1k(total_cost: float, total_tokens: int) -> float: - """Compute cost per 1000 tokens, returning 0 for zero tokens.""" - if total_tokens == 0: - return 0.0 - return round(total_cost / total_tokens * 1000, BUDGET_ROUNDING_PRECISION) - - -def _rate_efficiency( - cost_per_1k: float, - global_avg: float, - threshold_factor: float, -) -> EfficiencyRating: - """Rate an agent's cost efficiency relative to global average.""" - if global_avg == 0.0: - return EfficiencyRating.NORMAL - if cost_per_1k > threshold_factor * global_avg: - return EfficiencyRating.INEFFICIENT - if cost_per_1k < _EFFICIENCY_LOWER_BOUND * global_avg: - return EfficiencyRating.EFFICIENT - return EfficiencyRating.NORMAL - - -def _compute_global_avg_cost_per_1k( - records: Sequence[CostRecord], -) -> float: - """Compute global average cost per 1000 tokens across all records.""" - total_cost = math.fsum(r.cost_usd for r in records) - total_tokens = sum(r.input_tokens + r.output_tokens for r in records) - return _compute_cost_per_1k(total_cost, total_tokens) - - -def _find_most_used_model( - records: Sequence[CostRecord], - agent_id: str, -) -> str | None: - """Find the model most frequently used by an agent.""" - model_counts: dict[str, int] = defaultdict(int) - for r in records: - if r.agent_id == agent_id: - model_counts[r.model] += 1 - if not model_counts: - return None - return max(model_counts, key=lambda m: model_counts[m]) - - -def _build_downgrade_recommendation( - *, - agent_id: str, - current_model: str, - downgrade_map: dict[str, str], - resolver: ModelResolver, -) -> DowngradeRecommendation | None: - """Build a downgrade recommendation for a single agent.""" - current_resolved = resolver.resolve_safe(current_model) - if current_resolved is None: - logger.debug( - CFO_DOWNGRADE_SKIPPED, - agent_id=agent_id, - reason="current_model_not_resolved", - model=current_model, - ) - return None - - # Check downgrade map for known path (alias-based lookup) - source_alias = current_resolved.alias - target_ref: str | None = None - - if source_alias is not None: - target_ref = downgrade_map.get(source_alias) - else: - logger.debug( - CFO_DOWNGRADE_SKIPPED, - agent_id=agent_id, - reason="no_alias_for_downgrade_map", - model=current_model, - ) - - if target_ref is None: - cheaper = _find_cheaper_model(current_resolved.total_cost_per_1k, resolver) - if cheaper is None: - logger.debug( - CFO_DOWNGRADE_SKIPPED, - agent_id=agent_id, - reason="no_cheaper_model_available", - model=current_model, - ) - return None - target_ref = cheaper.model_id - - target_resolved = resolver.resolve_safe(target_ref) - if target_resolved is None: - logger.debug( - CFO_DOWNGRADE_SKIPPED, - agent_id=agent_id, - reason="target_model_not_resolved", - target=target_ref, - ) - return None - - savings = round( - current_resolved.total_cost_per_1k - target_resolved.total_cost_per_1k, - BUDGET_ROUNDING_PRECISION, - ) - if savings <= 0: - logger.debug( - CFO_DOWNGRADE_SKIPPED, - agent_id=agent_id, - reason="no_savings", - current_cost=current_resolved.total_cost_per_1k, - target_cost=target_resolved.total_cost_per_1k, - ) - return None - - return DowngradeRecommendation( - agent_id=agent_id, - current_model=current_model, - recommended_model=target_resolved.model_id, - estimated_savings_per_1k=savings, - reason=( - f"Switch from {current_model!r} " - f"(${current_resolved.total_cost_per_1k:.4f}/1k) to " - f"{target_resolved.model_id!r} " - f"(${target_resolved.total_cost_per_1k:.4f}/1k)" - ), - ) - - -def _find_cheaper_model( - current_cost_per_1k: float, - resolver: ModelResolver, -) -> ResolvedModel | None: - """Find the overall cheapest available model below current cost.""" - all_models = resolver.all_models_sorted_by_cost() - for model in all_models: - if model.total_cost_per_1k < current_cost_per_1k: - return model - return None - - -def _compute_alert_level( - used_pct: float, - cfg: BudgetConfig, -) -> BudgetAlertLevel: - """Compute alert level from budget usage percentage.""" - alerts = cfg.alerts - if used_pct >= alerts.hard_stop_at: - return BudgetAlertLevel.HARD_STOP - if used_pct >= alerts.critical_at: - return BudgetAlertLevel.CRITICAL - if used_pct >= alerts.warn_at: - return BudgetAlertLevel.WARNING - return BudgetAlertLevel.NORMAL +# Re-export _classify_severity for backwards compatibility with tests +# that import it directly from optimizer. +__all__ = ["CostOptimizer", "_classify_severity"] diff --git a/src/ai_company/budget/optimizer_models.py b/src/ai_company/budget/optimizer_models.py index 81d5fcd32c..33be4ffb11 100644 --- a/src/ai_company/budget/optimizer_models.py +++ b/src/ai_company/budget/optimizer_models.py @@ -224,6 +224,15 @@ def _validate_period_ordering(self) -> Self: raise ValueError(msg) return self + @model_validator(mode="after") + def _validate_agents_sort_order(self) -> Self: + """Ensure agents are sorted by cost_per_1k_tokens descending.""" + costs = [a.cost_per_1k_tokens for a in self.agents] + if costs != sorted(costs, reverse=True): + msg = "agents must be sorted by cost_per_1k_tokens descending" + raise ValueError(msg) + return self + # ── Downgrade Recommendations ───────────────────────────────────── @@ -270,7 +279,7 @@ class DowngradeAnalysis(BaseModel): Attributes: recommendations: Per-agent downgrade recommendations. total_estimated_savings_per_1k: Aggregate estimated savings per 1000 - tokens across all recommendations. + tokens across all recommendations (computed). budget_pressure_percent: Current budget utilization percentage. """ @@ -280,15 +289,20 @@ class DowngradeAnalysis(BaseModel): default=(), description="Per-agent downgrade recommendations", ) - total_estimated_savings_per_1k: float = Field( - ge=0.0, - description="Aggregate estimated savings per 1000 tokens", - ) budget_pressure_percent: float = Field( ge=0.0, description="Current budget utilization percentage", ) + @computed_field # type: ignore[prop-decorator] + @property + def total_estimated_savings_per_1k(self) -> float: + """Aggregate estimated savings per 1000 tokens.""" + return round( + sum(r.estimated_savings_per_1k for r in self.recommendations), + BUDGET_ROUNDING_PRECISION, + ) + # ── Approval Decision ───────────────────────────────────────────── @@ -342,7 +356,9 @@ class CostOptimizerConfig(BaseModel): approval_auto_deny_alert_level: Alert level at or above which operations are automatically denied. approval_warn_threshold_usd: Cost threshold for adding a - warning condition to approval. + warning condition to approval. When set to ``0.0``, every + approved operation receives a "High-cost operation" condition + (effectively "always warn"). min_anomaly_windows: Minimum number of historical windows required before anomaly detection activates. """ @@ -379,3 +395,115 @@ class CostOptimizerConfig(BaseModel): strict=True, description="Minimum historical windows for anomaly detection", ) + + +# ── Routing Optimization ──────────────────────────────────────── + + +class RoutingSuggestion(BaseModel): + """A routing optimization suggestion for a single agent. + + Suggests switching an agent's most-used model to a cheaper + alternative that provides sufficient context window size. + + Attributes: + agent_id: Agent identifier. + current_model: Currently most-used model identifier. + suggested_model: Suggested cheaper alternative. + current_cost_per_1k: Current model's total cost per 1k tokens. + suggested_cost_per_1k: Suggested model's total cost per 1k tokens. + estimated_savings_per_1k: Estimated savings per 1k tokens (computed). + reason: Human-readable explanation. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + agent_id: NotBlankStr = Field(description="Agent identifier") + current_model: NotBlankStr = Field(description="Current most-used model") + suggested_model: NotBlankStr = Field(description="Suggested cheaper model") + current_cost_per_1k: float = Field( + ge=0.0, + description="Current model total cost per 1k tokens", + ) + suggested_cost_per_1k: float = Field( + ge=0.0, + description="Suggested model total cost per 1k tokens", + ) + reason: NotBlankStr = Field(description="Human-readable explanation") + + @computed_field # type: ignore[prop-decorator] + @property + def estimated_savings_per_1k(self) -> float: + """Estimated savings per 1k tokens.""" + return round( + self.current_cost_per_1k - self.suggested_cost_per_1k, + BUDGET_ROUNDING_PRECISION, + ) + + @model_validator(mode="after") + def _validate_different_models(self) -> Self: + """Ensure current and suggested models differ.""" + if self.current_model == self.suggested_model: + msg = ( + f"current_model and suggested_model must differ, " + f"both are {self.current_model!r}" + ) + raise ValueError(msg) + return self + + @model_validator(mode="after") + def _validate_savings_positive(self) -> Self: + """Ensure suggested model is actually cheaper.""" + if self.suggested_cost_per_1k >= self.current_cost_per_1k: + msg = ( + f"suggested_cost_per_1k ({self.suggested_cost_per_1k}) " + f"must be less than current_cost_per_1k " + f"({self.current_cost_per_1k})" + ) + raise ValueError(msg) + return self + + +class RoutingOptimizationAnalysis(BaseModel): + """Result of a routing optimization analysis. + + Attributes: + suggestions: Per-agent routing optimization suggestions. + total_estimated_savings_per_1k: Aggregate estimated savings per 1k + tokens across all suggestions (computed). + analysis_period_start: Start of the analysis period. + analysis_period_end: End of the analysis period. + agents_analyzed: Number of agents analyzed. + """ + + model_config = ConfigDict(frozen=True, allow_inf_nan=False) + + suggestions: tuple[RoutingSuggestion, ...] = Field( + default=(), + description="Per-agent routing optimization suggestions", + ) + analysis_period_start: datetime = Field(description="Analysis period start") + analysis_period_end: datetime = Field(description="Analysis period end") + agents_analyzed: int = Field(ge=0, description="Number of agents analyzed") + + @computed_field # type: ignore[prop-decorator] + @property + def total_estimated_savings_per_1k(self) -> float: + """Aggregate estimated savings per 1k tokens.""" + return round( + sum(s.estimated_savings_per_1k for s in self.suggestions), + BUDGET_ROUNDING_PRECISION, + ) + + @model_validator(mode="after") + def _validate_period_ordering(self) -> Self: + """Ensure analysis_period_start is before analysis_period_end.""" + if self.analysis_period_start >= self.analysis_period_end: + msg = ( + f"analysis_period_start " + f"({self.analysis_period_start.isoformat()}) " + f"must be before analysis_period_end " + f"({self.analysis_period_end.isoformat()})" + ) + raise ValueError(msg) + return self diff --git a/src/ai_company/budget/reports.py b/src/ai_company/budget/reports.py index 4f8b25d276..661edbdd61 100644 --- a/src/ai_company/budget/reports.py +++ b/src/ai_company/budget/reports.py @@ -19,7 +19,10 @@ from ai_company.constants import BUDGET_ROUNDING_PRECISION from ai_company.core.types import NotBlankStr # noqa: TC001 from ai_company.observability import get_logger -from ai_company.observability.events.cfo import CFO_REPORT_GENERATED +from ai_company.observability.events.cfo import ( + CFO_REPORT_GENERATED, + CFO_REPORT_GENERATOR_CREATED, +) if TYPE_CHECKING: from collections.abc import Sequence @@ -174,11 +177,11 @@ class SpendingReport(BaseModel): default=None, description="Comparison with previous period", ) - top_agents_by_cost: tuple[tuple[str, float], ...] = Field( + top_agents_by_cost: tuple[tuple[NotBlankStr, float], ...] = Field( default=(), description="Top agents by cost (agent_id, cost_usd)", ) - top_tasks_by_cost: tuple[tuple[str, float], ...] = Field( + top_tasks_by_cost: tuple[tuple[NotBlankStr, float], ...] = Field( default=(), description="Top tasks by cost (task_id, cost_usd)", ) @@ -225,6 +228,10 @@ def __init__( ) -> None: self._cost_tracker = cost_tracker self._budget_config = budget_config + logger.debug( + CFO_REPORT_GENERATOR_CREATED, + has_budget_config=True, + ) async def generate_report( self, @@ -236,6 +243,10 @@ async def generate_report( ) -> SpendingReport: """Generate a spending report for the given period. + Uses a single ``get_records`` snapshot and derives the summary + from the same data to avoid race conditions between separate + ``build_summary`` and ``get_records`` calls. + Args: start: Inclusive period start. end: Exclusive period end. @@ -258,11 +269,12 @@ async def generate_report( now = datetime.now(UTC) - summary = await self._cost_tracker.build_summary( + # Single snapshot to avoid double-fetch race condition (#3) + records = await self._cost_tracker.get_records( start=start, end=end, ) - records = await self._cost_tracker.get_records( + summary = await self._cost_tracker.build_summary( start=start, end=end, ) diff --git a/src/ai_company/observability/events/cfo.py b/src/ai_company/observability/events/cfo.py index 6e2bdc4fee..8dd33f268c 100644 --- a/src/ai_company/observability/events/cfo.py +++ b/src/ai_company/observability/events/cfo.py @@ -11,5 +11,7 @@ CFO_APPROVAL_EVALUATED: Final[str] = "cfo.approval.evaluated" CFO_OPERATION_DENIED: Final[str] = "cfo.operation.denied" CFO_REPORT_GENERATED: Final[str] = "cfo.report.generated" +CFO_REPORT_GENERATOR_CREATED: Final[str] = "cfo.report_generator.created" CFO_RESOLVER_MISSING: Final[str] = "cfo.resolver.missing" CFO_INSUFFICIENT_WINDOWS: Final[str] = "cfo.anomaly.insufficient_windows" +CFO_ROUTING_OPTIMIZATION_COMPLETE: Final[str] = "cfo.routing.optimization_complete" diff --git a/tests/unit/budget/test_optimizer.py b/tests/unit/budget/test_optimizer.py index 4b219ab624..9d9b293371 100644 --- a/tests/unit/budget/test_optimizer.py +++ b/tests/unit/budget/test_optimizer.py @@ -4,9 +4,10 @@ import pytest +from ai_company.budget._optimizer_helpers import _classify_severity from ai_company.budget.config import BudgetAlertConfig, BudgetConfig from ai_company.budget.enums import BudgetAlertLevel -from ai_company.budget.optimizer import CostOptimizer, _classify_severity +from ai_company.budget.optimizer import CostOptimizer from ai_company.budget.optimizer_models import ( AnomalySeverity, AnomalyType, @@ -508,7 +509,7 @@ async def test_would_exceed_budget_denied(self) -> None: ) optimizer, tracker = _make_optimizer(budget_config=bc) - # Spend 95% and request 10 more + # Spend 95% and request 10 more → projected 105% → HARD_STOP await tracker.record( make_cost_record(cost_usd=95.0, timestamp=_START + timedelta(hours=1)), ) @@ -519,7 +520,8 @@ async def test_would_exceed_budget_denied(self) -> None: now=_START + timedelta(days=15), ) assert decision.approved is False - assert "would exceed" in decision.reason + # With projected alert level, this now triggers auto-deny + assert "denied" in decision.reason.lower() async def test_warning_level_approved_with_conditions(self) -> None: bc = BudgetConfig( @@ -733,3 +735,166 @@ async def test_downgrade_target_not_resolved(self) -> None: result = await optimizer.recommend_downgrades(start=_START, end=_END) # Target "nonexistent" can't be resolved → no recommendation assert result.recommendations == () + + async def test_negative_estimated_cost_rejected(self) -> None: + """Negative estimated_cost_usd raises ValueError.""" + optimizer, _ = _make_optimizer() + with pytest.raises(ValueError, match="estimated_cost_usd must be >= 0"): + await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=-1.0, + ) + + async def test_window_count_upper_bound(self) -> None: + """window_count > 1000 raises ValueError.""" + optimizer, _ = _make_optimizer() + with pytest.raises(ValueError, match="window_count must be <= 1000"): + await optimizer.detect_anomalies( + start=_START, + end=_END, + window_count=1001, + ) + + async def test_projected_alert_level_used_for_auto_deny(self) -> None: + """Auto-deny uses projected alert level, not current.""" + bc = BudgetConfig( + total_monthly=100.0, + alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), + ) + config = CostOptimizerConfig( + approval_auto_deny_alert_level=BudgetAlertLevel.HARD_STOP, + ) + optimizer, tracker = _make_optimizer(budget_config=bc, config=config) + + # Spend 95% — current alert is CRITICAL, but requesting 10 + # would push to 105% → projected HARD_STOP → denied + await tracker.record( + make_cost_record(cost_usd=95.0, timestamp=_START + timedelta(hours=1)), + ) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=10.0, + now=_START + timedelta(days=15), + ) + assert decision.approved is False + assert "projected" in decision.reason.lower() + + +# ── Routing Optimization Tests ────────────────────────────────── + + +@pytest.mark.unit +class TestSuggestRoutingOptimizations: + async def test_no_resolver_empty_result(self) -> None: + optimizer, _ = _make_optimizer() + result = await optimizer.suggest_routing_optimizations( + start=_START, + end=_END, + ) + assert result.suggestions == () + assert result.agents_analyzed == 0 + + async def test_no_records_empty_suggestions(self) -> None: + resolver = _make_resolver() + optimizer, _ = _make_optimizer(model_resolver=resolver) + result = await optimizer.suggest_routing_optimizations( + start=_START, + end=_END, + ) + assert result.suggestions == () + assert result.agents_analyzed == 0 + + async def test_suggests_cheaper_model(self) -> None: + resolver = _make_resolver() + optimizer, tracker = _make_optimizer(model_resolver=resolver) + + # Alice uses the expensive large model + await tracker.record( + make_cost_record( + agent_id="alice", + model="test-large-001", + cost_usd=5.0, + input_tokens=1000, + output_tokens=500, + timestamp=_START + timedelta(hours=1), + ), + ) + + result = await optimizer.suggest_routing_optimizations( + start=_START, + end=_END, + ) + assert len(result.suggestions) == 1 + suggestion = result.suggestions[0] + assert suggestion.agent_id == "alice" + assert suggestion.current_model == "test-large-001" + assert suggestion.estimated_savings_per_1k > 0 + assert result.total_estimated_savings_per_1k > 0 + + async def test_no_suggestion_for_cheapest_model(self) -> None: + resolver = _make_resolver() + optimizer, tracker = _make_optimizer(model_resolver=resolver) + + # Alice already uses the cheapest model + await tracker.record( + make_cost_record( + agent_id="alice", + model="test-small-001", + cost_usd=0.1, + input_tokens=1000, + output_tokens=500, + timestamp=_START + timedelta(hours=1), + ), + ) + + result = await optimizer.suggest_routing_optimizations( + start=_START, + end=_END, + ) + assert result.suggestions == () + assert result.agents_analyzed == 1 + + async def test_start_after_end_rejected(self) -> None: + optimizer, _ = _make_optimizer() + with pytest.raises(ValueError, match=r"start .* must be before end"): + await optimizer.suggest_routing_optimizations(start=_END, end=_START) + + async def test_context_window_respected(self) -> None: + """Suggestions only include models with sufficient context window.""" + models = [ + ResolvedModel( + provider_name="test-provider", + model_id="test-large-001", + alias="large", + cost_per_1k_input=0.03, + cost_per_1k_output=0.06, + max_context=200000, + ), + ResolvedModel( + provider_name="test-provider", + model_id="test-small-001", + alias="small", + cost_per_1k_input=0.001, + cost_per_1k_output=0.002, + max_context=50000, # Smaller context than large + ), + ] + resolver = _make_resolver(models) + optimizer, tracker = _make_optimizer(model_resolver=resolver) + + await tracker.record( + make_cost_record( + agent_id="alice", + model="test-large-001", + cost_usd=5.0, + timestamp=_START + timedelta(hours=1), + ), + ) + + result = await optimizer.suggest_routing_optimizations( + start=_START, + end=_END, + ) + # small has insufficient context window → no suggestion + assert result.suggestions == () diff --git a/tests/unit/budget/test_optimizer_models.py b/tests/unit/budget/test_optimizer_models.py index f2489e01e5..77a5aa28ea 100644 --- a/tests/unit/budget/test_optimizer_models.py +++ b/tests/unit/budget/test_optimizer_models.py @@ -16,6 +16,8 @@ DowngradeRecommendation, EfficiencyAnalysis, EfficiencyRating, + RoutingOptimizationAnalysis, + RoutingSuggestion, SpendingAnomaly, ) @@ -265,12 +267,34 @@ class TestDowngradeAnalysis: def test_empty_analysis(self) -> None: analysis = DowngradeAnalysis( recommendations=(), - total_estimated_savings_per_1k=0.0, budget_pressure_percent=0.0, ) assert analysis.recommendations == () assert analysis.total_estimated_savings_per_1k == 0.0 + @pytest.mark.unit + def test_total_savings_is_computed(self) -> None: + analysis = DowngradeAnalysis( + recommendations=( + DowngradeRecommendation( + agent_id="alice", + current_model="test-large-001", + recommended_model="test-small-001", + estimated_savings_per_1k=0.05, + reason="Switch to cheaper model", + ), + DowngradeRecommendation( + agent_id="bob", + current_model="test-large-001", + recommended_model="test-small-001", + estimated_savings_per_1k=0.03, + reason="Switch to cheaper model", + ), + ), + budget_pressure_percent=50.0, + ) + assert analysis.total_estimated_savings_per_1k == 0.08 + # ── ApprovalDecision Tests ──────────────────────────────────────── @@ -393,3 +417,166 @@ def test_zero_savings_rejected(self) -> None: estimated_savings_per_1k=0.0, reason="Zero savings", ) + + +# ── EfficiencyAnalysis sort-order validator tests ──────────────── + + +class TestEfficiencyAnalysisSortOrder: + @pytest.mark.unit + def test_sorted_agents_accepted(self) -> None: + analysis = EfficiencyAnalysis( + agents=( + AgentEfficiency( + agent_id="bob", + total_cost_usd=10.0, + total_tokens=1000, + record_count=5, + efficiency_rating=EfficiencyRating.INEFFICIENT, + ), + AgentEfficiency( + agent_id="alice", + total_cost_usd=1.0, + total_tokens=1000, + record_count=5, + efficiency_rating=EfficiencyRating.NORMAL, + ), + ), + global_avg_cost_per_1k=5.0, + analysis_period_start=datetime(2026, 2, 1, tzinfo=UTC), + analysis_period_end=datetime(2026, 3, 1, tzinfo=UTC), + ) + assert len(analysis.agents) == 2 + + @pytest.mark.unit + def test_unsorted_agents_rejected(self) -> None: + with pytest.raises(ValueError, match="agents must be sorted"): + EfficiencyAnalysis( + agents=( + AgentEfficiency( + agent_id="alice", + total_cost_usd=1.0, + total_tokens=1000, + record_count=5, + efficiency_rating=EfficiencyRating.NORMAL, + ), + AgentEfficiency( + agent_id="bob", + total_cost_usd=10.0, + total_tokens=1000, + record_count=5, + efficiency_rating=EfficiencyRating.INEFFICIENT, + ), + ), + global_avg_cost_per_1k=5.0, + analysis_period_start=datetime(2026, 2, 1, tzinfo=UTC), + analysis_period_end=datetime(2026, 3, 1, tzinfo=UTC), + ) + + +# ── RoutingSuggestion Tests ────────────────────────────────────── + + +class TestRoutingSuggestion: + @pytest.mark.unit + def test_construction(self) -> None: + suggestion = RoutingSuggestion( + agent_id="alice", + current_model="test-large-001", + suggested_model="test-small-001", + current_cost_per_1k=0.09, + suggested_cost_per_1k=0.003, + reason="Switch to cheaper model", + ) + assert suggestion.agent_id == "alice" + assert suggestion.estimated_savings_per_1k == 0.087 + + @pytest.mark.unit + def test_frozen(self) -> None: + suggestion = RoutingSuggestion( + agent_id="alice", + current_model="test-large-001", + suggested_model="test-small-001", + current_cost_per_1k=0.09, + suggested_cost_per_1k=0.003, + reason="Switch to cheaper model", + ) + with pytest.raises(Exception): # noqa: B017, PT011 + suggestion.agent_id = "bob" # type: ignore[misc] + + @pytest.mark.unit + def test_same_model_rejected(self) -> None: + with pytest.raises(ValueError, match="must differ"): + RoutingSuggestion( + agent_id="alice", + current_model="test-large-001", + suggested_model="test-large-001", + current_cost_per_1k=0.09, + suggested_cost_per_1k=0.003, + reason="No actual suggestion", + ) + + @pytest.mark.unit + def test_no_savings_rejected(self) -> None: + with pytest.raises(ValueError, match="must be less than"): + RoutingSuggestion( + agent_id="alice", + current_model="test-large-001", + suggested_model="test-small-001", + current_cost_per_1k=0.003, + suggested_cost_per_1k=0.09, + reason="More expensive", + ) + + +# ── RoutingOptimizationAnalysis Tests ──────────────────────────── + + +class TestRoutingOptimizationAnalysis: + @pytest.mark.unit + def test_empty_analysis(self) -> None: + analysis = RoutingOptimizationAnalysis( + suggestions=(), + analysis_period_start=datetime(2026, 2, 1, tzinfo=UTC), + analysis_period_end=datetime(2026, 3, 1, tzinfo=UTC), + agents_analyzed=0, + ) + assert analysis.suggestions == () + assert analysis.total_estimated_savings_per_1k == 0.0 + + @pytest.mark.unit + def test_total_savings_is_computed(self) -> None: + analysis = RoutingOptimizationAnalysis( + suggestions=( + RoutingSuggestion( + agent_id="alice", + current_model="test-large-001", + suggested_model="test-small-001", + current_cost_per_1k=0.09, + suggested_cost_per_1k=0.003, + reason="Switch to cheaper", + ), + RoutingSuggestion( + agent_id="bob", + current_model="test-medium-001", + suggested_model="test-small-001", + current_cost_per_1k=0.03, + suggested_cost_per_1k=0.003, + reason="Switch to cheaper", + ), + ), + analysis_period_start=datetime(2026, 2, 1, tzinfo=UTC), + analysis_period_end=datetime(2026, 3, 1, tzinfo=UTC), + agents_analyzed=2, + ) + assert analysis.total_estimated_savings_per_1k == 0.114 + + @pytest.mark.unit + def test_period_ordering_invalid(self) -> None: + with pytest.raises(ValueError, match="analysis_period_start"): + RoutingOptimizationAnalysis( + suggestions=(), + analysis_period_start=datetime(2026, 3, 1, tzinfo=UTC), + analysis_period_end=datetime(2026, 2, 1, tzinfo=UTC), + agents_analyzed=0, + ) From f909c79f03836f00b342a333d8c91dddf381b7b0 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Mon, 9 Mar 2026 16:20:37 +0100 Subject: [PATCH 4/4] fix: address 13 round-2 PR review findings for CFO optimizer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - (A) _find_most_used_model accepts pre-filtered agent records - (B) _find_cheaper_model respects min_context for context window - (C) recommend_downgrades returns real budget_pressure when no resolver - (D) evaluate_operation uses projected_alert for conditions - (E) reports.py logs WARNING before validation ValueErrors - (F) suggest_routing_optimizations docstring no longer claims latency - (G) generate_report derives total_cost from records for consistency - (H) evaluate_operation split into _check_denial/_build_approval_conditions; recommend_downgrades/suggest_routing_optimizations loops extracted - (I) recommend_downgrades parallelizes get_records + budget_pressure - (J) test_optimizer.py split into 3 files (analysis, decisions) - (K) DESIGN_SPEC §10.3 mentions routing optimization - (L) _find_cheaper_model tests exercise actual code path + min_context --- DESIGN_SPEC.md | 9 +- src/ai_company/budget/_optimizer_helpers.py | 30 +- src/ai_company/budget/optimizer.py | 335 ++++--- src/ai_company/budget/reports.py | 25 +- src/ai_company/observability/events/cfo.py | 1 + tests/unit/budget/conftest.py | 62 ++ tests/unit/budget/test_optimizer.py | 852 +----------------- tests/unit/budget/test_optimizer_analysis.py | 283 ++++++ tests/unit/budget/test_optimizer_decisions.py | 540 +++++++++++ 9 files changed, 1145 insertions(+), 992 deletions(-) create mode 100644 tests/unit/budget/test_optimizer_analysis.py create mode 100644 tests/unit/budget/test_optimizer_decisions.py diff --git a/DESIGN_SPEC.md b/DESIGN_SPEC.md index 6ad5968283..989beb7d38 100644 --- a/DESIGN_SPEC.md +++ b/DESIGN_SPEC.md @@ -1847,10 +1847,11 @@ The CFO agent (when enabled) acts as a cost management system: > **Implementation note (M5):** `CostOptimizer` service (`budget/optimizer.py`) > implements anomaly detection (sigma + spike factor), per-agent efficiency -> analysis, model downgrade recommendations (via `ModelResolver`), and -> operation approval evaluation. `ReportGenerator` service -> (`budget/reports.py`) produces multi-dimensional spending reports with -> task/provider/model breakdowns and period-over-period comparison. +> analysis, model downgrade recommendations (via `ModelResolver`), routing +> optimization suggestions (cost + context-window comparison), and operation +> approval evaluation. `ReportGenerator` service (`budget/reports.py`) +> produces multi-dimensional spending reports with task/provider/model +> breakdowns and period-over-period comparison. ### 10.4 Cost Controls diff --git a/src/ai_company/budget/_optimizer_helpers.py b/src/ai_company/budget/_optimizer_helpers.py index 210d6c72fe..4600af3bee 100644 --- a/src/ai_company/budget/_optimizer_helpers.py +++ b/src/ai_company/budget/_optimizer_helpers.py @@ -237,20 +237,13 @@ def _compute_global_avg_cost_per_1k( def _find_most_used_model( - records: Sequence[CostRecord], - agent_id: str, + agent_records: Sequence[CostRecord], ) -> str | None: - """Find the model most frequently used by an agent.""" + """Find the most frequently used model from pre-filtered records.""" model_counts: dict[str, int] = defaultdict(int) - for r in records: - if r.agent_id == agent_id: - model_counts[r.model] += 1 + for r in agent_records: + model_counts[r.model] += 1 if not model_counts: - logger.debug( - CFO_DOWNGRADE_SKIPPED, - agent_id=agent_id, - reason="no_records_for_agent", - ) return None return max(model_counts, key=lambda m: model_counts[m]) @@ -288,7 +281,11 @@ def _build_downgrade_recommendation( ) if target_ref is None: - cheaper = _find_cheaper_model(current_resolved.total_cost_per_1k, resolver) + cheaper = _find_cheaper_model( + current_resolved.total_cost_per_1k, + resolver, + min_context=current_resolved.max_context, + ) if cheaper is None: logger.debug( CFO_DOWNGRADE_SKIPPED, @@ -340,11 +337,16 @@ def _build_downgrade_recommendation( def _find_cheaper_model( current_cost_per_1k: float, resolver: ModelResolver, + *, + min_context: int = 0, ) -> ResolvedModel | None: - """Find the overall cheapest available model below current cost.""" + """Find the cheapest model below current cost with sufficient context.""" all_models = resolver.all_models_sorted_by_cost() for model in all_models: - if model.total_cost_per_1k < current_cost_per_1k: + if ( + model.total_cost_per_1k < current_cost_per_1k + and model.max_context >= min_context + ): return model return None diff --git a/src/ai_company/budget/optimizer.py b/src/ai_company/budget/optimizer.py index bf7fc68cf5..fd60502b3d 100644 --- a/src/ai_company/budget/optimizer.py +++ b/src/ai_company/budget/optimizer.py @@ -11,6 +11,7 @@ Service layer backing the CFO role (DESIGN_SPEC Section 10.3). """ +import asyncio from datetime import UTC, datetime from typing import TYPE_CHECKING @@ -55,7 +56,9 @@ if TYPE_CHECKING: from ai_company.budget.config import BudgetConfig + from ai_company.budget.cost_record import CostRecord from ai_company.budget.tracker import CostTracker + from ai_company.providers.routing.models import ResolvedModel from ai_company.providers.routing.resolver import ModelResolver logger = get_logger(__name__) @@ -303,15 +306,21 @@ async def recommend_downgrades( CFO_RESOLVER_MISSING, reason="no_model_resolver_configured", ) + budget_pressure = await self._compute_budget_pressure() return DowngradeAnalysis( recommendations=(), - budget_pressure_percent=0.0, + budget_pressure_percent=budget_pressure, ) - records = await self._cost_tracker.get_records( - start=start, - end=end, - ) + async with asyncio.TaskGroup() as tg: + records_task = tg.create_task( + self._cost_tracker.get_records(start=start, end=end), + ) + pressure_task = tg.create_task(self._compute_budget_pressure()) + + records = records_task.result() + budget_pressure = pressure_task.result() + efficiency = _build_efficiency_from_records( records, start=start, @@ -326,39 +335,11 @@ async def recommend_downgrades( global_avg_cost_per_1k=efficiency.global_avg_cost_per_1k, ) - downgrade_map = dict(self._budget_config.auto_downgrade.downgrade_map) - budget_pressure = await self._compute_budget_pressure() - - recommendations: list[DowngradeRecommendation] = [] - - for agent in efficiency.agents: - if agent.efficiency_rating != EfficiencyRating.INEFFICIENT: - continue - - most_used_model = _find_most_used_model(records, agent.agent_id) - if most_used_model is None: - logger.debug( - CFO_DOWNGRADE_SKIPPED, - agent_id=agent.agent_id, - reason="no_most_used_model", - ) - continue - - recommendation = _build_downgrade_recommendation( - agent_id=agent.agent_id, - current_model=most_used_model, - downgrade_map=downgrade_map, - resolver=self._model_resolver, - ) - if recommendation is not None: - recommendations.append(recommendation) - logger.info( - CFO_DOWNGRADE_RECOMMENDED, - agent_id=agent.agent_id, - current_model=most_used_model, - recommended_model=recommendation.recommended_model, - estimated_savings=recommendation.estimated_savings_per_1k, - ) + by_agent = _group_records_by_agent(records) + recommendations = self._build_recommendations( + efficiency=efficiency, + by_agent=by_agent, + ) return DowngradeAnalysis( recommendations=tuple(recommendations), @@ -375,7 +356,7 @@ async def suggest_routing_optimizations( Analyzes each agent's most-used model and suggests cheaper alternatives available through the model resolver, comparing by - cost, context window, and latency. + cost and context window size. Unlike ``recommend_downgrades`` which only targets INEFFICIENT agents, this method analyzes all agents and suggests cheaper @@ -422,49 +403,7 @@ async def suggest_routing_optimizations( by_agent = _group_records_by_agent(records) all_models = self._model_resolver.all_models_sorted_by_cost() - suggestions: list[RoutingSuggestion] = [] - - for agent_id in sorted(by_agent): - most_used = _find_most_used_model(records, agent_id) - if most_used is None: - continue - - current_resolved = self._model_resolver.resolve_safe(most_used) - if current_resolved is None: - continue - - # Find cheapest model with sufficient context window - for candidate in all_models: - if candidate.model_id == current_resolved.model_id: - continue - if candidate.total_cost_per_1k >= current_resolved.total_cost_per_1k: - continue - if candidate.max_context < current_resolved.max_context: - continue - - suggestions.append( - RoutingSuggestion( - agent_id=agent_id, - current_model=most_used, - suggested_model=candidate.model_id, - current_cost_per_1k=round( - current_resolved.total_cost_per_1k, - BUDGET_ROUNDING_PRECISION, - ), - suggested_cost_per_1k=round( - candidate.total_cost_per_1k, - BUDGET_ROUNDING_PRECISION, - ), - reason=( - f"Switch from {most_used!r} " - f"(${current_resolved.total_cost_per_1k:.4f}/1k) " - f"to {candidate.model_id!r} " - f"(${candidate.total_cost_per_1k:.4f}/1k) " - f"— same context window, lower cost" - ), - ), - ) - break # Take first (cheapest) match per agent + suggestions = self._find_routing_suggestions(by_agent, all_models) result = RoutingOptimizationAnalysis( suggestions=tuple(suggestions), @@ -564,10 +503,170 @@ async def evaluate_operation( ) projected_alert = _compute_alert_level(projected_pct, cfg) + denial = self._check_denial( + agent_id=agent_id, + estimated_cost_usd=estimated_cost_usd, + remaining=remaining, + used_pct=used_pct, + alert_level=alert_level, + projected_cost=projected_cost, + projected_alert=projected_alert, + ) + if denial is not None: + return denial + + conditions = self._build_approval_conditions( + estimated_cost_usd=estimated_cost_usd, + projected_alert=projected_alert, + projected_pct=projected_pct, + ) + + decision = ApprovalDecision( + approved=True, + reason="Approved", + budget_remaining_usd=remaining, + budget_used_percent=used_pct, + alert_level=alert_level, + conditions=conditions, + ) + + logger.info( + CFO_APPROVAL_EVALUATED, + agent_id=agent_id, + approved=True, + estimated_cost=estimated_cost_usd, + alert_level=alert_level.value, + conditions_count=len(conditions), + ) + + return decision + + # ── Private helpers ────────────────────────────────────────── + + def _build_recommendations( + self, + *, + efficiency: EfficiencyAnalysis, + by_agent: dict[str, list[CostRecord]], + ) -> list[DowngradeRecommendation]: + """Build downgrade recommendations for inefficient agents.""" + assert self._model_resolver is not None # noqa: S101 + downgrade_map = dict(self._budget_config.auto_downgrade.downgrade_map) + recommendations: list[DowngradeRecommendation] = [] + + for agent in efficiency.agents: + if agent.efficiency_rating != EfficiencyRating.INEFFICIENT: + continue + + agent_records = by_agent.get(agent.agent_id, []) + most_used_model = _find_most_used_model(agent_records) + if most_used_model is None: + logger.debug( + CFO_DOWNGRADE_SKIPPED, + agent_id=agent.agent_id, + reason="no_most_used_model", + ) + continue + + recommendation = _build_downgrade_recommendation( + agent_id=agent.agent_id, + current_model=most_used_model, + downgrade_map=downgrade_map, + resolver=self._model_resolver, + ) + if recommendation is not None: + recommendations.append(recommendation) + logger.info( + CFO_DOWNGRADE_RECOMMENDED, + agent_id=agent.agent_id, + current_model=most_used_model, + recommended_model=recommendation.recommended_model, + estimated_savings=recommendation.estimated_savings_per_1k, + ) + + return recommendations + + def _find_routing_suggestions( + self, + by_agent: dict[str, list[CostRecord]], + all_models: tuple[ResolvedModel, ...], + ) -> list[RoutingSuggestion]: + """Find routing suggestions for all agents.""" + assert self._model_resolver is not None # noqa: S101 + suggestions: list[RoutingSuggestion] = [] + + for agent_id in sorted(by_agent): + agent_records = by_agent[agent_id] + most_used = _find_most_used_model(agent_records) + if most_used is None: + continue + + current_resolved = self._model_resolver.resolve_safe(most_used) + if current_resolved is None: + continue + + # Find cheapest model with sufficient context window + for candidate in all_models: + if candidate.model_id == current_resolved.model_id: + continue + if candidate.total_cost_per_1k >= current_resolved.total_cost_per_1k: + continue + if candidate.max_context < current_resolved.max_context: + continue + + suggestions.append( + RoutingSuggestion( + agent_id=agent_id, + current_model=most_used, + suggested_model=candidate.model_id, + current_cost_per_1k=round( + current_resolved.total_cost_per_1k, + BUDGET_ROUNDING_PRECISION, + ), + suggested_cost_per_1k=round( + candidate.total_cost_per_1k, + BUDGET_ROUNDING_PRECISION, + ), + reason=( + f"Switch from {most_used!r} " + f"(${current_resolved.total_cost_per_1k:.4f}/1k) " + f"to {candidate.model_id!r} " + f"(${candidate.total_cost_per_1k:.4f}/1k) " + f"— same context window, lower cost" + ), + ), + ) + break # Take first (cheapest) match per agent + + return suggestions + + def _check_denial( # noqa: PLR0913 + self, + *, + agent_id: str, + estimated_cost_usd: float, + remaining: float, + used_pct: float, + alert_level: BudgetAlertLevel, + projected_cost: float, + projected_alert: BudgetAlertLevel, + ) -> ApprovalDecision | None: + """Check if the operation should be denied. + + Returns the denial decision, or ``None`` if not denied. + """ auto_deny_level = self._config.approval_auto_deny_alert_level if _ALERT_LEVEL_ORDER[projected_alert] >= _ALERT_LEVEL_ORDER[auto_deny_level]: - decision = ApprovalDecision( + logger.warning( + CFO_OPERATION_DENIED, + agent_id=agent_id, + estimated_cost=estimated_cost_usd, + alert_level=alert_level.value, + projected_alert_level=projected_alert.value, + reason="alert_level_exceeded", + ) + return ApprovalDecision( approved=False, reason=( f"Denied: projected alert level {projected_alert.value} " @@ -579,22 +678,23 @@ async def evaluate_operation( alert_level=alert_level, conditions=(), ) - logger.warning( - CFO_OPERATION_DENIED, - agent_id=agent_id, - estimated_cost=estimated_cost_usd, - alert_level=alert_level.value, - projected_alert_level=projected_alert.value, - reason="alert_level_exceeded", - ) - return decision hard_stop_limit = round( - cfg.total_monthly * cfg.alerts.hard_stop_at / 100, + self._budget_config.total_monthly + * self._budget_config.alerts.hard_stop_at + / 100, BUDGET_ROUNDING_PRECISION, ) if projected_cost >= hard_stop_limit: - decision = ApprovalDecision( + logger.warning( + CFO_OPERATION_DENIED, + agent_id=agent_id, + estimated_cost=estimated_cost_usd, + projected_cost=projected_cost, + hard_stop_limit=hard_stop_limit, + reason="would_exceed_hard_stop", + ) + return ApprovalDecision( approved=False, reason=( f"Denied: projected cost ${projected_cost:.2f} " @@ -605,16 +705,17 @@ async def evaluate_operation( alert_level=alert_level, conditions=(), ) - logger.warning( - CFO_OPERATION_DENIED, - agent_id=agent_id, - estimated_cost=estimated_cost_usd, - projected_cost=projected_cost, - hard_stop_limit=hard_stop_limit, - reason="would_exceed_hard_stop", - ) - return decision + return None + + def _build_approval_conditions( + self, + *, + estimated_cost_usd: float, + projected_alert: BudgetAlertLevel, + projected_pct: float, + ) -> tuple[str, ...]: + """Build warning conditions for an approved operation.""" conditions: list[str] = [] warn_threshold = self._config.approval_warn_threshold_usd if estimated_cost_usd >= warn_threshold: @@ -623,32 +724,12 @@ async def evaluate_operation( f"(threshold: ${warn_threshold:.2f})" ) - if alert_level in (BudgetAlertLevel.WARNING, BudgetAlertLevel.CRITICAL): + if projected_alert in (BudgetAlertLevel.WARNING, BudgetAlertLevel.CRITICAL): conditions.append( - f"Budget alert level is {alert_level.value} ({used_pct:.1f}% used)" + f"Budget alert level is {projected_alert.value} " + f"({projected_pct:.1f}% projected)" ) - - decision = ApprovalDecision( - approved=True, - reason="Approved", - budget_remaining_usd=remaining, - budget_used_percent=used_pct, - alert_level=alert_level, - conditions=tuple(conditions), - ) - - logger.info( - CFO_APPROVAL_EVALUATED, - agent_id=agent_id, - approved=True, - estimated_cost=estimated_cost_usd, - alert_level=alert_level.value, - conditions_count=len(conditions), - ) - - return decision - - # ── Private helpers ────────────────────────────────────────── + return tuple(conditions) async def _compute_budget_pressure(self) -> float: """Compute current budget utilization percentage.""" diff --git a/src/ai_company/budget/reports.py b/src/ai_company/budget/reports.py index 661edbdd61..914009ed24 100644 --- a/src/ai_company/budget/reports.py +++ b/src/ai_company/budget/reports.py @@ -22,6 +22,7 @@ from ai_company.observability.events.cfo import ( CFO_REPORT_GENERATED, CFO_REPORT_GENERATOR_CREATED, + CFO_REPORT_VALIDATION_ERROR, ) if TYPE_CHECKING: @@ -243,9 +244,9 @@ async def generate_report( ) -> SpendingReport: """Generate a spending report for the given period. - Uses a single ``get_records`` snapshot and derives the summary - from the same data to avoid race conditions between separate - ``build_summary`` and ``get_records`` calls. + Fetches records and summary concurrently; derives ``total_cost`` + from the records snapshot for consistent distribution + percentages. Args: start: Inclusive period start. @@ -261,15 +262,25 @@ async def generate_report( ValueError: If ``start >= end`` or ``top_n < 1``. """ if start >= end: + logger.warning( + CFO_REPORT_VALIDATION_ERROR, + error="start_after_end", + start=start.isoformat(), + end=end.isoformat(), + ) msg = f"start ({start.isoformat()}) must be before end ({end.isoformat()})" raise ValueError(msg) if top_n < 1: + logger.warning( + CFO_REPORT_VALIDATION_ERROR, + error="top_n_below_minimum", + top_n=top_n, + ) msg = f"top_n must be >= 1, got {top_n}" raise ValueError(msg) now = datetime.now(UTC) - # Single snapshot to avoid double-fetch race condition (#3) records = await self._cost_tracker.get_records( start=start, end=end, @@ -279,7 +290,11 @@ async def generate_report( end=end, ) - total_cost = summary.period.total_cost_usd + # Derive total_cost from records for consistent percentages + total_cost = round( + math.fsum(r.cost_usd for r in records), + BUDGET_ROUNDING_PRECISION, + ) by_task = _build_task_spendings(records) by_provider = _build_provider_distribution(records, total_cost) by_model = _build_model_distribution(records, total_cost) diff --git a/src/ai_company/observability/events/cfo.py b/src/ai_company/observability/events/cfo.py index 8dd33f268c..46625b0d05 100644 --- a/src/ai_company/observability/events/cfo.py +++ b/src/ai_company/observability/events/cfo.py @@ -15,3 +15,4 @@ CFO_RESOLVER_MISSING: Final[str] = "cfo.resolver.missing" CFO_INSUFFICIENT_WINDOWS: Final[str] = "cfo.anomaly.insufficient_windows" CFO_ROUTING_OPTIMIZATION_COMPLETE: Final[str] = "cfo.routing.optimization_complete" +CFO_REPORT_VALIDATION_ERROR: Final[str] = "cfo.report.validation_error" diff --git a/tests/unit/budget/conftest.py b/tests/unit/budget/conftest.py index 2a7a8fce0f..02da4aa185 100644 --- a/tests/unit/budget/conftest.py +++ b/tests/unit/budget/conftest.py @@ -35,6 +35,8 @@ SpendingSummary, ) from ai_company.budget.tracker import CostTracker +from ai_company.providers.routing.models import ResolvedModel +from ai_company.providers.routing.resolver import ModelResolver if TYPE_CHECKING: from collections.abc import Callable @@ -307,6 +309,66 @@ def make_cost_record( # noqa: PLR0913 ) +# ── CostOptimizer test helpers ─────────────────────────────────── + +OPT_START = datetime(2026, 2, 1, tzinfo=UTC) +OPT_END = datetime(2026, 3, 1, tzinfo=UTC) + + +def make_optimizer( + *, + budget_config: BudgetConfig | None = None, + config: CostOptimizerConfig | None = None, + model_resolver: ModelResolver | None = None, +) -> tuple[CostOptimizer, CostTracker]: + """Build a CostOptimizer with a fresh CostTracker.""" + bc = budget_config or BudgetConfig(total_monthly=100.0) + tracker = CostTracker(budget_config=bc) + optimizer = CostOptimizer( + cost_tracker=tracker, + budget_config=bc, + config=config, + model_resolver=model_resolver, + ) + return optimizer, tracker + + +def make_resolver( + models: list[ResolvedModel] | None = None, +) -> ModelResolver: + """Build a ModelResolver from a list of ResolvedModel.""" + if models is None: + models = [ + ResolvedModel( + provider_name="test-provider", + model_id="test-large-001", + alias="large", + cost_per_1k_input=0.03, + cost_per_1k_output=0.06, + ), + ResolvedModel( + provider_name="test-provider", + model_id="test-medium-001", + alias="medium", + cost_per_1k_input=0.01, + cost_per_1k_output=0.02, + ), + ResolvedModel( + provider_name="test-provider", + model_id="test-small-001", + alias="small", + cost_per_1k_input=0.001, + cost_per_1k_output=0.002, + ), + ] + index: dict[str, ResolvedModel] = {} + for m in models: + index[m.model_id] = m + if m.alias is not None: + index[m.alias] = m + return ModelResolver(index) + + # ── CFO / CostOptimizer fixtures ───────────────────────────────── diff --git a/tests/unit/budget/test_optimizer.py b/tests/unit/budget/test_optimizer.py index 9d9b293371..89ca458583 100644 --- a/tests/unit/budget/test_optimizer.py +++ b/tests/unit/budget/test_optimizer.py @@ -1,83 +1,13 @@ -"""Tests for CostOptimizer service.""" - -from datetime import UTC, datetime, timedelta +"""Tests for CostOptimizer — init, classify severity, input validation.""" import pytest from ai_company.budget._optimizer_helpers import _classify_severity -from ai_company.budget.config import BudgetAlertConfig, BudgetConfig -from ai_company.budget.enums import BudgetAlertLevel -from ai_company.budget.optimizer import CostOptimizer from ai_company.budget.optimizer_models import ( AnomalySeverity, - AnomalyType, CostOptimizerConfig, - EfficiencyRating, ) -from ai_company.budget.tracker import CostTracker -from ai_company.providers.routing.models import ResolvedModel -from ai_company.providers.routing.resolver import ModelResolver -from tests.unit.budget.conftest import make_cost_record - -# ── Helpers ─────────────────────────────────────────────────────── - -_START = datetime(2026, 2, 1, tzinfo=UTC) -_END = datetime(2026, 3, 1, tzinfo=UTC) - - -def _make_optimizer( - *, - budget_config: BudgetConfig | None = None, - config: CostOptimizerConfig | None = None, - model_resolver: ModelResolver | None = None, -) -> tuple[CostOptimizer, CostTracker]: - """Build a CostOptimizer with a fresh CostTracker.""" - bc = budget_config or BudgetConfig(total_monthly=100.0) - tracker = CostTracker(budget_config=bc) - optimizer = CostOptimizer( - cost_tracker=tracker, - budget_config=bc, - config=config, - model_resolver=model_resolver, - ) - return optimizer, tracker - - -def _make_resolver( - models: list[ResolvedModel] | None = None, -) -> ModelResolver: - """Build a ModelResolver from a list of ResolvedModel.""" - if models is None: - models = [ - ResolvedModel( - provider_name="test-provider", - model_id="test-large-001", - alias="large", - cost_per_1k_input=0.03, - cost_per_1k_output=0.06, - ), - ResolvedModel( - provider_name="test-provider", - model_id="test-medium-001", - alias="medium", - cost_per_1k_input=0.01, - cost_per_1k_output=0.02, - ), - ResolvedModel( - provider_name="test-provider", - model_id="test-small-001", - alias="small", - cost_per_1k_input=0.001, - cost_per_1k_output=0.002, - ), - ] - index: dict[str, ResolvedModel] = {} - for m in models: - index[m.model_id] = m - if m.alias is not None: - index[m.alias] = m - return ModelResolver(index) - +from tests.unit.budget.conftest import OPT_END, OPT_START, make_optimizer # ── Init Tests ──────────────────────────────────────────────────── @@ -85,514 +15,15 @@ def _make_resolver( @pytest.mark.unit class TestInit: async def test_defaults(self) -> None: - optimizer, _ = _make_optimizer() + optimizer, _ = make_optimizer() assert optimizer._config == CostOptimizerConfig() async def test_custom_config(self) -> None: cfg = CostOptimizerConfig(anomaly_sigma_threshold=3.0) - optimizer, _ = _make_optimizer(config=cfg) + optimizer, _ = make_optimizer(config=cfg) assert optimizer._config.anomaly_sigma_threshold == 3.0 -# ── Anomaly Detection Tests ────────────────────────────────────── - - -@pytest.mark.unit -class TestDetectAnomalies: - async def test_no_records_empty_result(self) -> None: - optimizer, _ = _make_optimizer() - result = await optimizer.detect_anomalies(start=_START, end=_END) - assert result.anomalies == () - assert result.agents_scanned == 0 - - async def test_normal_spending_no_anomalies(self) -> None: - optimizer, tracker = _make_optimizer() - # Create uniform spending across 5 windows - window_duration = (_END - _START) / 5 - for i in range(5): - ts = _START + window_duration * i + timedelta(hours=1) - await tracker.record( - make_cost_record(agent_id="alice", cost_usd=1.0, timestamp=ts), - ) - - result = await optimizer.detect_anomalies(start=_START, end=_END) - assert result.anomalies == () - assert result.agents_scanned == 1 - - async def test_spike_detected(self) -> None: - optimizer, tracker = _make_optimizer() - window_duration = (_END - _START) / 5 - - # Normal spending in first 4 windows - for i in range(4): - ts = _START + window_duration * i + timedelta(hours=1) - await tracker.record( - make_cost_record(agent_id="alice", cost_usd=1.0, timestamp=ts), - ) - - # Spike in last window - ts = _START + window_duration * 4 + timedelta(hours=1) - await tracker.record( - make_cost_record(agent_id="alice", cost_usd=20.0, timestamp=ts), - ) - - result = await optimizer.detect_anomalies(start=_START, end=_END) - assert len(result.anomalies) == 1 - anomaly = result.anomalies[0] - assert anomaly.agent_id == "alice" - assert anomaly.anomaly_type == AnomalyType.SPIKE - assert anomaly.current_value == 20.0 - - async def test_insufficient_windows_no_false_positive(self) -> None: - config = CostOptimizerConfig(min_anomaly_windows=5) - optimizer, tracker = _make_optimizer(config=config) - - # Only 3 windows of data in a 3-window analysis - window_duration = (_END - _START) / 3 - for i in range(3): - ts = _START + window_duration * i + timedelta(hours=1) - cost = 1.0 if i < 2 else 50.0 - await tracker.record( - make_cost_record(agent_id="alice", cost_usd=cost, timestamp=ts), - ) - - result = await optimizer.detect_anomalies( - start=_START, - end=_END, - window_count=3, - ) - assert result.anomalies == () - - async def test_multiple_agents_only_anomalous_flagged(self) -> None: - optimizer, tracker = _make_optimizer() - window_duration = (_END - _START) / 5 - - # Alice: uniform spending - for i in range(5): - ts = _START + window_duration * i + timedelta(hours=1) - await tracker.record( - make_cost_record(agent_id="alice", cost_usd=1.0, timestamp=ts), - ) - - # Bob: spike in last window - for i in range(4): - ts = _START + window_duration * i + timedelta(hours=1) - await tracker.record( - make_cost_record(agent_id="bob", cost_usd=1.0, timestamp=ts), - ) - ts = _START + window_duration * 4 + timedelta(hours=1) - await tracker.record( - make_cost_record(agent_id="bob", cost_usd=20.0, timestamp=ts), - ) - - result = await optimizer.detect_anomalies(start=_START, end=_END) - assert len(result.anomalies) == 1 - assert result.anomalies[0].agent_id == "bob" - assert result.agents_scanned == 2 - - async def test_window_count_validation(self) -> None: - optimizer, _ = _make_optimizer() - with pytest.raises(ValueError, match="window_count must be >= 2"): - await optimizer.detect_anomalies( - start=_START, - end=_END, - window_count=1, - ) - - async def test_spike_from_zero_baseline(self) -> None: - """Agent with no historical spending that suddenly appears.""" - optimizer, tracker = _make_optimizer( - config=CostOptimizerConfig(min_anomaly_windows=3), - ) - window_duration = (_END - _START) / 5 - - # No spending in first 4 windows, spending in window 5 - ts = _START + window_duration * 4 + timedelta(hours=1) - await tracker.record( - make_cost_record(agent_id="alice", cost_usd=5.0, timestamp=ts), - ) - - result = await optimizer.detect_anomalies(start=_START, end=_END) - assert len(result.anomalies) == 1 - anomaly = result.anomalies[0] - assert anomaly.severity == AnomalySeverity.HIGH - assert anomaly.baseline_value == 0.0 - - async def test_spike_severity_with_zero_stddev(self) -> None: - """Spike severity uses spike_ratio when stddev is 0.""" - optimizer, tracker = _make_optimizer( - config=CostOptimizerConfig( - anomaly_sigma_threshold=2.0, - anomaly_spike_factor=2.0, - min_anomaly_windows=3, - ), - ) - window_duration = (_END - _START) / 5 - - # Identical baseline → stddev=0 - for i in range(4): - ts = _START + window_duration * i + timedelta(hours=1) - await tracker.record( - make_cost_record(agent_id="alice", cost_usd=1.0, timestamp=ts), - ) - - # Spike: 4x baseline → spike_ratio=4.0 → HIGH (>=3.0) - ts = _START + window_duration * 4 + timedelta(hours=1) - await tracker.record( - make_cost_record(agent_id="alice", cost_usd=4.0, timestamp=ts), - ) - - result = await optimizer.detect_anomalies(start=_START, end=_END) - assert len(result.anomalies) == 1 - assert result.anomalies[0].severity == AnomalySeverity.HIGH - - -# ── Efficiency Analysis Tests ───────────────────────────────────── - - -@pytest.mark.unit -class TestAnalyzeEfficiency: - async def test_uniform_all_normal(self) -> None: - optimizer, tracker = _make_optimizer() - - # Same cost/token ratio for all agents - for agent in ("alice", "bob", "carol"): - await tracker.record( - make_cost_record( - agent_id=agent, - cost_usd=1.0, - input_tokens=1000, - output_tokens=0, - timestamp=_START + timedelta(hours=1), - ), - ) - - result = await optimizer.analyze_efficiency(start=_START, end=_END) - assert all( - a.efficiency_rating == EfficiencyRating.NORMAL for a in result.agents - ) - assert result.inefficient_agent_count == 0 - - async def test_one_inefficient(self) -> None: - optimizer, tracker = _make_optimizer() - - # Alice: cheap (1.0/1000 = 1.0 per 1k) - await tracker.record( - make_cost_record( - agent_id="alice", - cost_usd=1.0, - input_tokens=1000, - output_tokens=0, - timestamp=_START + timedelta(hours=1), - ), - ) - # Bob: expensive (10.0/1000 = 10.0 per 1k) - await tracker.record( - make_cost_record( - agent_id="bob", - cost_usd=10.0, - input_tokens=1000, - output_tokens=0, - timestamp=_START + timedelta(hours=1), - ), - ) - - result = await optimizer.analyze_efficiency(start=_START, end=_END) - assert result.inefficient_agent_count == 1 - # Sorted by cost_per_1k desc - assert result.agents[0].agent_id == "bob" - assert result.agents[0].efficiency_rating == EfficiencyRating.INEFFICIENT - - async def test_zero_tokens_handled(self) -> None: - optimizer, tracker = _make_optimizer() - - await tracker.record( - make_cost_record( - agent_id="alice", - cost_usd=0.0, - input_tokens=0, - output_tokens=0, - timestamp=_START + timedelta(hours=1), - ), - ) - - result = await optimizer.analyze_efficiency(start=_START, end=_END) - assert len(result.agents) == 1 - assert result.agents[0].cost_per_1k_tokens == 0.0 - assert result.agents[0].efficiency_rating == EfficiencyRating.NORMAL - - async def test_efficient_agent_flagged(self) -> None: - optimizer, tracker = _make_optimizer() - - # Alice: very cheap (0.1/10000 = 0.01 per 1k) - await tracker.record( - make_cost_record( - agent_id="alice", - cost_usd=0.1, - input_tokens=10000, - output_tokens=0, - timestamp=_START + timedelta(hours=1), - ), - ) - # Bob: normal (1.0/1000 = 1.0 per 1k) - await tracker.record( - make_cost_record( - agent_id="bob", - cost_usd=1.0, - input_tokens=1000, - output_tokens=0, - timestamp=_START + timedelta(hours=1), - ), - ) - # Carol: normal (1.0/1000 = 1.0 per 1k) - await tracker.record( - make_cost_record( - agent_id="carol", - cost_usd=1.0, - input_tokens=1000, - output_tokens=0, - timestamp=_START + timedelta(hours=1), - ), - ) - - result = await optimizer.analyze_efficiency(start=_START, end=_END) - alice = next(a for a in result.agents if a.agent_id == "alice") - assert alice.efficiency_rating == EfficiencyRating.EFFICIENT - - async def test_empty_records(self) -> None: - optimizer, _ = _make_optimizer() - result = await optimizer.analyze_efficiency(start=_START, end=_END) - assert result.agents == () - assert result.global_avg_cost_per_1k == 0.0 - - -# ── Downgrade Recommendation Tests ──────────────────────────────── - - -@pytest.mark.unit -class TestRecommendDowngrades: - async def test_no_resolver_empty_result(self) -> None: - optimizer, _ = _make_optimizer() - result = await optimizer.recommend_downgrades(start=_START, end=_END) - assert result.recommendations == () - - async def test_with_downgrade_path(self) -> None: - from ai_company.budget.config import AutoDowngradeConfig - - resolver = _make_resolver() - bc = BudgetConfig( - total_monthly=100.0, - auto_downgrade=AutoDowngradeConfig( - enabled=True, - threshold=80, - downgrade_map=(("large", "small"),), - ), - ) - tracker = CostTracker(budget_config=bc) - optimizer = CostOptimizer( - cost_tracker=tracker, - budget_config=bc, - model_resolver=resolver, - ) - - # Make alice inefficient using large model - await tracker.record( - make_cost_record( - agent_id="alice", - model="test-large-001", - cost_usd=10.0, - input_tokens=1000, - output_tokens=0, - timestamp=_START + timedelta(hours=1), - ), - ) - # Make bob efficient using small model - await tracker.record( - make_cost_record( - agent_id="bob", - model="test-small-001", - cost_usd=0.1, - input_tokens=1000, - output_tokens=0, - timestamp=_START + timedelta(hours=1), - ), - ) - - result = await optimizer.recommend_downgrades(start=_START, end=_END) - assert len(result.recommendations) == 1 - rec = result.recommendations[0] - assert rec.agent_id == "alice" - assert rec.current_model == "test-large-001" - assert rec.recommended_model == "test-small-001" - assert rec.estimated_savings_per_1k > 0 - - async def test_no_cheaper_model_empty(self) -> None: - """No recommendation when agent already uses cheapest model.""" - resolver = _make_resolver( - [ - ResolvedModel( - provider_name="test-provider", - model_id="test-only-001", - alias="only", - cost_per_1k_input=0.01, - cost_per_1k_output=0.02, - ), - ] - ) - bc = BudgetConfig(total_monthly=100.0) - tracker = CostTracker(budget_config=bc) - optimizer = CostOptimizer( - cost_tracker=tracker, - budget_config=bc, - model_resolver=resolver, - ) - - # Only agent, only model — inefficient by default since it's the only one - await tracker.record( - make_cost_record( - agent_id="alice", - model="test-only-001", - cost_usd=10.0, - input_tokens=1000, - output_tokens=0, - timestamp=_START + timedelta(hours=1), - ), - ) - - result = await optimizer.recommend_downgrades(start=_START, end=_END) - assert result.recommendations == () - - -# ── Evaluate Operation Tests ────────────────────────────────────── - - -@pytest.mark.unit -class TestEvaluateOperation: - async def test_healthy_budget_approved(self) -> None: - optimizer, tracker = _make_optimizer() - # Spend only 10% of budget - await tracker.record( - make_cost_record(cost_usd=10.0, timestamp=_START + timedelta(hours=1)), - ) - decision = await optimizer.evaluate_operation( - agent_id="alice", - estimated_cost_usd=0.5, - now=_START + timedelta(days=15), - ) - assert decision.approved is True - assert decision.alert_level == BudgetAlertLevel.NORMAL - - async def test_hard_stop_denied(self) -> None: - bc = BudgetConfig( - total_monthly=100.0, - alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), - ) - optimizer, tracker = _make_optimizer(budget_config=bc) - - # Spend 100% of budget - await tracker.record( - make_cost_record(cost_usd=100.0, timestamp=_START + timedelta(hours=1)), - ) - - decision = await optimizer.evaluate_operation( - agent_id="alice", - estimated_cost_usd=1.0, - now=_START + timedelta(days=15), - ) - assert decision.approved is False - assert decision.alert_level == BudgetAlertLevel.HARD_STOP - - async def test_would_exceed_budget_denied(self) -> None: - bc = BudgetConfig( - total_monthly=100.0, - alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), - ) - optimizer, tracker = _make_optimizer(budget_config=bc) - - # Spend 95% and request 10 more → projected 105% → HARD_STOP - await tracker.record( - make_cost_record(cost_usd=95.0, timestamp=_START + timedelta(hours=1)), - ) - - decision = await optimizer.evaluate_operation( - agent_id="alice", - estimated_cost_usd=10.0, - now=_START + timedelta(days=15), - ) - assert decision.approved is False - # With projected alert level, this now triggers auto-deny - assert "denied" in decision.reason.lower() - - async def test_warning_level_approved_with_conditions(self) -> None: - bc = BudgetConfig( - total_monthly=100.0, - alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), - ) - optimizer, tracker = _make_optimizer(budget_config=bc) - - # Spend 80% (warning level) - await tracker.record( - make_cost_record(cost_usd=80.0, timestamp=_START + timedelta(hours=1)), - ) - - decision = await optimizer.evaluate_operation( - agent_id="alice", - estimated_cost_usd=2.0, - now=_START + timedelta(days=15), - ) - assert decision.approved is True - assert decision.alert_level == BudgetAlertLevel.WARNING - assert len(decision.conditions) > 0 - - async def test_budget_enforcement_disabled(self) -> None: - bc = BudgetConfig(total_monthly=0.0) - optimizer, _ = _make_optimizer(budget_config=bc) - - decision = await optimizer.evaluate_operation( - agent_id="alice", - estimated_cost_usd=100.0, - ) - assert decision.approved is True - assert "disabled" in decision.reason.lower() - - async def test_critical_level_auto_deny_with_custom_config(self) -> None: - """Auto-deny at CRITICAL when configured.""" - bc = BudgetConfig( - total_monthly=100.0, - alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), - ) - config = CostOptimizerConfig( - approval_auto_deny_alert_level=BudgetAlertLevel.CRITICAL, - ) - optimizer, tracker = _make_optimizer(budget_config=bc, config=config) - - # Spend 92% (critical level) - await tracker.record( - make_cost_record(cost_usd=92.0, timestamp=_START + timedelta(hours=1)), - ) - - decision = await optimizer.evaluate_operation( - agent_id="alice", - estimated_cost_usd=0.01, - now=_START + timedelta(days=15), - ) - assert decision.approved is False - assert decision.alert_level == BudgetAlertLevel.CRITICAL - - async def test_high_cost_condition(self) -> None: - """High-cost warning condition when estimated cost >= threshold.""" - config = CostOptimizerConfig(approval_warn_threshold_usd=0.5) - optimizer, _ = _make_optimizer(config=config) - - decision = await optimizer.evaluate_operation( - agent_id="alice", - estimated_cost_usd=1.0, - now=_START + timedelta(days=15), - ) - assert decision.approved is True - assert any("High-cost" in c for c in decision.conditions) - - # ── _classify_severity Tests ───────────────────────────────────── @@ -622,279 +53,16 @@ def test_thresholds(self, deviation: float, expected: AnomalySeverity) -> None: @pytest.mark.unit class TestInputValidation: async def test_detect_anomalies_start_after_end(self) -> None: - optimizer, _ = _make_optimizer() + optimizer, _ = make_optimizer() with pytest.raises(ValueError, match=r"start .* must be before end"): - await optimizer.detect_anomalies(start=_END, end=_START) + await optimizer.detect_anomalies(start=OPT_END, end=OPT_START) async def test_analyze_efficiency_start_after_end(self) -> None: - optimizer, _ = _make_optimizer() + optimizer, _ = make_optimizer() with pytest.raises(ValueError, match=r"start .* must be before end"): - await optimizer.analyze_efficiency(start=_END, end=_START) + await optimizer.analyze_efficiency(start=OPT_END, end=OPT_START) async def test_recommend_downgrades_start_after_end(self) -> None: - optimizer, _ = _make_optimizer() + optimizer, _ = make_optimizer() with pytest.raises(ValueError, match=r"start .* must be before end"): - await optimizer.recommend_downgrades(start=_END, end=_START) - - -# ── Edge Case Tests ────────────────────────────────────────────── - - -@pytest.mark.unit -class TestEdgeCases: - async def test_find_cheaper_model_picks_cheapest(self) -> None: - """_find_cheaper_model selects the overall cheapest below current.""" - resolver = _make_resolver() - result = await _make_optimizer(model_resolver=resolver)[0].recommend_downgrades( - start=_START, end=_END - ) - # No records → no recommendations, but validates the path - assert result.recommendations == () - - async def test_budget_pressure_percent_reflects_spending(self) -> None: - """budget_pressure_percent reflects actual spend vs budget.""" - from ai_company.budget.billing import billing_period_start - - resolver = _make_resolver() - bc = BudgetConfig(total_monthly=100.0) - tracker = CostTracker(budget_config=bc) - optimizer = CostOptimizer( - cost_tracker=tracker, - budget_config=bc, - model_resolver=resolver, - ) - # Record in the current billing period so pressure reflects it - now = datetime.now(UTC) - period_start = billing_period_start(bc.reset_day, now=now) - await tracker.record( - make_cost_record( - cost_usd=60.0, - timestamp=period_start + timedelta(hours=1), - ), - ) - # Use a period that covers the data for the efficiency analysis - analysis_start = period_start - analysis_end = now + timedelta(days=1) - result = await optimizer.recommend_downgrades( - start=analysis_start, end=analysis_end - ) - assert result.budget_pressure_percent == 60.0 - - async def test_downgrade_target_not_resolved(self) -> None: - """No recommendation when downgrade target doesn't resolve.""" - from ai_company.budget.config import AutoDowngradeConfig - - resolver = _make_resolver( - [ - ResolvedModel( - provider_name="test-provider", - model_id="test-large-001", - alias="large", - cost_per_1k_input=0.03, - cost_per_1k_output=0.06, - ), - ] - ) - bc = BudgetConfig( - total_monthly=100.0, - auto_downgrade=AutoDowngradeConfig( - enabled=True, - threshold=80, - downgrade_map=(("large", "nonexistent"),), - ), - ) - tracker = CostTracker(budget_config=bc) - optimizer = CostOptimizer( - cost_tracker=tracker, - budget_config=bc, - model_resolver=resolver, - ) - - # Make alice inefficient (only agent, but needs another to set avg) - await tracker.record( - make_cost_record( - agent_id="alice", - model="test-large-001", - cost_usd=10.0, - input_tokens=1000, - output_tokens=0, - timestamp=_START + timedelta(hours=1), - ), - ) - await tracker.record( - make_cost_record( - agent_id="bob", - model="test-large-001", - cost_usd=0.1, - input_tokens=1000, - output_tokens=0, - timestamp=_START + timedelta(hours=1), - ), - ) - - result = await optimizer.recommend_downgrades(start=_START, end=_END) - # Target "nonexistent" can't be resolved → no recommendation - assert result.recommendations == () - - async def test_negative_estimated_cost_rejected(self) -> None: - """Negative estimated_cost_usd raises ValueError.""" - optimizer, _ = _make_optimizer() - with pytest.raises(ValueError, match="estimated_cost_usd must be >= 0"): - await optimizer.evaluate_operation( - agent_id="alice", - estimated_cost_usd=-1.0, - ) - - async def test_window_count_upper_bound(self) -> None: - """window_count > 1000 raises ValueError.""" - optimizer, _ = _make_optimizer() - with pytest.raises(ValueError, match="window_count must be <= 1000"): - await optimizer.detect_anomalies( - start=_START, - end=_END, - window_count=1001, - ) - - async def test_projected_alert_level_used_for_auto_deny(self) -> None: - """Auto-deny uses projected alert level, not current.""" - bc = BudgetConfig( - total_monthly=100.0, - alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), - ) - config = CostOptimizerConfig( - approval_auto_deny_alert_level=BudgetAlertLevel.HARD_STOP, - ) - optimizer, tracker = _make_optimizer(budget_config=bc, config=config) - - # Spend 95% — current alert is CRITICAL, but requesting 10 - # would push to 105% → projected HARD_STOP → denied - await tracker.record( - make_cost_record(cost_usd=95.0, timestamp=_START + timedelta(hours=1)), - ) - - decision = await optimizer.evaluate_operation( - agent_id="alice", - estimated_cost_usd=10.0, - now=_START + timedelta(days=15), - ) - assert decision.approved is False - assert "projected" in decision.reason.lower() - - -# ── Routing Optimization Tests ────────────────────────────────── - - -@pytest.mark.unit -class TestSuggestRoutingOptimizations: - async def test_no_resolver_empty_result(self) -> None: - optimizer, _ = _make_optimizer() - result = await optimizer.suggest_routing_optimizations( - start=_START, - end=_END, - ) - assert result.suggestions == () - assert result.agents_analyzed == 0 - - async def test_no_records_empty_suggestions(self) -> None: - resolver = _make_resolver() - optimizer, _ = _make_optimizer(model_resolver=resolver) - result = await optimizer.suggest_routing_optimizations( - start=_START, - end=_END, - ) - assert result.suggestions == () - assert result.agents_analyzed == 0 - - async def test_suggests_cheaper_model(self) -> None: - resolver = _make_resolver() - optimizer, tracker = _make_optimizer(model_resolver=resolver) - - # Alice uses the expensive large model - await tracker.record( - make_cost_record( - agent_id="alice", - model="test-large-001", - cost_usd=5.0, - input_tokens=1000, - output_tokens=500, - timestamp=_START + timedelta(hours=1), - ), - ) - - result = await optimizer.suggest_routing_optimizations( - start=_START, - end=_END, - ) - assert len(result.suggestions) == 1 - suggestion = result.suggestions[0] - assert suggestion.agent_id == "alice" - assert suggestion.current_model == "test-large-001" - assert suggestion.estimated_savings_per_1k > 0 - assert result.total_estimated_savings_per_1k > 0 - - async def test_no_suggestion_for_cheapest_model(self) -> None: - resolver = _make_resolver() - optimizer, tracker = _make_optimizer(model_resolver=resolver) - - # Alice already uses the cheapest model - await tracker.record( - make_cost_record( - agent_id="alice", - model="test-small-001", - cost_usd=0.1, - input_tokens=1000, - output_tokens=500, - timestamp=_START + timedelta(hours=1), - ), - ) - - result = await optimizer.suggest_routing_optimizations( - start=_START, - end=_END, - ) - assert result.suggestions == () - assert result.agents_analyzed == 1 - - async def test_start_after_end_rejected(self) -> None: - optimizer, _ = _make_optimizer() - with pytest.raises(ValueError, match=r"start .* must be before end"): - await optimizer.suggest_routing_optimizations(start=_END, end=_START) - - async def test_context_window_respected(self) -> None: - """Suggestions only include models with sufficient context window.""" - models = [ - ResolvedModel( - provider_name="test-provider", - model_id="test-large-001", - alias="large", - cost_per_1k_input=0.03, - cost_per_1k_output=0.06, - max_context=200000, - ), - ResolvedModel( - provider_name="test-provider", - model_id="test-small-001", - alias="small", - cost_per_1k_input=0.001, - cost_per_1k_output=0.002, - max_context=50000, # Smaller context than large - ), - ] - resolver = _make_resolver(models) - optimizer, tracker = _make_optimizer(model_resolver=resolver) - - await tracker.record( - make_cost_record( - agent_id="alice", - model="test-large-001", - cost_usd=5.0, - timestamp=_START + timedelta(hours=1), - ), - ) - - result = await optimizer.suggest_routing_optimizations( - start=_START, - end=_END, - ) - # small has insufficient context window → no suggestion - assert result.suggestions == () + await optimizer.recommend_downgrades(start=OPT_END, end=OPT_START) diff --git a/tests/unit/budget/test_optimizer_analysis.py b/tests/unit/budget/test_optimizer_analysis.py new file mode 100644 index 0000000000..3994fa78fd --- /dev/null +++ b/tests/unit/budget/test_optimizer_analysis.py @@ -0,0 +1,283 @@ +"""Tests for CostOptimizer — anomaly detection and efficiency analysis.""" + +from datetime import timedelta + +import pytest + +from ai_company.budget.optimizer_models import ( + AnomalySeverity, + AnomalyType, + CostOptimizerConfig, + EfficiencyRating, +) +from tests.unit.budget.conftest import ( + OPT_END, + OPT_START, + make_cost_record, + make_optimizer, +) + +# ── Anomaly Detection Tests ────────────────────────────────────── + + +@pytest.mark.unit +class TestDetectAnomalies: + async def test_no_records_empty_result(self) -> None: + optimizer, _ = make_optimizer() + result = await optimizer.detect_anomalies(start=OPT_START, end=OPT_END) + assert result.anomalies == () + assert result.agents_scanned == 0 + + async def test_normal_spending_no_anomalies(self) -> None: + optimizer, tracker = make_optimizer() + window_duration = (OPT_END - OPT_START) / 5 + for i in range(5): + ts = OPT_START + window_duration * i + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=1.0, timestamp=ts), + ) + + result = await optimizer.detect_anomalies(start=OPT_START, end=OPT_END) + assert result.anomalies == () + assert result.agents_scanned == 1 + + async def test_spike_detected(self) -> None: + optimizer, tracker = make_optimizer() + window_duration = (OPT_END - OPT_START) / 5 + + for i in range(4): + ts = OPT_START + window_duration * i + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=1.0, timestamp=ts), + ) + + ts = OPT_START + window_duration * 4 + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=20.0, timestamp=ts), + ) + + result = await optimizer.detect_anomalies(start=OPT_START, end=OPT_END) + assert len(result.anomalies) == 1 + anomaly = result.anomalies[0] + assert anomaly.agent_id == "alice" + assert anomaly.anomaly_type == AnomalyType.SPIKE + assert anomaly.current_value == 20.0 + + async def test_insufficient_windows_no_false_positive(self) -> None: + config = CostOptimizerConfig(min_anomaly_windows=5) + optimizer, tracker = make_optimizer(config=config) + + window_duration = (OPT_END - OPT_START) / 3 + for i in range(3): + ts = OPT_START + window_duration * i + timedelta(hours=1) + cost = 1.0 if i < 2 else 50.0 + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=cost, timestamp=ts), + ) + + result = await optimizer.detect_anomalies( + start=OPT_START, + end=OPT_END, + window_count=3, + ) + assert result.anomalies == () + + async def test_multiple_agents_only_anomalous_flagged(self) -> None: + optimizer, tracker = make_optimizer() + window_duration = (OPT_END - OPT_START) / 5 + + for i in range(5): + ts = OPT_START + window_duration * i + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=1.0, timestamp=ts), + ) + + for i in range(4): + ts = OPT_START + window_duration * i + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="bob", cost_usd=1.0, timestamp=ts), + ) + ts = OPT_START + window_duration * 4 + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="bob", cost_usd=20.0, timestamp=ts), + ) + + result = await optimizer.detect_anomalies(start=OPT_START, end=OPT_END) + assert len(result.anomalies) == 1 + assert result.anomalies[0].agent_id == "bob" + assert result.agents_scanned == 2 + + async def test_window_count_validation(self) -> None: + optimizer, _ = make_optimizer() + with pytest.raises(ValueError, match="window_count must be >= 2"): + await optimizer.detect_anomalies( + start=OPT_START, + end=OPT_END, + window_count=1, + ) + + async def test_spike_from_zero_baseline(self) -> None: + """Agent with no historical spending that suddenly appears.""" + optimizer, tracker = make_optimizer( + config=CostOptimizerConfig(min_anomaly_windows=3), + ) + window_duration = (OPT_END - OPT_START) / 5 + + ts = OPT_START + window_duration * 4 + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=5.0, timestamp=ts), + ) + + result = await optimizer.detect_anomalies(start=OPT_START, end=OPT_END) + assert len(result.anomalies) == 1 + anomaly = result.anomalies[0] + assert anomaly.severity == AnomalySeverity.HIGH + assert anomaly.baseline_value == 0.0 + + async def test_spike_severity_with_zero_stddev(self) -> None: + """Spike severity uses spike_ratio when stddev is 0.""" + optimizer, tracker = make_optimizer( + config=CostOptimizerConfig( + anomaly_sigma_threshold=2.0, + anomaly_spike_factor=2.0, + min_anomaly_windows=3, + ), + ) + window_duration = (OPT_END - OPT_START) / 5 + + for i in range(4): + ts = OPT_START + window_duration * i + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=1.0, timestamp=ts), + ) + + ts = OPT_START + window_duration * 4 + timedelta(hours=1) + await tracker.record( + make_cost_record(agent_id="alice", cost_usd=4.0, timestamp=ts), + ) + + result = await optimizer.detect_anomalies(start=OPT_START, end=OPT_END) + assert len(result.anomalies) == 1 + assert result.anomalies[0].severity == AnomalySeverity.HIGH + + async def test_window_count_upper_bound(self) -> None: + """window_count > 1000 raises ValueError.""" + optimizer, _ = make_optimizer() + with pytest.raises(ValueError, match="window_count must be <= 1000"): + await optimizer.detect_anomalies( + start=OPT_START, + end=OPT_END, + window_count=1001, + ) + + +# ── Efficiency Analysis Tests ───────────────────────────────────── + + +@pytest.mark.unit +class TestAnalyzeEfficiency: + async def test_uniform_all_normal(self) -> None: + optimizer, tracker = make_optimizer() + + for agent in ("alice", "bob", "carol"): + await tracker.record( + make_cost_record( + agent_id=agent, + cost_usd=1.0, + input_tokens=1000, + output_tokens=0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + + result = await optimizer.analyze_efficiency(start=OPT_START, end=OPT_END) + assert all( + a.efficiency_rating == EfficiencyRating.NORMAL for a in result.agents + ) + assert result.inefficient_agent_count == 0 + + async def test_one_inefficient(self) -> None: + optimizer, tracker = make_optimizer() + + await tracker.record( + make_cost_record( + agent_id="alice", + cost_usd=1.0, + input_tokens=1000, + output_tokens=0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + await tracker.record( + make_cost_record( + agent_id="bob", + cost_usd=10.0, + input_tokens=1000, + output_tokens=0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + + result = await optimizer.analyze_efficiency(start=OPT_START, end=OPT_END) + assert result.inefficient_agent_count == 1 + assert result.agents[0].agent_id == "bob" + assert result.agents[0].efficiency_rating == EfficiencyRating.INEFFICIENT + + async def test_zero_tokens_handled(self) -> None: + optimizer, tracker = make_optimizer() + + await tracker.record( + make_cost_record( + agent_id="alice", + cost_usd=0.0, + input_tokens=0, + output_tokens=0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + + result = await optimizer.analyze_efficiency(start=OPT_START, end=OPT_END) + assert len(result.agents) == 1 + assert result.agents[0].cost_per_1k_tokens == 0.0 + assert result.agents[0].efficiency_rating == EfficiencyRating.NORMAL + + async def test_efficient_agent_flagged(self) -> None: + optimizer, tracker = make_optimizer() + + await tracker.record( + make_cost_record( + agent_id="alice", + cost_usd=0.1, + input_tokens=10000, + output_tokens=0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + await tracker.record( + make_cost_record( + agent_id="bob", + cost_usd=1.0, + input_tokens=1000, + output_tokens=0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + await tracker.record( + make_cost_record( + agent_id="carol", + cost_usd=1.0, + input_tokens=1000, + output_tokens=0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + + result = await optimizer.analyze_efficiency(start=OPT_START, end=OPT_END) + alice = next(a for a in result.agents if a.agent_id == "alice") + assert alice.efficiency_rating == EfficiencyRating.EFFICIENT + + async def test_empty_records(self) -> None: + optimizer, _ = make_optimizer() + result = await optimizer.analyze_efficiency(start=OPT_START, end=OPT_END) + assert result.agents == () + assert result.global_avg_cost_per_1k == 0.0 diff --git a/tests/unit/budget/test_optimizer_decisions.py b/tests/unit/budget/test_optimizer_decisions.py new file mode 100644 index 0000000000..2c5e7846a3 --- /dev/null +++ b/tests/unit/budget/test_optimizer_decisions.py @@ -0,0 +1,540 @@ +"""Tests for CostOptimizer — downgrades, approval, routing, edge cases.""" + +from datetime import UTC, datetime, timedelta + +import pytest + +from ai_company.budget._optimizer_helpers import _find_cheaper_model +from ai_company.budget.config import ( + AutoDowngradeConfig, + BudgetAlertConfig, + BudgetConfig, +) +from ai_company.budget.enums import BudgetAlertLevel +from ai_company.budget.optimizer import CostOptimizer +from ai_company.budget.optimizer_models import CostOptimizerConfig +from ai_company.budget.tracker import CostTracker +from ai_company.providers.routing.models import ResolvedModel +from tests.unit.budget.conftest import ( + OPT_END, + OPT_START, + make_cost_record, + make_optimizer, + make_resolver, +) + +# ── Downgrade Recommendation Tests ──────────────────────────────── + + +@pytest.mark.unit +class TestRecommendDowngrades: + async def test_no_resolver_empty_result(self) -> None: + optimizer, _ = make_optimizer() + result = await optimizer.recommend_downgrades(start=OPT_START, end=OPT_END) + assert result.recommendations == () + + async def test_with_downgrade_path(self) -> None: + resolver = make_resolver() + bc = BudgetConfig( + total_monthly=100.0, + auto_downgrade=AutoDowngradeConfig( + enabled=True, + threshold=80, + downgrade_map=(("large", "small"),), + ), + ) + tracker = CostTracker(budget_config=bc) + optimizer = CostOptimizer( + cost_tracker=tracker, + budget_config=bc, + model_resolver=resolver, + ) + + await tracker.record( + make_cost_record( + agent_id="alice", + model="test-large-001", + cost_usd=10.0, + input_tokens=1000, + output_tokens=0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + await tracker.record( + make_cost_record( + agent_id="bob", + model="test-small-001", + cost_usd=0.1, + input_tokens=1000, + output_tokens=0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + + result = await optimizer.recommend_downgrades(start=OPT_START, end=OPT_END) + assert len(result.recommendations) == 1 + rec = result.recommendations[0] + assert rec.agent_id == "alice" + assert rec.current_model == "test-large-001" + assert rec.recommended_model == "test-small-001" + assert rec.estimated_savings_per_1k > 0 + + async def test_no_cheaper_model_empty(self) -> None: + """No recommendation when agent already uses cheapest model.""" + resolver = make_resolver( + [ + ResolvedModel( + provider_name="test-provider", + model_id="test-only-001", + alias="only", + cost_per_1k_input=0.01, + cost_per_1k_output=0.02, + ), + ] + ) + bc = BudgetConfig(total_monthly=100.0) + tracker = CostTracker(budget_config=bc) + optimizer = CostOptimizer( + cost_tracker=tracker, + budget_config=bc, + model_resolver=resolver, + ) + + await tracker.record( + make_cost_record( + agent_id="alice", + model="test-only-001", + cost_usd=10.0, + input_tokens=1000, + output_tokens=0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + + result = await optimizer.recommend_downgrades(start=OPT_START, end=OPT_END) + assert result.recommendations == () + + +# ── Evaluate Operation Tests ────────────────────────────────────── + + +@pytest.mark.unit +class TestEvaluateOperation: + async def test_healthy_budget_approved(self) -> None: + optimizer, tracker = make_optimizer() + await tracker.record( + make_cost_record(cost_usd=10.0, timestamp=OPT_START + timedelta(hours=1)), + ) + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=0.5, + now=OPT_START + timedelta(days=15), + ) + assert decision.approved is True + assert decision.alert_level == BudgetAlertLevel.NORMAL + + async def test_hard_stop_denied(self) -> None: + bc = BudgetConfig( + total_monthly=100.0, + alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), + ) + optimizer, tracker = make_optimizer(budget_config=bc) + + await tracker.record( + make_cost_record(cost_usd=100.0, timestamp=OPT_START + timedelta(hours=1)), + ) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=1.0, + now=OPT_START + timedelta(days=15), + ) + assert decision.approved is False + assert decision.alert_level == BudgetAlertLevel.HARD_STOP + + async def test_would_exceed_budget_denied(self) -> None: + bc = BudgetConfig( + total_monthly=100.0, + alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), + ) + optimizer, tracker = make_optimizer(budget_config=bc) + + # Spend 95% and request 10 more → projected 105% → HARD_STOP + await tracker.record( + make_cost_record(cost_usd=95.0, timestamp=OPT_START + timedelta(hours=1)), + ) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=10.0, + now=OPT_START + timedelta(days=15), + ) + assert decision.approved is False + # With projected alert level, this now triggers auto-deny + assert "denied" in decision.reason.lower() + + async def test_warning_level_approved_with_conditions(self) -> None: + bc = BudgetConfig( + total_monthly=100.0, + alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), + ) + optimizer, tracker = make_optimizer(budget_config=bc) + + # Spend 80% (warning level) + await tracker.record( + make_cost_record(cost_usd=80.0, timestamp=OPT_START + timedelta(hours=1)), + ) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=2.0, + now=OPT_START + timedelta(days=15), + ) + assert decision.approved is True + assert decision.alert_level == BudgetAlertLevel.WARNING + assert len(decision.conditions) > 0 + + async def test_budget_enforcement_disabled(self) -> None: + bc = BudgetConfig(total_monthly=0.0) + optimizer, _ = make_optimizer(budget_config=bc) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=100.0, + ) + assert decision.approved is True + assert "disabled" in decision.reason.lower() + + async def test_critical_level_auto_deny_with_custom_config(self) -> None: + """Auto-deny at CRITICAL when configured.""" + bc = BudgetConfig( + total_monthly=100.0, + alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), + ) + config = CostOptimizerConfig( + approval_auto_deny_alert_level=BudgetAlertLevel.CRITICAL, + ) + optimizer, tracker = make_optimizer(budget_config=bc, config=config) + + # Spend 92% (critical level) + await tracker.record( + make_cost_record(cost_usd=92.0, timestamp=OPT_START + timedelta(hours=1)), + ) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=0.01, + now=OPT_START + timedelta(days=15), + ) + assert decision.approved is False + assert decision.alert_level == BudgetAlertLevel.CRITICAL + + async def test_high_cost_condition(self) -> None: + """High-cost warning condition when estimated cost >= threshold.""" + config = CostOptimizerConfig(approval_warn_threshold_usd=0.5) + optimizer, _ = make_optimizer(config=config) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=1.0, + now=OPT_START + timedelta(days=15), + ) + assert decision.approved is True + assert any("High-cost" in c for c in decision.conditions) + + async def test_negative_estimated_cost_rejected(self) -> None: + """Negative estimated_cost_usd raises ValueError.""" + optimizer, _ = make_optimizer() + with pytest.raises(ValueError, match="estimated_cost_usd must be >= 0"): + await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=-1.0, + ) + + async def test_projected_alert_level_used_for_auto_deny(self) -> None: + """Auto-deny uses projected alert level, not current.""" + bc = BudgetConfig( + total_monthly=100.0, + alerts=BudgetAlertConfig(warn_at=75, critical_at=90, hard_stop_at=100), + ) + config = CostOptimizerConfig( + approval_auto_deny_alert_level=BudgetAlertLevel.HARD_STOP, + ) + optimizer, tracker = make_optimizer(budget_config=bc, config=config) + + # Spend 95% — current alert is CRITICAL, but requesting 10 + # would push to 105% → projected HARD_STOP → denied + await tracker.record( + make_cost_record(cost_usd=95.0, timestamp=OPT_START + timedelta(hours=1)), + ) + + decision = await optimizer.evaluate_operation( + agent_id="alice", + estimated_cost_usd=10.0, + now=OPT_START + timedelta(days=15), + ) + assert decision.approved is False + assert "projected" in decision.reason.lower() + + +# ── Routing Optimization Tests ────────────────────────────────── + + +@pytest.mark.unit +class TestSuggestRoutingOptimizations: + async def test_no_resolver_empty_result(self) -> None: + optimizer, _ = make_optimizer() + result = await optimizer.suggest_routing_optimizations( + start=OPT_START, + end=OPT_END, + ) + assert result.suggestions == () + assert result.agents_analyzed == 0 + + async def test_no_records_empty_suggestions(self) -> None: + resolver = make_resolver() + optimizer, _ = make_optimizer(model_resolver=resolver) + result = await optimizer.suggest_routing_optimizations( + start=OPT_START, + end=OPT_END, + ) + assert result.suggestions == () + assert result.agents_analyzed == 0 + + async def test_suggests_cheaper_model(self) -> None: + resolver = make_resolver() + optimizer, tracker = make_optimizer(model_resolver=resolver) + + await tracker.record( + make_cost_record( + agent_id="alice", + model="test-large-001", + cost_usd=5.0, + input_tokens=1000, + output_tokens=500, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + + result = await optimizer.suggest_routing_optimizations( + start=OPT_START, + end=OPT_END, + ) + assert len(result.suggestions) == 1 + suggestion = result.suggestions[0] + assert suggestion.agent_id == "alice" + assert suggestion.current_model == "test-large-001" + assert suggestion.estimated_savings_per_1k > 0 + assert result.total_estimated_savings_per_1k > 0 + + async def test_no_suggestion_for_cheapest_model(self) -> None: + resolver = make_resolver() + optimizer, tracker = make_optimizer(model_resolver=resolver) + + await tracker.record( + make_cost_record( + agent_id="alice", + model="test-small-001", + cost_usd=0.1, + input_tokens=1000, + output_tokens=500, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + + result = await optimizer.suggest_routing_optimizations( + start=OPT_START, + end=OPT_END, + ) + assert result.suggestions == () + assert result.agents_analyzed == 1 + + async def test_start_after_end_rejected(self) -> None: + optimizer, _ = make_optimizer() + with pytest.raises(ValueError, match=r"start .* must be before end"): + await optimizer.suggest_routing_optimizations(start=OPT_END, end=OPT_START) + + async def test_context_window_respected(self) -> None: + """Suggestions only include models with sufficient context window.""" + models = [ + ResolvedModel( + provider_name="test-provider", + model_id="test-large-001", + alias="large", + cost_per_1k_input=0.03, + cost_per_1k_output=0.06, + max_context=200000, + ), + ResolvedModel( + provider_name="test-provider", + model_id="test-small-001", + alias="small", + cost_per_1k_input=0.001, + cost_per_1k_output=0.002, + max_context=50000, # Smaller context than large + ), + ] + resolver = make_resolver(models) + optimizer, tracker = make_optimizer(model_resolver=resolver) + + await tracker.record( + make_cost_record( + agent_id="alice", + model="test-large-001", + cost_usd=5.0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + + result = await optimizer.suggest_routing_optimizations( + start=OPT_START, + end=OPT_END, + ) + # small has insufficient context window → no suggestion + assert result.suggestions == () + + +# ── Edge Case Tests ────────────────────────────────────────────── + + +@pytest.mark.unit +class TestEdgeCases: + async def test_find_cheaper_model_exercises_fallback_path(self) -> None: + """_find_cheaper_model selects the cheapest with sufficient context.""" + resolver = make_resolver() + # Directly call _find_cheaper_model to verify it picks the cheapest + result = _find_cheaper_model(0.09, resolver) + assert result is not None + assert result.model_id == "test-small-001" + + async def test_find_cheaper_model_respects_min_context(self) -> None: + """_find_cheaper_model skips models with insufficient context.""" + models = [ + ResolvedModel( + provider_name="test-provider", + model_id="test-large-001", + alias="large", + cost_per_1k_input=0.03, + cost_per_1k_output=0.06, + max_context=200000, + ), + ResolvedModel( + provider_name="test-provider", + model_id="test-small-001", + alias="small", + cost_per_1k_input=0.001, + cost_per_1k_output=0.002, + max_context=50000, + ), + ] + resolver = make_resolver(models) + # Require 200k context — small model has only 50k + result = _find_cheaper_model(0.09, resolver, min_context=200000) + assert result is None + + async def test_budget_pressure_percent_reflects_spending(self) -> None: + """budget_pressure_percent reflects actual spend vs budget.""" + from ai_company.budget.billing import billing_period_start + + resolver = make_resolver() + bc = BudgetConfig(total_monthly=100.0) + tracker = CostTracker(budget_config=bc) + optimizer = CostOptimizer( + cost_tracker=tracker, + budget_config=bc, + model_resolver=resolver, + ) + now = datetime.now(UTC) + period_start = billing_period_start(bc.reset_day, now=now) + await tracker.record( + make_cost_record( + cost_usd=60.0, + timestamp=period_start + timedelta(hours=1), + ), + ) + analysis_start = period_start + analysis_end = now + timedelta(days=1) + result = await optimizer.recommend_downgrades( + start=analysis_start, end=analysis_end + ) + assert result.budget_pressure_percent == 60.0 + + async def test_downgrade_target_not_resolved(self) -> None: + """No recommendation when downgrade target doesn't resolve.""" + resolver = make_resolver( + [ + ResolvedModel( + provider_name="test-provider", + model_id="test-large-001", + alias="large", + cost_per_1k_input=0.03, + cost_per_1k_output=0.06, + ), + ] + ) + bc = BudgetConfig( + total_monthly=100.0, + auto_downgrade=AutoDowngradeConfig( + enabled=True, + threshold=80, + downgrade_map=(("large", "nonexistent"),), + ), + ) + tracker = CostTracker(budget_config=bc) + optimizer = CostOptimizer( + cost_tracker=tracker, + budget_config=bc, + model_resolver=resolver, + ) + + await tracker.record( + make_cost_record( + agent_id="alice", + model="test-large-001", + cost_usd=10.0, + input_tokens=1000, + output_tokens=0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + await tracker.record( + make_cost_record( + agent_id="bob", + model="test-large-001", + cost_usd=0.1, + input_tokens=1000, + output_tokens=0, + timestamp=OPT_START + timedelta(hours=1), + ), + ) + + result = await optimizer.recommend_downgrades(start=OPT_START, end=OPT_END) + # Target "nonexistent" can't be resolved → no recommendation + assert result.recommendations == () + + async def test_no_resolver_returns_real_budget_pressure(self) -> None: + """recommend_downgrades without resolver still reports real pressure.""" + from ai_company.budget.billing import billing_period_start + + bc = BudgetConfig(total_monthly=100.0) + tracker = CostTracker(budget_config=bc) + optimizer = CostOptimizer( + cost_tracker=tracker, + budget_config=bc, + model_resolver=None, + ) + now = datetime.now(UTC) + period_start = billing_period_start(bc.reset_day, now=now) + await tracker.record( + make_cost_record( + cost_usd=40.0, + timestamp=period_start + timedelta(hours=1), + ), + ) + analysis_start = period_start + analysis_end = now + timedelta(days=1) + result = await optimizer.recommend_downgrades( + start=analysis_start, end=analysis_end + ) + assert result.recommendations == () + assert result.budget_pressure_percent == 40.0