diff --git a/e2e/profiles/routing-strategies/profile.go b/e2e/profiles/routing-strategies/profile.go index 3c418c98ef..342ac0c81b 100644 --- a/e2e/profiles/routing-strategies/profile.go +++ b/e2e/profiles/routing-strategies/profile.go @@ -127,6 +127,7 @@ func (p *Profile) Teardown(ctx context.Context, opts *framework.TeardownOptions) func (p *Profile) GetTestCases() []string { return []string{ "keyword-routing", + "entropy-routing", "routing-fallback", // Test sequential fallback: Keyword → Embedding → BERT → MCP // MCP tests are registered but not run by default // To run MCP tests, use: E2E_TESTS="mcp-stdio-classification,mcp-http-classification,..." diff --git a/e2e/testcases/entropy_routing.go b/e2e/testcases/entropy_routing.go new file mode 100644 index 0000000000..b6c3395602 --- /dev/null +++ b/e2e/testcases/entropy_routing.go @@ -0,0 +1,415 @@ +package testcases + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "strconv" + "strings" + "time" + + pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases" + "k8s.io/client-go/kubernetes" +) + +func init() { + pkgtestcases.Register("entropy-routing", pkgtestcases.TestCase{ + Description: "Test entropy-based routing decisions and verify uncertainty-aware model selection", + Tags: []string{"ai-gateway", "routing", "entropy", "uncertainty"}, + Fn: testEntropyRouting, + }) +} + +// EntropyRoutingCase is one entry of the JSON test data: a query plus the routing headers expected for it. +type EntropyRoutingCase struct { + Name string `json:"name"` + Description string `json:"description"` + Query string `json:"query"` + ExpectedUncertaintyLevel string `json:"expected_uncertainty_level"` + ExpectedUseReasoning bool `json:"expected_use_reasoning"` + MinConfidence float64 `json:"min_confidence"` + MaxConfidence float64 `json:"max_confidence"` + ExpectedTopCategory string `json:"expected_top_category,omitempty"` +} + +// EntropyRoutingResult records the expected and observed values for one case together with the per-check verdicts. +type 
EntropyRoutingResult struct { + Name string + Query string + ExpectedUncertainty string + ActualUncertainty string + ExpectedReasoning bool + ActualReasoning bool + ExpectedCategory string + ActualCategory string + ActualConfidence float64 + MinConfidence float64 + MaxConfidence float64 + UncertaintyMatch bool + ReasoningMatch bool + ConfidenceInRange bool + CategoryMatch bool + Error string +} + +// testEntropyRouting runs every case from the entropy routing test data against the service +// reached through setupServiceConnection and compares the x-vsr-* response headers with the +// expectations. It fails when the test data is empty or when no case yields the expected reasoning decision. +func testEntropyRouting(ctx context.Context, client *kubernetes.Clientset, opts pkgtestcases.TestCaseOptions) error { + if opts.Verbose { + fmt.Println("[Test] Testing entropy-based routing decisions") + } + + // Setup service connection and get local port + localPort, stopPortForward, err := setupServiceConnection(ctx, client, opts) + if err != nil { + return err + } + defer stopPortForward() // Ensure port forwarding is stopped when test completes + + // Load test cases from JSON file + testCases, err := loadEntropyRoutingCases("e2e/testcases/testdata/entropy_routing_cases.json") + if err != nil { + return fmt.Errorf("failed to load test cases: %w", err) + } + // Guard against an empty data file: the accuracy divisions below would otherwise yield NaN. + if len(testCases) == 0 { + return fmt.Errorf("no entropy routing test cases loaded") + } + + // Run entropy routing tests + results := make([]EntropyRoutingResult, 0, len(testCases)) + totalTests := 0 + uncertaintyMatches := 0 + reasoningMatches := 0 + confidenceMatches := 0 + categoryMatches := 0 + + for _, testCase := range testCases { + totalTests++ + result := testSingleEntropyRouting(ctx, testCase, localPort, opts.Verbose) + results = append(results, result) + + if result.UncertaintyMatch { + uncertaintyMatches++ + } + if result.ReasoningMatch { + reasoningMatches++ + } + if result.ConfidenceInRange { + confidenceMatches++ + } + // CategoryMatch is already true for successful cases without an expected category; also + // accepting an empty ExpectedCategory here would count failed requests as category matches. + if result.CategoryMatch { + categoryMatches++ + } + } + + // Calculate accuracy + uncertaintyAccuracy := float64(uncertaintyMatches) / float64(totalTests) * 100 + reasoningAccuracy := float64(reasoningMatches) / float64(totalTests) * 100 + confidenceAccuracy := float64(confidenceMatches) / float64(totalTests) * 100 + categoryAccuracy := float64(categoryMatches) / 
float64(totalTests) * 100 + + // Set details for reporting + if opts.SetDetails != nil { + opts.SetDetails(map[string]interface{}{ + "total_tests": totalTests, + "uncertainty_matches": uncertaintyMatches, + "reasoning_matches": reasoningMatches, + "confidence_matches": confidenceMatches, + "category_matches": categoryMatches, + "uncertainty_accuracy": fmt.Sprintf("%.2f%%", uncertaintyAccuracy), + "reasoning_accuracy": fmt.Sprintf("%.2f%%", reasoningAccuracy), + "confidence_accuracy": fmt.Sprintf("%.2f%%", confidenceAccuracy), + "category_accuracy": fmt.Sprintf("%.2f%%", categoryAccuracy), + }) + } + + // Print results + printEntropyRoutingResults(results, totalTests, uncertaintyMatches, reasoningMatches, + confidenceMatches, categoryMatches, uncertaintyAccuracy, reasoningAccuracy, + confidenceAccuracy, categoryAccuracy) + + if opts.Verbose { + fmt.Printf("[Test] Entropy routing test completed:\n") + fmt.Printf(" Uncertainty: %d/%d (%.2f%%)\n", uncertaintyMatches, totalTests, uncertaintyAccuracy) + fmt.Printf(" Reasoning: %d/%d (%.2f%%)\n", reasoningMatches, totalTests, reasoningAccuracy) + fmt.Printf(" Confidence: %d/%d (%.2f%%)\n", confidenceMatches, totalTests, confidenceAccuracy) + fmt.Printf(" Category: %d/%d (%.2f%%)\n", categoryMatches, totalTests, categoryAccuracy) + } + + // Return error if any critical metric is 0% + if reasoningMatches == 0 { + return fmt.Errorf("entropy routing test failed: 0%% reasoning accuracy (0/%d correct)", totalTests) + } + + return nil +} + +// loadEntropyRoutingCases reads the JSON file at filepath and decodes it into a slice of +// EntropyRoutingCase values. +func loadEntropyRoutingCases(filepath string) ([]EntropyRoutingCase, error) { + data, err := os.ReadFile(filepath) + if err != nil { + return nil, fmt.Errorf("failed to read test cases file: %w", err) + } + + var cases []EntropyRoutingCase + if err := json.Unmarshal(data, &cases); err != nil { + return nil, fmt.Errorf("failed to parse test cases: %w", err) + } + + return cases, nil +} + +// testSingleEntropyRouting posts testCase.Query to the chat completions endpoint on localPort and +// fills the result from the x-vsr-* response headers. Marshal, transport and non-200 failures are +// reported through result.Error with every match flag left false. +func testSingleEntropyRouting(ctx context.Context, testCase EntropyRoutingCase, localPort string, verbose 
bool) EntropyRoutingResult { + result := EntropyRoutingResult{ + Name: testCase.Name, + Query: testCase.Query, + ExpectedUncertainty: testCase.ExpectedUncertaintyLevel, + ExpectedReasoning: testCase.ExpectedUseReasoning, + ExpectedCategory: testCase.ExpectedTopCategory, + MinConfidence: testCase.MinConfidence, + MaxConfidence: testCase.MaxConfidence, + } + + // Create chat completion request with MoM model to trigger decision engine + requestBody := map[string]interface{}{ + "model": "MoM", + "messages": []map[string]string{ + {"role": "user", "content": testCase.Query}, + }, + } + + jsonData, err := json.Marshal(requestBody) + if err != nil { + result.Error = fmt.Sprintf("failed to marshal request: %v", err) + return result + } + + // Send request + url := fmt.Sprintf("http://localhost:%s/v1/chat/completions", localPort) + req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonData)) + if err != nil { + result.Error = fmt.Sprintf("failed to create request: %v", err) + return result + } + req.Header.Set("Content-Type", "application/json") + + httpClient := &http.Client{Timeout: 30 * time.Second} + resp, err := httpClient.Do(req) + if err != nil { + result.Error = fmt.Sprintf("failed to send request: %v", err) + return result + } + defer resp.Body.Close() + + // Check response status + if resp.StatusCode != http.StatusOK { + bodyBytes, _ := io.ReadAll(resp.Body) // best effort: the body is only used for diagnostics + + // Log detailed error information including headers + var errorMsg strings.Builder + fmt.Fprintf(&errorMsg, "Unexpected status code: %d\n", resp.StatusCode) + fmt.Fprintf(&errorMsg, "Response body: %s\n", string(bodyBytes)) + errorMsg.WriteString("Response headers:\n") + errorMsg.WriteString(formatResponseHeaders(resp.Header)) + + result.Error = errorMsg.String() + + // Print detailed error to console for debugging + if verbose { + fmt.Printf("[Test] ✗ HTTP %d Error for test case: %s\n", resp.StatusCode, testCase.Name) + fmt.Printf(" Query: %s\n", 
testCase.Query) + fmt.Printf(" Response Headers:\n%s", formatResponseHeaders(resp.Header)) + fmt.Printf(" Response Body: %s\n", string(bodyBytes)) + } + + return result + } + + // Extract entropy-related headers + result.ActualUncertainty = resp.Header.Get("x-vsr-uncertainty-level") + result.ActualCategory = resp.Header.Get("x-vsr-selected-category") + + // Parse reasoning flag + reasoningHeader := resp.Header.Get("x-vsr-selected-reasoning") + result.ActualReasoning = (reasoningHeader == "true" || reasoningHeader == "True" || reasoningHeader == "1") + + // Parse confidence; a missing or unparsable header leaves ActualConfidence at 0 + confidenceHeader := resp.Header.Get("x-vsr-confidence") + if confidenceHeader != "" { + if conf, err := strconv.ParseFloat(confidenceHeader, 64); err == nil { + result.ActualConfidence = conf + } + } + + // Check if uncertainty level matches + result.UncertaintyMatch = (result.ActualUncertainty == testCase.ExpectedUncertaintyLevel) + + // Check if reasoning decision matches + result.ReasoningMatch = (result.ActualReasoning == testCase.ExpectedUseReasoning) + + // Check if confidence is in expected range + result.ConfidenceInRange = (result.ActualConfidence >= testCase.MinConfidence && + result.ActualConfidence <= testCase.MaxConfidence) + + // Check if category matches (if expected category is specified) + if testCase.ExpectedTopCategory != "" { + result.CategoryMatch = (result.ActualCategory == testCase.ExpectedTopCategory) + } else { + result.CategoryMatch = true // Skip category check if not specified + } + + if verbose && (!result.UncertaintyMatch || !result.ReasoningMatch || !result.ConfidenceInRange || !result.CategoryMatch) { + fmt.Printf("[Test] Test case failed: %s\n", testCase.Name) + if !result.UncertaintyMatch { + fmt.Printf(" Uncertainty mismatch: expected=%s, actual=%s\n", + testCase.ExpectedUncertaintyLevel, result.ActualUncertainty) + } + if !result.ReasoningMatch { + fmt.Printf(" Reasoning mismatch: expected=%v, actual=%v\n", + testCase.ExpectedUseReasoning, result.ActualReasoning) + } + if 
!result.ConfidenceInRange { + fmt.Printf(" Confidence out of range: expected [%.2f, %.2f], actual=%.3f\n", + testCase.MinConfidence, testCase.MaxConfidence, result.ActualConfidence) + } + if !result.CategoryMatch && testCase.ExpectedTopCategory != "" { + fmt.Printf(" Category mismatch: expected=%s, actual=%s\n", + testCase.ExpectedTopCategory, result.ActualCategory) + } + } + + return result +} + +// printEntropyRoutingResults writes a summary of the run to stdout, followed by the individual +// uncertainty, reasoning, confidence and category mismatches and any request errors. +func printEntropyRoutingResults(results []EntropyRoutingResult, totalTests, uncertaintyMatches, + reasoningMatches, confidenceMatches, categoryMatches int, + uncertaintyAccuracy, reasoningAccuracy, confidenceAccuracy, categoryAccuracy float64) { + + separator := "================================================================================" + fmt.Println("\n" + separator) + fmt.Println("ENTROPY-BASED ROUTING TEST RESULTS") + fmt.Println(separator) + fmt.Printf("Total Tests: %d\n", totalTests) + fmt.Printf("Uncertainty Level Matches: %d (%.2f%%)\n", uncertaintyMatches, uncertaintyAccuracy) + fmt.Printf("Reasoning Decision Matches: %d (%.2f%%)\n", reasoningMatches, reasoningAccuracy) + fmt.Printf("Confidence Range Matches: %d (%.2f%%)\n", confidenceMatches, confidenceAccuracy) + fmt.Printf("Category Matches: %d (%.2f%%)\n", categoryMatches, categoryAccuracy) + fmt.Println(separator) + + // Print failed uncertainty matches + uncertaintyFailures := 0 + for _, result := range results { + if !result.UncertaintyMatch && result.Error == "" { + uncertaintyFailures++ + } + } + + if uncertaintyFailures > 0 { + fmt.Println("\nFailed Uncertainty Matches:") + for _, result := range results { + if !result.UncertaintyMatch && result.Error == "" { + fmt.Printf(" - Test: %s\n", result.Name) + fmt.Printf(" Query: %s\n", result.Query) + fmt.Printf(" Expected Uncertainty: %s\n", result.ExpectedUncertainty) + fmt.Printf(" Actual Uncertainty: %s\n", result.ActualUncertainty) + } + } + } + + // Print failed reasoning matches + reasoningFailures := 0 + for _, result := range results { + if 
!result.ReasoningMatch && result.Error == "" { + reasoningFailures++ + } + } + + if reasoningFailures > 0 { + fmt.Println("\nFailed Reasoning Matches:") + for _, result := range results { + if !result.ReasoningMatch && result.Error == "" { + fmt.Printf(" - Test: %s\n", result.Name) + fmt.Printf(" Query: %s\n", result.Query) + fmt.Printf(" Expected Reasoning: %v\n", result.ExpectedReasoning) + fmt.Printf(" Actual Reasoning: %v\n", result.ActualReasoning) + fmt.Printf(" Uncertainty Level: %s\n", result.ActualUncertainty) + fmt.Printf(" Confidence: %.3f\n", result.ActualConfidence) + } + } + } + + // Print failed confidence ranges + confidenceFailures := 0 + for _, result := range results { + if !result.ConfidenceInRange && result.Error == "" { + confidenceFailures++ + } + } + + if confidenceFailures > 0 { + fmt.Println("\nFailed Confidence Ranges:") + for _, result := range results { + if !result.ConfidenceInRange && result.Error == "" { + fmt.Printf(" - Test: %s\n", result.Name) + fmt.Printf(" Query: %s\n", result.Query) + fmt.Printf(" Expected Range: [%.2f, %.2f]\n", result.MinConfidence, result.MaxConfidence) + fmt.Printf(" Actual Confidence: %.3f\n", result.ActualConfidence) + } + } + } + + // Print failed category matches + categoryFailures := 0 + for _, result := range results { + if !result.CategoryMatch && result.Error == "" { + categoryFailures++ + } + } + + if categoryFailures > 0 { + fmt.Println("\nFailed Category Matches:") + for _, result := range results { + if !result.CategoryMatch && result.Error == "" { + fmt.Printf(" - Test: %s\n", result.Name) + fmt.Printf(" Query: %s\n", result.Query) + fmt.Printf(" Expected Category: %s\n", result.ExpectedCategory) + fmt.Printf(" Actual Category: %s\n", result.ActualCategory) + } + } + } + + // Print errors + errorCount := 0 + for _, result := range results { + if result.Error != "" { + errorCount++ + } + } + + if errorCount > 0 { + fmt.Println("\nErrors:") + for _, result := range results { + if result.Error != "" { + fmt.Printf(" - Test: %s\n", result.Name) + fmt.Printf(" Query: %s\n", result.Query) + fmt.Printf(" Error: %s\n", result.Error) + } + } + } + + fmt.Println(separator + "\n") +} diff --git a/e2e/testcases/testdata/entropy_routing_cases.json b/e2e/testcases/testdata/entropy_routing_cases.json new file mode 100644 index 0000000000..9b8d8f1391 --- /dev/null +++ b/e2e/testcases/testdata/entropy_routing_cases.json @@ -0,0 +1,252 @@ +[ + { + "name": "very_high_entropy_uniform_distribution", + "description": "Test very high entropy with uniform probability
distribution", + "query": "What is the best approach to analyze this data?", + "expected_uncertainty_level": "very_high", + "expected_use_reasoning": true, + "min_confidence": 0.25, + "max_confidence": 0.35 + }, + { + "name": "very_low_entropy_math_confident", + "description": "Test very low entropy with very confident math classification", + "query": "Solve the equation 2x + 5 = 15 for x", + "expected_uncertainty_level": "very_low", + "expected_use_reasoning": true, + "min_confidence": 0.85, + "max_confidence": 1.0, + "expected_top_category": "math" + }, + { + "name": "very_low_entropy_physics_confident", + "description": "Test very low entropy with very confident physics classification", + "query": "Calculate the force required to accelerate a 10kg mass at 5m/s²", + "expected_uncertainty_level": "very_low", + "expected_use_reasoning": true, + "min_confidence": 0.85, + "max_confidence": 1.0, + "expected_top_category": "physics" + }, + { + "name": "very_low_entropy_code_confident", + "description": "Test very low entropy with very confident code classification", + "query": "Write a Python function to reverse a string", + "expected_uncertainty_level": "very_low", + "expected_use_reasoning": true, + "min_confidence": 0.85, + "max_confidence": 1.0, + "expected_top_category": "code" + }, + { + "name": "very_low_entropy_biology_confident", + "description": "Test very low entropy with very confident biology classification", + "query": "What is the process of photosynthesis in plants?", + "expected_uncertainty_level": "very_low", + "expected_use_reasoning": false, + "min_confidence": 0.85, + "max_confidence": 1.0, + "expected_top_category": "biology" + }, + { + "name": "low_entropy_math_strong", + "description": "Test low entropy with strong math classification", + "query": "Find the derivative of x³ + 2x² - 5x + 7", + "expected_uncertainty_level": "low", + "expected_use_reasoning": true, + "min_confidence": 0.70, + "max_confidence": 0.90, + "expected_top_category": 
"math" + }, + { + "name": "low_entropy_code_strong", + "description": "Test low entropy with strong code classification", + "query": "Implement a binary search algorithm in Java", + "expected_uncertainty_level": "low", + "expected_use_reasoning": true, + "min_confidence": 0.70, + "max_confidence": 0.90, + "expected_top_category": "code" + }, + { + "name": "medium_entropy_math_above_threshold", + "description": "Test medium entropy with math above confidence threshold", + "query": "What is the integral of sin(x) with respect to x?", + "expected_uncertainty_level": "medium", + "expected_use_reasoning": true, + "min_confidence": 0.55, + "max_confidence": 0.75, + "expected_top_category": "math" + }, + { + "name": "medium_entropy_physics_above_threshold", + "description": "Test medium entropy with physics above confidence threshold", + "query": "Explain Newton's third law of motion", + "expected_uncertainty_level": "medium", + "expected_use_reasoning": true, + "min_confidence": 0.55, + "max_confidence": 0.75, + "expected_top_category": "physics" + }, + { + "name": "medium_entropy_biology_above_threshold", + "description": "Test medium entropy with biology above confidence threshold", + "query": "Describe the structure of DNA", + "expected_uncertainty_level": "medium", + "expected_use_reasoning": false, + "min_confidence": 0.55, + "max_confidence": 0.75, + "expected_top_category": "biology" + }, + { + "name": "high_entropy_math_physics_weighted", + "description": "Test high entropy with math and physics competing (both reasoning)", + "query": "Calculate the trajectory of a projectile launched at 45 degrees", + "expected_uncertainty_level": "high", + "expected_use_reasoning": true, + "min_confidence": 0.50, + "max_confidence": 0.75 + }, + { + "name": "high_entropy_code_math_weighted", + "description": "Test high entropy with code and math competing (both reasoning)", + "query": "Implement an algorithm to calculate Fibonacci numbers", + "expected_uncertainty_level": 
"high", + "expected_use_reasoning": true, + "min_confidence": 0.50, + "max_confidence": 0.75 + }, + { + "name": "high_entropy_biology_chemistry_weighted", + "description": "Test high entropy with biology and chemistry competing (both non-reasoning)", + "query": "Explain the process of cellular respiration and its chemical reactions", + "expected_uncertainty_level": "high", + "expected_use_reasoning": false, + "min_confidence": 0.50, + "max_confidence": 0.75 + }, + { + "name": "high_entropy_mixed_reasoning_nonreasoning", + "description": "Test high entropy with mixed reasoning and non-reasoning categories", + "query": "How does machine learning work in biological research?", + "expected_uncertainty_level": "high", + "expected_use_reasoning": true, + "min_confidence": 0.45, + "max_confidence": 0.70 + }, + { + "name": "confidence_adjustment_very_low", + "description": "Test confidence adjustment for very low uncertainty (95% multiplier)", + "query": "What is 2 + 2?", + "expected_uncertainty_level": "very_low", + "expected_use_reasoning": true, + "min_confidence": 0.90, + "max_confidence": 1.0, + "expected_top_category": "math" + }, + { + "name": "confidence_adjustment_low", + "description": "Test confidence adjustment for low uncertainty (90% multiplier)", + "query": "Solve for x: 3x - 7 = 11", + "expected_uncertainty_level": "low", + "expected_use_reasoning": true, + "min_confidence": 0.75, + "max_confidence": 0.90, + "expected_top_category": "math" + }, + { + "name": "confidence_adjustment_medium", + "description": "Test confidence adjustment for medium uncertainty (80% multiplier)", + "query": "What is the quadratic formula?", + "expected_uncertainty_level": "medium", + "expected_use_reasoning": true, + "min_confidence": 0.55, + "max_confidence": 0.75, + "expected_top_category": "math" + }, + { + "name": "confidence_adjustment_high", + "description": "Test confidence adjustment for high uncertainty (weighted decision)", + "query": "Analyze the mathematical patterns 
in biological systems", + "expected_uncertainty_level": "high", + "expected_use_reasoning": true, + "min_confidence": 0.50, + "max_confidence": 0.70 + }, + { + "name": "weighted_decision_all_reasoning", + "description": "Test weighted decision when top categories all require reasoning", + "query": "Develop a mathematical model to simulate physical phenomena", + "expected_uncertainty_level": "high", + "expected_use_reasoning": true, + "min_confidence": 0.50, + "max_confidence": 0.75 + }, + { + "name": "weighted_decision_all_nonreasoning", + "description": "Test weighted decision when top categories don't require reasoning", + "query": "Describe the historical and biological context of human evolution", + "expected_uncertainty_level": "high", + "expected_use_reasoning": false, + "min_confidence": 0.50, + "max_confidence": 0.75 + }, + { + "name": "edge_case_complex_interdisciplinary", + "description": "Test edge case with complex interdisciplinary query", + "query": "How do quantum mechanics principles apply to computational algorithms in bioinformatics?", + "expected_uncertainty_level": "very_high", + "expected_use_reasoning": true, + "min_confidence": 0.25, + "max_confidence": 0.40 + }, + { + "name": "fallback_conservative_default", + "description": "Test conservative fallback for ambiguous queries", + "query": "Tell me about it", + "expected_uncertainty_level": "very_high", + "expected_use_reasoning": true, + "min_confidence": 0.25, + "max_confidence": 0.40 + }, + { + "name": "reasoning_physics_kinematics", + "description": "Test reasoning decision for physics kinematics problem", + "query": "A car accelerates from rest to 60 mph in 8 seconds. 
Calculate the acceleration and distance traveled.", + "expected_uncertainty_level": "low", + "expected_use_reasoning": true, + "min_confidence": 0.70, + "max_confidence": 0.90, + "expected_top_category": "physics" + }, + { + "name": "reasoning_code_algorithm", + "description": "Test reasoning decision for algorithmic coding problem", + "query": "Design a data structure to support insert, delete, and getRandom operations in O(1) time", + "expected_uncertainty_level": "very_low", + "expected_use_reasoning": true, + "min_confidence": 0.85, + "max_confidence": 1.0, + "expected_top_category": "code" + }, + { + "name": "nonreasoning_history_facts", + "description": "Test non-reasoning decision for historical facts", + "query": "When did World War II end?", + "expected_uncertainty_level": "very_low", + "expected_use_reasoning": false, + "min_confidence": 0.85, + "max_confidence": 1.0, + "expected_top_category": "history" + }, + { + "name": "nonreasoning_language_grammar", + "description": "Test non-reasoning decision for language/grammar query", + "query": "What is the plural form of 'mouse' in English?", + "expected_uncertainty_level": "very_low", + "expected_use_reasoning": false, + "min_confidence": 0.85, + "max_confidence": 1.0, + "expected_top_category": "language" + } +] diff --git a/src/semantic-router/pkg/utils/entropy/entropy_test.go b/src/semantic-router/pkg/utils/entropy/entropy_test.go index 2d7c2e1b72..6dc561dc0f 100644 --- a/src/semantic-router/pkg/utils/entropy/entropy_test.go +++ b/src/semantic-router/pkg/utils/entropy/entropy_test.go @@ -2,6 +2,7 @@ package entropy import ( "math" + "strings" "testing" ) @@ -352,3 +353,420 @@ func TestEntropyMetricsIntegration(t *testing.T) { }) } } + +// TestEntropyEdgeCases checks CalculateEntropy and CalculateNormalizedEntropy on degenerate and uniform inputs (tolerance 0.01) +func TestEntropyEdgeCases(t *testing.T) { + tests := []struct { + name string + probabilities []float32 + expectedEntropy float64 + expectedNormalized float64 + description string + }{ + { + name: "Empty 
probability array", + probabilities: []float32{}, + expectedEntropy: 0.0, + expectedNormalized: 0.0, + description: "Should return 0 entropy for empty array", + }, + { + name: "Single probability", + probabilities: []float32{1.0}, + expectedEntropy: 0.0, + expectedNormalized: 0.0, + description: "Should return 0 normalized entropy for single probability", + }, + { + name: "All zeros", + probabilities: []float32{0.0, 0.0, 0.0, 0.0}, + expectedEntropy: 0.0, + expectedNormalized: 0.0, + description: "Should handle all zeros gracefully", + }, + { + name: "Very small probabilities", + probabilities: []float32{0.9999, 0.0001, 0.0, 0.0}, + expectedEntropy: 0.0014, + expectedNormalized: 0.0007, + description: "Should handle very small probabilities", + }, + { + name: "Perfect uniform distribution (2 classes)", + probabilities: []float32{0.5, 0.5}, + expectedEntropy: 1.0, + expectedNormalized: 1.0, + description: "Should return max entropy for 2-class uniform distribution", + }, + { + name: "Perfect uniform distribution (8 classes)", + probabilities: []float32{0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125}, + expectedEntropy: 3.0, + expectedNormalized: 1.0, + description: "Should return max entropy for 8-class uniform distribution", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + entropy := CalculateEntropy(tt.probabilities) + normalized := CalculateNormalizedEntropy(tt.probabilities) + + if math.Abs(entropy-tt.expectedEntropy) > 0.01 { + t.Errorf("%s: Expected entropy %.4f, got %.4f", tt.description, tt.expectedEntropy, entropy) + } + + if math.Abs(normalized-tt.expectedNormalized) > 0.01 { + t.Errorf("%s: Expected normalized entropy %.4f, got %.4f", tt.description, tt.expectedNormalized, normalized) + } + }) + } +} + +// TestEntropyUncertaintyLevels checks that AnalyzeEntropy reports the expected uncertainty level and a normalized entropy inside the expected range for each of the five levels +func TestEntropyUncertaintyLevels(t *testing.T) { + tests := []struct { + name string + probabilities []float32 + expectedUncertaintyLevel 
string + expectedNormalizedRange [2]float64 // [min, max] + description string + }{ + { + name: "Very high uncertainty - uniform", + probabilities: []float32{0.25, 0.25, 0.25, 0.25}, + expectedUncertaintyLevel: "very_high", + expectedNormalizedRange: [2]float64{0.8, 1.0}, + description: "Uniform distribution should have very high uncertainty", + }, + { + name: "Very high uncertainty - near uniform", + probabilities: []float32{0.28, 0.26, 0.24, 0.22}, + expectedUncertaintyLevel: "very_high", + expectedNormalizedRange: [2]float64{0.8, 1.0}, + description: "Near-uniform distribution should have very high uncertainty", + }, + { + name: "High uncertainty - two dominant", + probabilities: []float32{0.45, 0.40, 0.10, 0.05}, + expectedUncertaintyLevel: "high", + expectedNormalizedRange: [2]float64{0.6, 0.8}, + description: "Two competing categories should have high uncertainty", + }, + { + name: "High uncertainty - clear leader", + probabilities: []float32{0.70, 0.15, 0.10, 0.05}, + expectedUncertaintyLevel: "high", + expectedNormalizedRange: [2]float64{0.4, 0.8}, + description: "Clear leader with a spread-out remainder should still have high uncertainty", + }, + { + name: "Medium uncertainty - strong leader", + probabilities: []float32{0.85, 0.08, 0.04, 0.03}, + expectedUncertaintyLevel: "medium", + expectedNormalizedRange: [2]float64{0.2, 0.6}, + description: "Strong leader should have medium uncertainty", + }, + { + name: "Low uncertainty - dominant leader", + probabilities: []float32{0.90, 0.06, 0.02, 0.02}, + expectedUncertaintyLevel: "low", + expectedNormalizedRange: [2]float64{0.2, 0.4}, + description: "Dominant leader with a small remainder should have low uncertainty", + }, + { + name: "Very low uncertainty - dominant", + probabilities: []float32{0.95, 0.03, 0.01, 0.01}, + expectedUncertaintyLevel: "very_low", + expectedNormalizedRange: [2]float64{0.0, 0.2}, + description: "Very dominant category should have very low uncertainty", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := AnalyzeEntropy(tt.probabilities) + + if result.UncertaintyLevel != tt.expectedUncertaintyLevel { + t.Errorf("%s: Expected uncertainty level '%s', got '%s'", + tt.description, tt.expectedUncertaintyLevel, result.UncertaintyLevel) + } + + if 
result.NormalizedEntropy < tt.expectedNormalizedRange[0] || + result.NormalizedEntropy > tt.expectedNormalizedRange[1] { + t.Errorf("%s: Expected normalized entropy in range [%.2f, %.2f], got %.3f", + tt.description, tt.expectedNormalizedRange[0], tt.expectedNormalizedRange[1], + result.NormalizedEntropy) + } + + t.Logf("%s: normalized_entropy=%.3f, certainty=%.3f, level=%s", + tt.description, result.NormalizedEntropy, result.Certainty, result.UncertaintyLevel) + }) + } +} + +// TestReasoningDecisionComprehensive checks UseReasoning, DecisionReason and the confidence range returned by MakeEntropyBasedReasoningDecision across uncertainty levels +func TestReasoningDecisionComprehensive(t *testing.T) { + categoryReasoningMap := map[string]bool{ + "math": true, + "physics": true, + "code": true, + "biology": false, + "history": false, + "language": false, + } + + tests := []struct { + name string + probabilities []float32 + categoryNames []string + threshold float64 + expectedUseReasoning bool + expectedDecisionReason string + expectedConfidenceMin float64 + expectedConfidenceMax float64 + description string + }{ + { + name: "Very high entropy - enable reasoning", + probabilities: []float32{0.25, 0.25, 0.25, 0.25}, + categoryNames: []string{"math", "physics", "code", "biology"}, + threshold: 0.6, + expectedUseReasoning: true, + expectedDecisionReason: "very_high_uncertainty_conservative_default", + expectedConfidenceMin: 0.25, + expectedConfidenceMax: 0.35, + description: "Uniform distribution should enable reasoning conservatively", + }, + { + name: "Very low entropy - trust classification (math)", + probabilities: []float32{0.95, 0.02, 0.02, 0.01}, + categoryNames: []string{"math", "physics", "code", "biology"}, + threshold: 0.6, + expectedUseReasoning: true, + expectedDecisionReason: "very_low_uncertainty_trust_classification", + expectedConfidenceMin: 0.85, + expectedConfidenceMax: 0.95, + description: "Very confident math classification should enable reasoning", + }, + { + name: "Very low entropy - trust classification (biology)", + 
probabilities: []float32{0.95, 0.02, 0.02, 0.01}, + categoryNames: []string{"biology", "history", "language", "math"}, + threshold: 0.6, + expectedUseReasoning: false, + expectedDecisionReason: "very_low_uncertainty_trust_classification", + expectedConfidenceMin: 0.85, + expectedConfidenceMax: 0.95, + description: "Very confident biology classification should not enable reasoning", + }, + { + name: "High uncertainty - weighted decision (both reasoning)", + probabilities: []float32{0.45, 0.40, 0.10, 0.05}, + categoryNames: []string{"math", "physics", "code", "biology"}, + threshold: 0.6, + expectedUseReasoning: true, + expectedDecisionReason: "high_uncertainty_weighted_decision", + expectedConfidenceMin: 0.5, + expectedConfidenceMax: 1.0, + description: "High uncertainty between two reasoning categories should enable reasoning", + }, + { + name: "High uncertainty - weighted decision (both non-reasoning)", + probabilities: []float32{0.45, 0.40, 0.10, 0.05}, + categoryNames: []string{"biology", "history", "language", "math"}, + threshold: 0.6, + expectedUseReasoning: false, + expectedDecisionReason: "high_uncertainty_weighted_decision", + expectedConfidenceMin: 0.5, + expectedConfidenceMax: 1.0, + description: "High uncertainty between two non-reasoning categories should not enable reasoning", + }, + { + name: "High uncertainty - weighted decision (mixed)", + probabilities: []float32{0.45, 0.40, 0.10, 0.05}, + categoryNames: []string{"math", "biology", "history", "language"}, + threshold: 0.6, + expectedUseReasoning: true, // 0.45/(0.45+0.40) = 0.529 > 0.5, so reasoning enabled + expectedDecisionReason: "high_uncertainty_weighted_decision", + expectedConfidenceMin: 0.5, + expectedConfidenceMax: 0.55, + description: "High uncertainty between reasoning and non-reasoning should use weighted decision", + }, + { + name: "Medium uncertainty - above threshold (math)", + probabilities: []float32{0.75, 0.15, 0.05, 0.05}, + categoryNames: []string{"math", "physics", "code", 
"biology"}, + threshold: 0.6, + expectedUseReasoning: true, + expectedDecisionReason: "medium_uncertainty_top_category_above_threshold", + expectedConfidenceMin: 0.55, + expectedConfidenceMax: 0.65, + description: "Medium uncertainty with math above threshold should enable reasoning", + }, + { + name: "Medium uncertainty - above threshold (biology)", + probabilities: []float32{0.75, 0.15, 0.05, 0.05}, + categoryNames: []string{"biology", "history", "language", "math"}, + threshold: 0.6, + expectedUseReasoning: false, + expectedDecisionReason: "medium_uncertainty_top_category_above_threshold", + expectedConfidenceMin: 0.55, + expectedConfidenceMax: 0.65, + description: "Medium uncertainty with biology above threshold should not enable reasoning", + }, + { + name: "Medium uncertainty - trust classification (code)", + probabilities: []float32{0.80, 0.10, 0.06, 0.04}, + categoryNames: []string{"code", "math", "physics", "biology"}, + threshold: 0.6, + expectedUseReasoning: true, + expectedDecisionReason: "medium_uncertainty_top_category_above_threshold", + expectedConfidenceMin: 0.60, + expectedConfidenceMax: 0.70, + description: "Medium uncertainty code classification should enable reasoning", + }, + { + name: "Empty probabilities - default behavior", + probabilities: []float32{}, + categoryNames: []string{}, + threshold: 0.6, + expectedUseReasoning: false, + expectedDecisionReason: "no_classification_data", + expectedConfidenceMin: 0.0, + expectedConfidenceMax: 0.0, + description: "Empty data should return safe default", + }, + { + name: "Category not in reasoning map", + probabilities: []float32{0.90, 0.05, 0.03, 0.02}, + categoryNames: []string{"unknown", "math", "physics", "biology"}, + threshold: 0.6, + expectedUseReasoning: false, + expectedDecisionReason: "category_not_in_reasoning_map", + expectedConfidenceMin: 0.70, + expectedConfidenceMax: 0.80, + description: "Unknown category should default to no reasoning", + }, + } + + for _, tt := range tests { + 
t.Run(tt.name, func(t *testing.T) { + result := MakeEntropyBasedReasoningDecision( + tt.probabilities, + tt.categoryNames, + categoryReasoningMap, + tt.threshold, + ) + + if result.UseReasoning != tt.expectedUseReasoning { + t.Errorf("%s: Expected UseReasoning=%v, got %v", + tt.description, tt.expectedUseReasoning, result.UseReasoning) + } + + if result.DecisionReason != tt.expectedDecisionReason { + t.Errorf("%s: Expected DecisionReason='%s', got '%s'", + tt.description, tt.expectedDecisionReason, result.DecisionReason) + } + + if result.Confidence < tt.expectedConfidenceMin || result.Confidence > tt.expectedConfidenceMax { + t.Errorf("%s: Expected confidence in range [%.2f, %.2f], got %.3f", + tt.description, tt.expectedConfidenceMin, tt.expectedConfidenceMax, result.Confidence) + } + + if len(tt.probabilities) > 0 && len(result.TopCategories) == 0 { + t.Errorf("%s: Expected TopCategories to be populated", tt.description) + } + + t.Logf("%s: use_reasoning=%v, confidence=%.3f, reason=%s", + tt.description, result.UseReasoning, result.Confidence, result.DecisionReason) + }) + } +} + +// TestConfidenceAdjustment checks the confidence range per uncertainty level; the reason is matched by substring, so "low_uncertainty" would also accept "very_low_uncertainty" and only the confidence range tells the two apart +func TestConfidenceAdjustment(t *testing.T) { + categoryReasoningMap := map[string]bool{ + "math": true, + "physics": true, + "code": true, + } + + tests := []struct { + name string + probabilities []float32 + expectedConfMin float64 + expectedConfMax float64 + expectedReasonSubstr string + description string + }{ + { + name: "Very low uncertainty - minimal adjustment", + probabilities: []float32{0.95, 0.03, 0.01, 0.01}, + expectedConfMin: 0.85, + expectedConfMax: 0.95, + expectedReasonSubstr: "very_low_uncertainty", + description: "Very low uncertainty should have highest confidence retention (~95% multiplier)", + }, + { + name: "Low uncertainty - small adjustment", + probabilities: []float32{0.90, 0.06, 0.02, 0.02}, + expectedConfMin: 0.75, + expectedConfMax: 0.85, + expectedReasonSubstr: 
"low_uncertainty", + description: "Low uncertainty should have good confidence retention (~90% multiplier)", + }, + { + name: "Medium uncertainty - moderate adjustment", + probabilities: []float32{0.75, 0.15, 0.05, 0.05}, + expectedConfMin: 0.55, + expectedConfMax: 0.65, + expectedReasonSubstr: "medium_uncertainty", + description: "Medium uncertainty should reduce confidence moderately (~80% multiplier)", + }, + { + name: "High uncertainty - weighted decision", + probabilities: []float32{0.45, 0.40, 0.10, 0.05}, + expectedConfMin: 0.50, + expectedConfMax: 1.00, + expectedReasonSubstr: "high_uncertainty_weighted", + description: "High uncertainty should use weighted decision", + }, + { + name: "Very high uncertainty - low confidence", + probabilities: []float32{0.25, 0.25, 0.25, 0.25}, + expectedConfMin: 0.25, + expectedConfMax: 0.35, + expectedReasonSubstr: "very_high_uncertainty", + description: "Very high uncertainty should have fixed low confidence (0.3)", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + categoryNames := []string{"math", "physics", "code", "biology"} + + result := MakeEntropyBasedReasoningDecision( + tt.probabilities, + categoryNames, + categoryReasoningMap, + 0.6, + ) + + // Check that confidence is in expected range + if result.Confidence < tt.expectedConfMin || result.Confidence > tt.expectedConfMax { + t.Errorf("%s: Expected confidence in range [%.2f, %.2f], got %.3f", + tt.description, tt.expectedConfMin, tt.expectedConfMax, result.Confidence) + } + + // Check that decision reason contains expected substring + if !strings.Contains(result.DecisionReason, tt.expectedReasonSubstr) { + t.Errorf("%s: Expected reason to contain '%s', got '%s'", + tt.description, tt.expectedReasonSubstr, result.DecisionReason) + } + + t.Logf("%s: confidence=%.3f, reason=%s", + tt.description, result.Confidence, result.DecisionReason) + }) + } +}