From fece72640f8ebfdda199f689160ca09ee4ef385f Mon Sep 17 00:00:00 2001 From: Dorien Koelemeijer Date: Mon, 12 Jan 2026 13:12:39 +1000 Subject: [PATCH 1/5] If context safe, don't flag prompt injection unless critical tool call result --- crates/goose/src/security/scanner.rs | 51 ++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/crates/goose/src/security/scanner.rs b/crates/goose/src/security/scanner.rs index 2e658b399909..73c5ee583e44 100644 --- a/crates/goose/src/security/scanner.rs +++ b/crates/goose/src/security/scanner.rs @@ -16,6 +16,7 @@ pub struct ScanResult { pub explanation: String, } +#[derive(Clone)] struct DetailedScanResult { confidence: f32, pattern_matches: Vec, @@ -106,20 +107,23 @@ impl PromptInjectionScanner { self.scan_conversation(messages) ); - let highest_confidence_result = - self.select_highest_confidence_result(tool_result?, context_result?); + let tool_result = tool_result?; + let context_result = context_result?; let threshold = self.get_threshold_from_config(); + let final_result = + self.select_result_with_context_awareness(tool_result, context_result, threshold); + tracing::info!( - "✅ Security analysis complete: confidence={:.3}, malicious={}", - highest_confidence_result.confidence, - highest_confidence_result.confidence >= threshold + "Security analysis complete: confidence={:.3}, malicious={}", + final_result.confidence, + final_result.confidence >= threshold ); Ok(ScanResult { - is_malicious: highest_confidence_result.confidence >= threshold, - confidence: highest_confidence_result.confidence, - explanation: self.build_explanation(&highest_confidence_result, threshold), + is_malicious: final_result.confidence >= threshold, + confidence: final_result.confidence, + explanation: self.build_explanation(&final_result, threshold), }) } @@ -169,12 +173,39 @@ impl PromptInjectionScanner { }) } - fn select_highest_confidence_result( + fn select_result_with_context_awareness( &self, tool_result: DetailedScanResult, context_result: DetailedScanResult, + threshold: f32, ) -> DetailedScanResult { - if tool_result.confidence >= context_result.confidence { + let context_is_safe = context_result + .ml_confidence + .is_some_and(|conf| conf < threshold); + + let tool_has_only_non_critical = !tool_result.pattern_matches.is_empty() + && tool_result + .pattern_matches + .iter() + .all(|m| m.threat.risk_level != crate::security::patterns::RiskLevel::Critical); + + if context_is_safe && tool_has_only_non_critical { + tracing::info!( + "Suppressing non-critical pattern match due to safe context evaluation" + ); + tracing::debug!( + context_ml_confidence = ?context_result.ml_confidence, + pattern_count = tool_result.pattern_matches.len(), + max_pattern_risk = ?tool_result.pattern_matches.first().map(|m| &m.threat.risk_level), + "Suppression conditions met: safe context + non-critical patterns" + ); + + DetailedScanResult { + confidence: 0.0, + pattern_matches: Vec::new(), + ml_confidence: context_result.ml_confidence, + } + } else if tool_result.confidence >= context_result.confidence { tool_result } else { context_result From 836884bc69907c4942c7b40689e4f951451b693d Mon Sep 17 00:00:00 2001 From: Dorien Koelemeijer Date: Mon, 12 Jan 2026 13:57:11 +1000 Subject: [PATCH 2/5] fix --- crates/goose/src/security/scanner.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/goose/src/security/scanner.rs b/crates/goose/src/security/scanner.rs index 73c5ee583e44..3c0f8ac83ed8 100644 --- a/crates/goose/src/security/scanner.rs +++ b/crates/goose/src/security/scanner.rs @@ -16,7 +16,6 @@ pub struct ScanResult { pub explanation: String, } -#[derive(Clone)] struct DetailedScanResult { confidence: f32, pattern_matches: Vec, @@ -190,9 +189,7 @@ impl PromptInjectionScanner { .all(|m| m.threat.risk_level != crate::security::patterns::RiskLevel::Critical); if context_is_safe && tool_has_only_non_critical { - tracing::info!( - "Suppressing non-critical pattern match due to safe context evaluation" - ); + tracing::info!("Suppressing non-critical pattern match due to safe context evaluation"); tracing::debug!( context_ml_confidence = ?context_result.ml_confidence, pattern_count = tool_result.pattern_matches.len(), From ce9e44fbba181346367d5c42d7438103ff4d6fa9 Mon Sep 17 00:00:00 2001 From: Dorien Koelemeijer Date: Mon, 12 Jan 2026 14:30:45 +1000 Subject: [PATCH 3/5] Make sure 'SECURITY_PROMPT_CLASSIFIER_MODEL' is set if 'SECURITY_ML_MODEL_MAPPING' is set but 'SECURITY_PROMPT_CLASSIFIER_MODEL' is set to empty string --- crates/goose/src/security/scanner.rs | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/crates/goose/src/security/scanner.rs b/crates/goose/src/security/scanner.rs index 3c0f8ac83ed8..e43266b52a5b 100644 --- a/crates/goose/src/security/scanner.rs +++ b/crates/goose/src/security/scanner.rs @@ -46,7 +46,7 @@ impl PromptInjectionScanner { fn create_classifier_from_config() -> Result { let config = Config::global(); - let model_name = config + let mut model_name = config .get_param::("SECURITY_PROMPT_CLASSIFIER_MODEL") .ok() .filter(|s| !s.trim().is_empty()); @@ -59,6 +59,23 @@ impl PromptInjectionScanner { .ok() .filter(|s| !s.trim().is_empty()); + if model_name.is_none() { + if let Ok(mapping_json) = std::env::var("SECURITY_ML_MODEL_MAPPING") { + if let Ok(mapping) = serde_json::from_str::< + crate::security::classification_client::ModelMappingConfig, + >(&mapping_json) + { + if let Some(first_model) = mapping.models.keys().next() { + tracing::info!( + default_model = %first_model, + "SECURITY_ML_MODEL_MAPPING available but no model selected - using first available model as default" + ); + model_name = Some(first_model.clone()); + } + } + } + } + tracing::debug!( model_name = ?model_name, has_endpoint = endpoint.is_some(), From 7143412f804681b7dcba66c1c9ee6f3ef6c4c833 Mon Sep 17 00:00:00 2001 From: Dorien Koelemeijer Date: Mon, 12 Jan 2026 14:50:44 +1000 Subject: [PATCH 4/5] Add test --- crates/goose/src/security/scanner.rs | 128 +++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/crates/goose/src/security/scanner.rs b/crates/goose/src/security/scanner.rs index e43266b52a5b..72194b61dcaa 100644 --- a/crates/goose/src/security/scanner.rs +++ b/crates/goose/src/security/scanner.rs @@ -359,4 +359,132 @@ mod tests { assert!(result.is_malicious); assert!(result.explanation.contains("Security threat")); } + + #[test] + fn test_context_aware_suppression() { + let scanner = PromptInjectionScanner::new(); + let threshold = 0.8; + + // Test case 1: Safe context + non-critical pattern → should suppress + let result = scanner.select_result_with_context_awareness( + DetailedScanResult { + confidence: 0.6, + pattern_matches: vec![PatternMatch { + matched_text: "test".to_string(), + threat: crate::security::patterns::ThreatPattern { + name: "test_pattern", + pattern: "test", + description: "Test pattern", + risk_level: crate::security::patterns::RiskLevel::Medium, + category: crate::security::patterns::ThreatCategory::CommandInjection, + }, + start_pos: 0, + end_pos: 4, + }], + ml_confidence: None, + }, + DetailedScanResult { + confidence: 0.3, + pattern_matches: Vec::new(), + ml_confidence: Some(0.3), + }, + threshold, + ); + assert_eq!( + result.confidence, 0.0, + "Should suppress non-critical pattern with safe context" + ); + assert!(result.pattern_matches.is_empty()); + + // Test case 2: Safe context + critical pattern → should NOT suppress + let result = scanner.select_result_with_context_awareness( + DetailedScanResult { + confidence: 0.95, + pattern_matches: vec![PatternMatch { + matched_text: "rm -rf /".to_string(), + threat: crate::security::patterns::ThreatPattern { + name: "rm_rf_root", + pattern: r"rm\s+-rf", + description: "Dangerous command", + risk_level: crate::security::patterns::RiskLevel::Critical, + category: crate::security::patterns::ThreatCategory::FileSystemDestruction, + }, + start_pos: 0, + end_pos: 9, + }], + ml_confidence: None, + }, + DetailedScanResult { + confidence: 0.3, + pattern_matches: Vec::new(), + ml_confidence: Some(0.3), + }, + threshold, + ); + assert!( + result.confidence > 0.0, + "Should NOT suppress critical pattern even with safe context" + ); + assert!(!result.pattern_matches.is_empty()); + + // Test case 3: Unsafe context + non-critical pattern → should NOT suppress + let result = scanner.select_result_with_context_awareness( + DetailedScanResult { + confidence: 0.6, + pattern_matches: vec![PatternMatch { + matched_text: "test".to_string(), + threat: crate::security::patterns::ThreatPattern { + name: "test_pattern", + pattern: "test", + description: "Test pattern", + risk_level: crate::security::patterns::RiskLevel::Medium, + category: crate::security::patterns::ThreatCategory::CommandInjection, + }, + start_pos: 0, + end_pos: 4, + }], + ml_confidence: None, + }, + DetailedScanResult { + confidence: 0.9, + pattern_matches: Vec::new(), + ml_confidence: Some(0.9), + }, + threshold, + ); + assert!( + result.confidence > 0.0, + "Should NOT suppress with unsafe context" + ); + + // Test case 4: No ML confidence (ML disabled) + non-critical pattern → should NOT suppress + let result = scanner.select_result_with_context_awareness( + DetailedScanResult { + confidence: 0.6, + pattern_matches: vec![PatternMatch { + matched_text: "test".to_string(), + threat: crate::security::patterns::ThreatPattern { + name: "test_pattern", + pattern: "test", + description: "Test pattern", + risk_level: crate::security::patterns::RiskLevel::Medium, + category: crate::security::patterns::ThreatCategory::CommandInjection, + }, + start_pos: 0, + end_pos: 4, + }], + ml_confidence: None, + }, + DetailedScanResult { + confidence: 0.0, + pattern_matches: Vec::new(), + ml_confidence: None, + }, + threshold, + ); + assert!( + result.confidence > 0.0, + "Should NOT suppress when ML is disabled" + ); + } } From 5a703efaceced9db3dbf72178e64aa016eeca431 Mon Sep 17 00:00:00 2001 From: Dorien Koelemeijer Date: Tue, 13 Jan 2026 07:57:18 +1000 Subject: [PATCH 5/5] address PR comments --- crates/goose/src/security/scanner.rs | 136 --------------------------- 1 file changed, 136 deletions(-) diff --git a/crates/goose/src/security/scanner.rs b/crates/goose/src/security/scanner.rs index 72194b61dcaa..3411b89a6a5a 100644 --- a/crates/goose/src/security/scanner.rs +++ b/crates/goose/src/security/scanner.rs @@ -206,14 +206,6 @@ impl PromptInjectionScanner { .all(|m| m.threat.risk_level != crate::security::patterns::RiskLevel::Critical); if context_is_safe && tool_has_only_non_critical { - tracing::info!("Suppressing non-critical pattern match due to safe context evaluation"); - tracing::debug!( - context_ml_confidence = ?context_result.ml_confidence, - pattern_count = tool_result.pattern_matches.len(), - max_pattern_risk = ?tool_result.pattern_matches.first().map(|m| &m.threat.risk_level), - "Suppression conditions met: safe context + non-critical patterns" - ); - DetailedScanResult { confidence: 0.0, pattern_matches: Vec::new(), @@ -359,132 +351,4 @@ mod tests { assert!(result.is_malicious); assert!(result.explanation.contains("Security threat")); } - - #[test] - fn test_context_aware_suppression() { - let scanner = PromptInjectionScanner::new(); - let threshold = 0.8; - - // Test case 1: Safe context + non-critical pattern → should suppress - let result = scanner.select_result_with_context_awareness( - DetailedScanResult { - confidence: 0.6, - pattern_matches: vec![PatternMatch { - matched_text: "test".to_string(), - threat: crate::security::patterns::ThreatPattern { - name: "test_pattern", - pattern: "test", - description: "Test pattern", - risk_level: crate::security::patterns::RiskLevel::Medium, - category: crate::security::patterns::ThreatCategory::CommandInjection, - }, - start_pos: 0, - end_pos: 4, - }], - ml_confidence: None, - }, - DetailedScanResult { - confidence: 0.3, - pattern_matches: Vec::new(), - ml_confidence: Some(0.3), - }, - threshold, - ); - assert_eq!( - result.confidence, 0.0, - "Should suppress non-critical pattern with safe context" - ); - assert!(result.pattern_matches.is_empty()); - - // Test case 2: Safe context + critical pattern → should NOT suppress - let result = scanner.select_result_with_context_awareness( - DetailedScanResult { - confidence: 0.95, - pattern_matches: vec![PatternMatch { - matched_text: "rm -rf /".to_string(), - threat: crate::security::patterns::ThreatPattern { - name: "rm_rf_root", - pattern: r"rm\s+-rf", - description: "Dangerous command", - risk_level: crate::security::patterns::RiskLevel::Critical, - category: crate::security::patterns::ThreatCategory::FileSystemDestruction, - }, - start_pos: 0, - end_pos: 9, - }], - ml_confidence: None, - }, - DetailedScanResult { - confidence: 0.3, - pattern_matches: Vec::new(), - ml_confidence: Some(0.3), - }, - threshold, - ); - assert!( - result.confidence > 0.0, - "Should NOT suppress critical pattern even with safe context" - ); - assert!(!result.pattern_matches.is_empty()); - - // Test case 3: Unsafe context + non-critical pattern → should NOT suppress - let result = scanner.select_result_with_context_awareness( - DetailedScanResult { - confidence: 0.6, - pattern_matches: vec![PatternMatch { - matched_text: "test".to_string(), - threat: crate::security::patterns::ThreatPattern { - name: "test_pattern", - pattern: "test", - description: "Test pattern", - risk_level: crate::security::patterns::RiskLevel::Medium, - category: crate::security::patterns::ThreatCategory::CommandInjection, - }, - start_pos: 0, - end_pos: 4, - }], - ml_confidence: None, - }, - DetailedScanResult { - confidence: 0.9, - pattern_matches: Vec::new(), - ml_confidence: Some(0.9), - }, - threshold, - ); - assert!( - result.confidence > 0.0, - "Should NOT suppress with unsafe context" - ); - - // Test case 4: No ML confidence (ML disabled) + non-critical pattern → should NOT suppress - let result = scanner.select_result_with_context_awareness( - DetailedScanResult { - confidence: 0.6, - pattern_matches: vec![PatternMatch { - matched_text: "test".to_string(), - threat: crate::security::patterns::ThreatPattern { - name: "test_pattern", - pattern: "test", - description: "Test pattern", - risk_level: crate::security::patterns::RiskLevel::Medium, - category: crate::security::patterns::ThreatCategory::CommandInjection, - }, - start_pos: 0, - end_pos: 4, - }], - ml_confidence: None, - }, - DetailedScanResult { - confidence: 0.0, - pattern_matches: Vec::new(), - ml_confidence: None, - }, - threshold, - ); - assert!( - result.confidence > 0.0, - "Should NOT suppress when ML is disabled" - ); - } }