diff --git a/crates/goose/src/agents/agent.rs b/crates/goose/src/agents/agent.rs index 107c7fea8505..06b1d00f21be 100644 --- a/crates/goose/src/agents/agent.rs +++ b/crates/goose/src/agents/agent.rs @@ -45,6 +45,7 @@ use crate::providers::errors::ProviderError; use crate::recipe::{Author, Recipe, Response, Settings}; use crate::scheduler_trait::SchedulerTrait; use crate::security::adversary_inspector::AdversaryInspector; +use crate::security::egress_inspector::EgressInspector; use crate::security::security_inspector::SecurityInspector; use crate::session::extension_data::{EnabledExtensionsState, ExtensionState}; use crate::session::{Session, SessionManager}; @@ -262,6 +263,7 @@ impl Agent { // Add security inspector (highest priority - runs first) tool_inspection_manager.add_inspector(Box::new(SecurityInspector::new())); + tool_inspection_manager.add_inspector(Box::new(EgressInspector::new())); // Add adversary inspector (LLM-based review, enabled by ~/.config/goose/adversary.md) tool_inspection_manager.add_inspector(Box::new(AdversaryInspector::new(provider.clone()))); diff --git a/crates/goose/src/security/egress_inspector.rs b/crates/goose/src/security/egress_inspector.rs new file mode 100644 index 000000000000..a7e34d01e5cb --- /dev/null +++ b/crates/goose/src/security/egress_inspector.rs @@ -0,0 +1,413 @@ +use anyhow::Result; +use async_trait::async_trait; +use regex::Regex; +use std::collections::HashSet; +use std::sync::OnceLock; + +use crate::config::GooseMode; +use crate::conversation::message::{Message, ToolRequest}; +use crate::tool_inspection::{InspectionAction, InspectionResult, ToolInspector}; + +pub struct EgressInspector; + +impl EgressInspector { + pub fn new() -> Self { + Self + } +} + +impl Default for EgressInspector { + fn default() -> Self { + Self::new() + } +} + +#[derive(Debug, Clone)] +struct EgressDestination { + kind: String, + destination: String, + domain: String, +} + +fn extract_destinations(command: &str) -> Vec { + let mut destinations = Vec::new(); + + static URL_RE: OnceLock = OnceLock::new(); + let url_re = URL_RE.get_or_init(|| Regex::new(r#"(?i)(https?|ftp)://[^\s'"<>|;&)]+"#).unwrap()); + for cap in url_re.find_iter(command) { + let url = cap.as_str().to_string(); + let domain = extract_domain_from_url(&url).unwrap_or_default(); + if !domain.is_empty() { + destinations.push(EgressDestination { + kind: "url".to_string(), + destination: url, + domain, + }); + } + } + + static GIT_SSH_RE: OnceLock = OnceLock::new(); + let git_ssh_re = GIT_SSH_RE.get_or_init(|| Regex::new(r#"git@([^:]+):([^\s'"]+)"#).unwrap()); + for cap in git_ssh_re.captures_iter(command) { + let domain = cap[1].to_string(); + let path = cap[2].to_string(); + destinations.push(EgressDestination { + kind: "git_remote".to_string(), + destination: format!("git@{}:{}", domain, path), + domain, + }); + } + + static S3_RE: OnceLock = OnceLock::new(); + let s3_re = S3_RE.get_or_init(|| Regex::new(r#"s3://([^/\s'"]+)(/[^\s'"]*)?"#).unwrap()); + for cap in s3_re.captures_iter(command) { + let bucket = cap[1].to_string(); + let full = cap[0].to_string(); + destinations.push(EgressDestination { + kind: "s3_bucket".to_string(), + destination: full, + domain: format!("{}.s3.amazonaws.com", bucket), + }); + } + + static GCS_RE: OnceLock = OnceLock::new(); + let gcs_re = GCS_RE.get_or_init(|| Regex::new(r#"gs://([^/\s'"]+)(/[^\s'"]*)?"#).unwrap()); + for cap in gcs_re.captures_iter(command) { + let bucket = cap[1].to_string(); + let full = cap[0].to_string(); + destinations.push(EgressDestination { + kind: "gcs_bucket".to_string(), + destination: full, + domain: format!("{}.storage.googleapis.com", bucket), + }); + } + + static SCP_RE: OnceLock = OnceLock::new(); + let scp_re = SCP_RE + .get_or_init(|| Regex::new(r"(?:scp|rsync)\s+.*?(?:\S+@)?([a-zA-Z0-9][\w.-]+):").unwrap()); + for cap in scp_re.captures_iter(command) { + let host = cap[1].to_string(); + destinations.push(EgressDestination { + kind: "scp_target".to_string(), + destination: cap[0].to_string(), + domain: host, + }); + } + + static SSH_RE: OnceLock = OnceLock::new(); + let ssh_re = SSH_RE.get_or_init(|| { + Regex::new(r"ssh\s+(?:-\w+\s+\S+\s+)*(?:\S+@)?([a-zA-Z0-9][\w.-]+)").unwrap() + }); + for cap in ssh_re.captures_iter(command) { + let host = cap[1].to_string(); + if !host.starts_with('-') { + destinations.push(EgressDestination { + kind: "ssh_target".to_string(), + destination: cap[0].to_string(), + domain: host, + }); + } + } + + static DOCKER_RE: OnceLock = OnceLock::new(); + let docker_re = DOCKER_RE.get_or_init(|| { + Regex::new(r#"docker\s+(?:push|login)\s+(?:--[^\s]+\s+)*([^\s'"]+)"#).unwrap() + }); + for cap in docker_re.captures_iter(command) { + let target = cap[1].to_string(); + let domain = target.split('/').next().unwrap_or(&target).to_string(); + destinations.push(EgressDestination { + kind: "docker_registry".to_string(), + destination: target, + domain, + }); + } + + static GENERIC_NET_CMD_RE: OnceLock = OnceLock::new(); + let generic_net_cmd_re = GENERIC_NET_CMD_RE.get_or_init(|| { + Regex::new( + r"(?i)\b(fetch|nc|ncat|netcat|ftp|sftp|socat|httpie|xh)\b[^\n]*?\b((?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]*[a-zA-Z0-9])?\.)+[a-zA-Z]{2,})\b" + ).unwrap() + }); + let already_seen: HashSet = destinations + .iter() + .map(|d| d.domain.to_lowercase()) + .collect(); + for cap in generic_net_cmd_re.captures_iter(command) { + let domain = cap[2].to_string(); + if !already_seen.contains(&domain) { + destinations.push(EgressDestination { + kind: "generic_network".to_string(), + destination: cap[0].to_string(), + domain, + }); + } + } + + static NPM_PUBLISH_RE: OnceLock = OnceLock::new(); + let npm_publish_re = NPM_PUBLISH_RE + .get_or_init(|| Regex::new(r"(?:^|[;&|]\s*|\n)\s*npm\s+publish(?:\s|$)").unwrap()); + if npm_publish_re.is_match(command) { + destinations.push(EgressDestination { + kind: "package_publish".to_string(), + destination: "npm publish".to_string(), + domain: "registry.npmjs.org".to_string(), + }); + } + + static CARGO_PUBLISH_RE: OnceLock = OnceLock::new(); + let cargo_publish_re = CARGO_PUBLISH_RE + .get_or_init(|| Regex::new(r"(?:^|[;&|]\s*|\n)\s*cargo\s+publish(?:\s|$)").unwrap()); + if cargo_publish_re.is_match(command) { + destinations.push(EgressDestination { + kind: "package_publish".to_string(), + destination: "cargo publish".to_string(), + domain: "crates.io".to_string(), + }); + } + + destinations +} + +fn extract_domain_from_url(url: &str) -> Option { + let after_scheme = url + .find("://") + .and_then(|i| url.get(i + 3..)) + .unwrap_or(url); + let authority = after_scheme.split('/').next()?; + let host_port = authority.split('@').next_back()?; + let host = if host_port.contains('[') { + host_port + .split(']') + .next() + .map(|s| s.trim_start_matches('['))? + } else { + host_port.split(':').next()? + }; + if host.is_empty() { + None + } else { + Some(host.to_string()) + } +} + +fn is_shell_tool(name: &str) -> bool { + matches!( + name, + "shell" | "bash" | "execute_command" | "run_command" | "terminal" + ) || name.ends_with("__shell") + || name.ends_with("__bash") + || name.ends_with("__terminal") +} + +fn is_web_tool(name: &str) -> bool { + matches!( + name, + "web_fetch" | "fetch" | "browser_navigate" | "http_request" + ) || name.ends_with("__web_fetch") + || name.ends_with("__fetch") + || name.ends_with("__browser_navigate") +} + +fn extract_text_for_inspection( + tool_call: &rmcp::model::CallToolRequestParams, + is_web: bool, +) -> Option { + let args = tool_call.arguments.as_ref()?; + let keys: &[&str] = if is_web { + &["url", "uri", "endpoint"] + } else { + &["command", "cmd", "script", "input"] + }; + keys.iter() + .find_map(|k| args.get(*k).and_then(|v| v.as_str()).map(|s| s.to_string())) +} + +#[async_trait] +impl ToolInspector for EgressInspector { + fn name(&self) -> &'static str { + "egress" + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + async fn inspect( + &self, + _session_id: &str, + tool_requests: &[ToolRequest], + _messages: &[Message], + _goose_mode: GooseMode, + ) -> Result> { + let mut results = Vec::new(); + let mut seen_destinations: HashSet = HashSet::new(); + + for tool_request in tool_requests { + let tool_call = match &tool_request.tool_call { + Ok(tc) => tc, + Err(_) => continue, + }; + + let name = tool_call.name.as_ref(); + let is_web = is_web_tool(name); + if !is_shell_tool(name) && !is_web { + continue; + } + + let text = match extract_text_for_inspection(tool_call, is_web) { + Some(t) => t, + None => continue, + }; + + let destinations: Vec<_> = extract_destinations(&text) + .into_iter() + .filter(|d| seen_destinations.insert(d.destination.clone())) + .collect(); + + if destinations.is_empty() { + continue; + } + + for dest in &destinations { + tracing::info!( + egress_kind = dest.kind.as_str(), + domain = dest.domain.as_str(), + destination = dest.destination.as_str(), + "egress destination detected" + ); + } + + results.push(InspectionResult { + tool_request_id: tool_request.id.clone(), + action: InspectionAction::Allow, + reason: format!( + "Egress destinations detected: {}", + destinations + .iter() + .map(|d| d.destination.as_str()) + .collect::>() + .join(", ") + ), + confidence: 0.0, + inspector_name: self.name().to_string(), + finding_id: None, + }); + } + + Ok(results) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_extract_destinations() { + let dests = extract_destinations("curl https://example.com/api/data"); + assert_eq!(dests.len(), 1); + assert_eq!(dests[0].domain, "example.com"); + + let dests = extract_destinations("git remote add origin git@github.com:personal/repo.git"); + assert_eq!(dests.len(), 1); + assert_eq!(dests[0].domain, "github.com"); + + let dests = extract_destinations("aws s3 cp data.csv s3://my-bucket/path/data.csv"); + assert_eq!(dests.len(), 1); + assert_eq!(dests[0].kind, "s3_bucket"); + + assert_eq!(extract_destinations("ls -la /tmp").len(), 0); + } + + #[test] + fn test_package_publish_detection() { + // Should detect + assert_eq!(extract_destinations("npm publish").len(), 1); + assert_eq!(extract_destinations("cd pkg && npm publish").len(), 1); + assert_eq!(extract_destinations("cargo publish").len(), 1); + assert_eq!(extract_destinations("cargo publish --dry-run").len(), 1); + + // Should not detect (false positives) + assert_eq!(extract_destinations("echo 'npm publish'").len(), 0); + assert_eq!(extract_destinations("# npm publish").len(), 0); + assert_eq!(extract_destinations("cat npm_publish_guide.md").len(), 0); + } + + #[test] + fn test_gcs_detection() { + let dests = extract_destinations("gsutil cp data.csv gs://my-bucket/path/data.csv"); + assert_eq!(dests.len(), 1); + assert_eq!(dests[0].kind, "gcs_bucket"); + assert_eq!(dests[0].destination, "gs://my-bucket/path/data.csv"); + assert_eq!(dests[0].domain, "my-bucket.storage.googleapis.com"); + } + + #[test] + fn test_scp_detection() { + let dests = extract_destinations("scp file.txt user@remote.example.com:/tmp/file.txt"); + assert_eq!(dests.len(), 1); + assert_eq!(dests[0].kind, "scp_target"); + assert_eq!(dests[0].domain, "remote.example.com"); + + let dests = extract_destinations("rsync -av ./dist/ deploy@prod.example.com:/var/www/"); + assert_eq!(dests.len(), 1); + assert_eq!(dests[0].kind, "scp_target"); + assert_eq!(dests[0].domain, "prod.example.com"); + } + + #[test] + fn test_ssh_detection() { + let dests = extract_destinations("ssh user@bastion.example.com"); + assert_eq!(dests.len(), 1); + assert_eq!(dests[0].kind, "ssh_target"); + assert_eq!(dests[0].domain, "bastion.example.com"); + + let dests = extract_destinations("ssh -i key.pem ec2-user@10.0.0.1"); + assert_eq!(dests.len(), 1); + assert_eq!(dests[0].kind, "ssh_target"); + assert_eq!(dests[0].domain, "10.0.0.1"); + } + + #[test] + fn test_docker_detection() { + let dests = extract_destinations("docker push registry.example.com/myapp:latest"); + assert_eq!(dests.len(), 1); + assert_eq!(dests[0].kind, "docker_registry"); + assert_eq!(dests[0].domain, "registry.example.com"); + + let dests = extract_destinations("docker login ghcr.io"); + assert_eq!(dests.len(), 1); + assert_eq!(dests[0].kind, "docker_registry"); + assert_eq!(dests[0].domain, "ghcr.io"); + } + + #[test] + fn test_generic_network_catchall() { + let dests = extract_destinations("nc data.exfil.io 9999"); + assert!(dests + .iter() + .any(|d| d.kind == "generic_network" && d.domain == "data.exfil.io")); + + let dests = extract_destinations("curl https://example.com/api/data"); + assert!(!dests.iter().any(|d| d.kind == "generic_network")); + + let dests = extract_destinations("ssh user@bastion.example.com"); + assert!(!dests.iter().any(|d| d.kind == "generic_network")); + + let dests = extract_destinations("scp file.txt user@remote.example.com:/tmp/"); + assert!(!dests.iter().any(|d| d.kind == "generic_network")); + } + + #[test] + fn test_extract_domain_from_url() { + assert_eq!( + extract_domain_from_url("https://example.com/path"), + Some("example.com".to_string()) + ); + assert_eq!( + extract_domain_from_url("https://user:pass@example.com/path"), + Some("example.com".to_string()) + ); + } +} diff --git a/crates/goose/src/security/mod.rs b/crates/goose/src/security/mod.rs index 820767fa9841..54d9856ec9de 100644 --- a/crates/goose/src/security/mod.rs +++ b/crates/goose/src/security/mod.rs @@ -1,5 +1,6 @@ pub mod adversary_inspector; pub mod classification_client; +pub mod egress_inspector; pub mod patterns; pub mod scanner; pub mod security_inspector;