From df6c021eacf49f675435483657c45e04168b5493 Mon Sep 17 00:00:00 2001 From: Amed Rodriguez Date: Fri, 8 Aug 2025 11:20:27 -0700 Subject: [PATCH 1/2] sanitize message content on deserialization --- crates/goose/src/conversation/message.rs | 107 ++++++++++++++++++++--- 1 file changed, 93 insertions(+), 14 deletions(-) diff --git a/crates/goose/src/conversation/message.rs b/crates/goose/src/conversation/message.rs index 20b703c79576..b36e190b74ae 100644 --- a/crates/goose/src/conversation/message.rs +++ b/crates/goose/src/conversation/message.rs @@ -4,7 +4,7 @@ use rmcp::model::{ AnnotateAble, Content, ImageContent, PromptMessage, PromptMessageContent, PromptMessageRole, RawContent, RawImageContent, RawTextContent, ResourceContents, Role, TextContent, }; -use serde::{Deserialize, Serialize}; +use serde::{Deserialize, Deserializer, Serialize}; use serde_json::Value; use std::collections::HashSet; use std::fmt; @@ -13,6 +13,42 @@ use utoipa::ToSchema; use crate::conversation::tool_result_serde; +/// Sanitize Unicode Tags Block characters from text +fn sanitize_unicode_tags(text: &str) -> String { + let normalized: String = text.nfc().collect(); + + normalized + .chars() + .filter(|&c| !matches!(c, '\u{E0000}'..='\u{E007F}')) + .collect() +} + +/// Custom deserializer for MessageContent that sanitizes Unicode Tags in text content +fn deserialize_sanitized_content<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let mut content: Vec = Vec::deserialize(deserializer)?; + + for message_content in &mut content { + if let MessageContent::Text(text_content) = message_content { + let original = &text_content.text; + let sanitized = sanitize_unicode_tags(original); + if *original != sanitized { + tracing::info!( + original = %original, + sanitized = %sanitized, + removed_count = original.len() - sanitized.len(), + "Unicode Tags sanitized during Message deserialization" + ); + text_content.text = sanitized; + } + } + } + + Ok(content) +} + #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] #[derive(ToSchema)] @@ -346,6 +382,7 @@ pub struct Message { pub role: Role, #[serde(default = "default_created")] pub created: i64, + #[serde(deserialize_with = "deserialize_sanitized_content")] pub content: Vec, } @@ -410,20 +447,10 @@ impl Message { self } - fn sanitize_unicode_tags(text: &str) -> String { - let normalized: String = text.nfc().collect(); - - // Remove Unicode Tags Block characters only - normalized - .chars() - .filter(|&c| !matches!(c, '\u{E0000}'..='\u{E007F}')) - .collect() - } - /// Add text content to the message pub fn with_text>(self, text: S) -> Self { let raw_text = text.into(); - let sanitized_text = Self::sanitize_unicode_tags(&raw_text); + let sanitized_text = sanitize_unicode_tags(&raw_text); self.with_content(MessageContent::Text( RawTextContent { @@ -587,14 +614,14 @@ mod tests { #[test] fn test_sanitize_unicode_tags() { let malicious = "Hello\u{E0041}\u{E0042}\u{E0043}world"; // Invisible "ABC" - let cleaned = Message::sanitize_unicode_tags(malicious); + let cleaned = super::sanitize_unicode_tags(malicious); assert_eq!(cleaned, "Helloworld"); } #[test] fn test_no_sanitize_unicode_tags() { let clean_text = "Hello world δΈ–η•Œ 🌍"; - let cleaned = Message::sanitize_unicode_tags(clean_text); + let cleaned = super::sanitize_unicode_tags(clean_text); assert_eq!(cleaned, clean_text); } @@ -872,4 +899,56 @@ mod tests { assert_eq!(ids.len(), 1); assert!(ids.contains("req1")); } + + #[test] + fn test_message_deserialization_sanitizes_text_content() { + // Create a test string with Unicode Tags characters + let malicious_text = "Hello\u{E0041}\u{E0042}\u{E0043}world"; + let malicious_json = format!(r#"{{ + "id": "test-id", + "role": "user", + "created": 1640995200, + "content": [ + {{ + "type": "text", + "text": "{}" + }}, + {{ + "type": "image", + "data": "base64data", + "mimeType": "image/png" + }} + ] + }}"#, malicious_text); + + let message: Message = serde_json::from_str(&malicious_json).unwrap(); + + // Text content should be sanitized + assert_eq!(message.as_concat_text(), "Helloworld"); + + // Image content should be unchanged + if let MessageContent::Image(img) = &message.content[1] { + assert_eq!(img.data, "base64data"); + assert_eq!(img.mime_type, "image/png"); + } else { + panic!("Expected ImageContent"); + } + } + + #[test] + fn test_legitimate_unicode_preserved_during_message_deserialization() { + let clean_json = r#"{ + "id": "test-id", + "role": "user", + "created": 1640995200, + "content": [{ + "type": "text", + "text": "Hello world δΈ–η•Œ 🌍" + }] + }"#; + + let message: Message = serde_json::from_str(clean_json).unwrap(); + + assert_eq!(message.as_concat_text(), "Hello world δΈ–η•Œ 🌍"); + } } From 91f1fc833687b4cca2ab12464bb32e6041c42986 Mon Sep 17 00:00:00 2001 From: Amed Rodriguez Date: Fri, 8 Aug 2025 13:34:00 -0700 Subject: [PATCH 2/2] formatting --- crates/goose/src/conversation/message.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crates/goose/src/conversation/message.rs b/crates/goose/src/conversation/message.rs index b36e190b74ae..800517165808 100644 --- a/crates/goose/src/conversation/message.rs +++ b/crates/goose/src/conversation/message.rs @@ -904,7 +904,8 @@ mod tests { fn test_message_deserialization_sanitizes_text_content() { // Create a test string with Unicode Tags characters let malicious_text = "Hello\u{E0041}\u{E0042}\u{E0043}world"; - let malicious_json = format!(r#"{{ + let malicious_json = format!( + r#"{{ "id": "test-id", "role": "user", "created": 1640995200, @@ -919,7 +920,9 @@ mod tests { "mimeType": "image/png" }} ] - }}"#, malicious_text); + }}"#, + malicious_text + ); let message: Message = serde_json::from_str(&malicious_json).unwrap();