diff --git a/src/changelog/3.3.1/291-fix-surrogate-pairs-in-mask-xml-invalid-characters.xml b/src/changelog/3.3.1/291-fix-surrogate-pairs-in-mask-xml-invalid-characters.xml new file mode 100644 index 00000000..9ecba009 --- /dev/null +++ b/src/changelog/3.3.1/291-fix-surrogate-pairs-in-mask-xml-invalid-characters.xml @@ -0,0 +1,13 @@ + + + + + + fix silent corruption of supplementary Unicode characters (U+10000–U+10FFFF) in `MaskXmlInvalidCharacters`: + valid UTF-16 surrogate pairs are now preserved instead of being replaced with the mask string + (reported by @N0tre3l, fixed by @freeandnil in https://github.com/apache/logging-log4net/pull/291[#291]) + + \ No newline at end of file diff --git a/src/log4net.Tests/Util/TransformTest.cs b/src/log4net.Tests/Util/TransformTest.cs index b0e932ac..2b9b15f6 100644 --- a/src/log4net.Tests/Util/TransformTest.cs +++ b/src/log4net.Tests/Util/TransformTest.cs @@ -17,16 +17,20 @@ // #endregion +using System; +using System.Reflection; using log4net.Util; - using NUnit.Framework; namespace log4net.Tests.Util; - [TestFixture] public class TransformTest { + /// + /// Verifies that a valid BMP character in the allowed XML 1.0 range (U+203B β€») + /// is passed through unchanged. + /// [Test] public void MaskXmlInvalidCharactersAllowsJapaneseCharacters() { @@ -34,10 +38,147 @@ public void MaskXmlInvalidCharactersAllowsJapaneseCharacters() Assert.That(Transform.MaskXmlInvalidCharacters(kome, "?"), Is.EqualTo(kome)); } + /// + /// Verifies that the NUL character (U+0000), which is illegal in XML 1.0, + /// is replaced with the supplied mask string. + /// [Test] public void MaskXmlInvalidCharactersMasks0Char() { const string c = "\0"; Assert.That(Transform.MaskXmlInvalidCharacters(c, "?"), Is.EqualTo("?")); } + + /// + /// Verifies that a well-formed UTF-16 surrogate pair representing a supplementary + /// character (U+1F511 πŸ”‘) is treated as a valid XML character and preserved intact. + /// + [Test] + public void MaskXmlInvalidCharactersPreservesValidSurrogatePair() + { + // U+1F511 KEY (πŸ”‘) β€” encoded as surrogate pair \uD83D\uDD11 + const string key = "\uD83D\uDD11"; + Assert.That(Transform.MaskXmlInvalidCharacters(key, "?"), Is.EqualTo(key)); + } + + /// + /// Verifies that a supplementary character embedded within plain ASCII text + /// is preserved without corrupting the surrounding characters. + /// + [Test] + public void MaskXmlInvalidCharactersPreservesSupplementaryCharacterInContext() + { + // "adminπŸ”‘" β€” only the emoji is a surrogate pair; all other chars are BMP + const string input = "admin\uD83D\uDD11"; + Assert.That(Transform.MaskXmlInvalidCharacters(input, "?"), Is.EqualTo(input)); + } + + /// + /// Verifies that a lone high surrogate (U+D800–U+DBFF) not followed by a + /// low surrogate is malformed UTF-16 and is replaced with the mask string. + /// + [Test] + public void MaskXmlInvalidCharactersMasksLoneHighSurrogate() + { + // A lone high surrogate with no following low surrogate is illegal + const string loneSurrogate = "\uD83D"; + Assert.That(Transform.MaskXmlInvalidCharacters(loneSurrogate, "?"), Is.EqualTo("?")); + } + + /// + /// Verifies that a lone low surrogate (U+DC00–U+DFFF) not preceded by a + /// high surrogate is malformed UTF-16 and is replaced with the mask string. + /// + [Test] + public void MaskXmlInvalidCharactersMasksLoneLowSurrogate() + { + const string loneSurrogate = "\uDD11"; + Assert.That(Transform.MaskXmlInvalidCharacters(loneSurrogate, "?"), Is.EqualTo("?")); + } + + /// + /// Verifies current behaviour: a null input throws + /// (delegated to the regex engine). Add a null-guard to + /// if a graceful fallback is preferred. + /// + [Test] + public void MaskXmlInvalidCharactersNullInputThrowsArgumentNullException() + => Assert.That(() => Transform.MaskXmlInvalidCharacters(null!, "?"), + Throws.ArgumentNullException); + + #region CountSubstrings tests (private method accessed via reflection) + + /// + /// Resolves the private static method once for all tests + /// + private static Func CountSubstrings { get; } = (Func)Delegate.CreateDelegate( + typeof(Func), typeof(Transform).GetMethod(nameof(CountSubstrings), BindingFlags.NonPublic | BindingFlags.Static) + ?? throw new MissingMethodException(nameof(Transform), nameof(CountSubstrings))); + + /// + /// Verifies that an empty text returns zero regardless of the substring. + /// + [Test] + public void CountSubstringsEmptyTextReturnsZero() + => Assert.That(CountSubstrings("", "a"), Is.Zero); + + /// + /// Verifies that an empty substring returns zero regardless of the text. + /// + [Test] + public void CountSubstringsEmptySubstringReturnsZero() + => Assert.That(CountSubstrings("abc", ""), Is.Zero); + + /// + /// Verifies that a substring not present in the text returns zero. + /// + [Test] + public void CountSubstringsNoMatchReturnsZero() + => Assert.That(CountSubstrings("hello", "x"), Is.Zero); + + /// + /// Verifies that a single-character substring occurring once is counted correctly. + /// + [Test] + public void CountSubstringsSingleCharSingleMatchReturnsOne() + => Assert.That(CountSubstrings("hello", "h"), Is.EqualTo(1)); + + /// + /// Verifies that a single-character substring occurring multiple times is counted correctly. + /// + [Test] + public void CountSubstringsSingleCharMultipleMatchesReturnsCorrectCount() + => Assert.That(CountSubstrings("banana", "a"), Is.EqualTo(3)); + + /// + /// Verifies counting of XML special characters as used by WriteEscapedXmlString. + /// + [TestCase("", "<", 2)] + [TestCase("", ">", 2)] + [TestCase("a&&b", "&", 2)] + public void CountSubstringsXmlSpecialCharsReturnsCorrectCount(string text, string substring, int expected) + => Assert.That(CountSubstrings(text, substring), Is.EqualTo(expected)); + + /// + /// Verifies that the CDATA end token ]]> is counted correctly as a multi-character substring. + /// + [Test] + public void CountSubstringsCdataEndReturnsCorrectCount() + => Assert.That(CountSubstrings("foo]]>bar]]>baz", "]]>"), Is.EqualTo(2)); + + /// + /// Verifies that non-overlapping occurrences are counted and overlapping ones are not, + /// consistent with the method's documented assumption. + /// + [Test] + public void CountSubstringsNonOverlappingReturnsCorrectCount() + => Assert.That(CountSubstrings("aaaa", "aa"), Is.EqualTo(2)); + + /// + /// Verifies that a substring equal to the entire text counts as one match. + /// + [Test] + public void CountSubstringsSubstringEqualsTextReturnsOne() + => Assert.That(CountSubstrings("]]>", "]]>"), Is.EqualTo(1)); + #endregion } \ No newline at end of file diff --git a/src/log4net/Util/Transform.cs b/src/log4net/Util/Transform.cs index b646974b..843716e6 100644 --- a/src/log4net/Util/Transform.cs +++ b/src/log4net/Util/Transform.cs @@ -17,8 +17,9 @@ // #endregion -using System.Xml; +using System; using System.Text.RegularExpressions; +using System.Xml; namespace log4net.Util; @@ -110,9 +111,15 @@ public static void WriteEscapedXmlString(this XmlWriter writer, string textData, /// This method replaces any illegal characters in the input string /// with the mask string specified. /// + /// + /// Supplementary characters (U+10000–U+10FFFF) encoded as valid UTF-16 surrogate + /// pairs are preserved; only lone surrogates and other illegal code units are masked. + /// /// - public static string MaskXmlInvalidCharacters(string textData, string mask) - => _invalidchars.Replace(textData, mask); + public static string MaskXmlInvalidCharacters(string textData, string mask) + => _invalidChars.Replace(textData, m => + // A valid surrogate pair represents a legal supplementary character — preserve it. + m.Value.Length == 2 ? m.Value : mask); /// /// Count the number of times that the substring occurs in the text @@ -127,29 +134,38 @@ public static string MaskXmlInvalidCharacters(string textData, string mask) /// private static int CountSubstrings(string text, string substring) { - int count = 0; - int offset = 0; - int length = text.Length; - int substringLength = substring.Length; - - if (length == 0) + if (text.Length == 0 || substring.Length == 0) { return 0; } - if (substringLength == 0) + + // Use the char overload for single-character substrings — avoids string + // comparison overhead; all current callers pass single ASCII characters. + if (substring.Length == 1) { - return 0; + int charCount = 0; + char target = substring[0]; + foreach (char c in text) + { + if (c == target) + { + charCount++; + } + } + return charCount; } - while (offset < length) + int count = 0; + int offset = 0; + int substringLength = substring.Length; + while (offset < text.Length) { - int index = text.IndexOf(substring, offset); - + // Ordinal avoids culture-sensitive comparison for these ASCII-only tokens. + int index = text.IndexOf(substring, offset, StringComparison.Ordinal); if (index == -1) { break; } - count++; offset = index + substringLength; } @@ -160,7 +176,12 @@ private static int CountSubstrings(string text, string substring) private const string CdataUnescapableToken = "]]"; /// - /// Characters illegal in XML 1.0 + /// Matches either a valid UTF-16 surrogate pair (preserved) or a single character + /// that is illegal in XML 1.0 (replaced). The surrogate-pair alternative must come + /// first so the engine consumes both code units together before the single-char + /// alternative can match them individually. /// - private static readonly Regex _invalidchars = new(@"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]", RegexOptions.Compiled); -} + private static readonly Regex _invalidChars = new( + @"[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]", + RegexOptions.Compiled); +} \ No newline at end of file