diff --git a/src/changelog/3.3.1/291-fix-surrogate-pairs-in-mask-xml-invalid-characters.xml b/src/changelog/3.3.1/291-fix-surrogate-pairs-in-mask-xml-invalid-characters.xml
new file mode 100644
index 00000000..9ecba009
--- /dev/null
+++ b/src/changelog/3.3.1/291-fix-surrogate-pairs-in-mask-xml-invalid-characters.xml
@@ -0,0 +1,13 @@
+
+
+
+
+
+ fix silent corruption of supplementary Unicode characters (U+10000βU+10FFFF) in `MaskXmlInvalidCharacters`:
+ valid UTF-16 surrogate pairs are now preserved instead of being replaced with the mask string
+ (reported by @N0tre3l, fixed by @freeandnil in https://github.com/apache/logging-log4net/pull/291[#291])
+
+
\ No newline at end of file
diff --git a/src/log4net.Tests/Util/TransformTest.cs b/src/log4net.Tests/Util/TransformTest.cs
index b0e932ac..2b9b15f6 100644
--- a/src/log4net.Tests/Util/TransformTest.cs
+++ b/src/log4net.Tests/Util/TransformTest.cs
@@ -17,16 +17,20 @@
//
#endregion
+using System;
+using System.Reflection;
using log4net.Util;
-
using NUnit.Framework;
namespace log4net.Tests.Util;
-
[TestFixture]
public class TransformTest
{
+ ///
+ /// Verifies that a valid BMP character in the allowed XML 1.0 range (U+203B β»)
+ /// is passed through unchanged.
+ ///
[Test]
public void MaskXmlInvalidCharactersAllowsJapaneseCharacters()
{
@@ -34,10 +38,147 @@ public void MaskXmlInvalidCharactersAllowsJapaneseCharacters()
Assert.That(Transform.MaskXmlInvalidCharacters(kome, "?"), Is.EqualTo(kome));
}
+ ///
+ /// Verifies that the NUL character (U+0000), which is illegal in XML 1.0,
+ /// is replaced with the supplied mask string.
+ ///
[Test]
public void MaskXmlInvalidCharactersMasks0Char()
{
const string c = "\0";
Assert.That(Transform.MaskXmlInvalidCharacters(c, "?"), Is.EqualTo("?"));
}
+
+ ///
+ /// Verifies that a well-formed UTF-16 surrogate pair representing a supplementary
+ /// character (U+1F511 π) is treated as a valid XML character and preserved intact.
+ ///
+ [Test]
+ public void MaskXmlInvalidCharactersPreservesValidSurrogatePair()
+ {
+ // U+1F511 KEY (π) β encoded as surrogate pair \uD83D\uDD11
+ const string key = "\uD83D\uDD11";
+ Assert.That(Transform.MaskXmlInvalidCharacters(key, "?"), Is.EqualTo(key));
+ }
+
+ ///
+ /// Verifies that a supplementary character embedded within plain ASCII text
+ /// is preserved without corrupting the surrounding characters.
+ ///
+ [Test]
+ public void MaskXmlInvalidCharactersPreservesSupplementaryCharacterInContext()
+ {
+ // "adminπ" β only the emoji is a surrogate pair; all other chars are BMP
+ const string input = "admin\uD83D\uDD11";
+ Assert.That(Transform.MaskXmlInvalidCharacters(input, "?"), Is.EqualTo(input));
+ }
+
+ ///
+ /// Verifies that a lone high surrogate (U+D800βU+DBFF) not followed by a
+ /// low surrogate is malformed UTF-16 and is replaced with the mask string.
+ ///
+ [Test]
+ public void MaskXmlInvalidCharactersMasksLoneHighSurrogate()
+ {
+ // A lone high surrogate with no following low surrogate is illegal
+ const string loneSurrogate = "\uD83D";
+ Assert.That(Transform.MaskXmlInvalidCharacters(loneSurrogate, "?"), Is.EqualTo("?"));
+ }
+
+ ///
+ /// Verifies that a lone low surrogate (U+DC00βU+DFFF) not preceded by a
+ /// high surrogate is malformed UTF-16 and is replaced with the mask string.
+ ///
+ [Test]
+ public void MaskXmlInvalidCharactersMasksLoneLowSurrogate()
+ {
+ const string loneSurrogate = "\uDD11";
+ Assert.That(Transform.MaskXmlInvalidCharacters(loneSurrogate, "?"), Is.EqualTo("?"));
+ }
+
+ ///
+ /// Verifies current behaviour: a null input throws
+ /// (delegated to the regex engine). Add a null-guard to
+ /// if a graceful fallback is preferred.
+ ///
+ [Test]
+ public void MaskXmlInvalidCharactersNullInputThrowsArgumentNullException()
+ => Assert.That(() => Transform.MaskXmlInvalidCharacters(null!, "?"),
+ Throws.ArgumentNullException);
+
+ #region CountSubstrings tests (private method accessed via reflection)
+
+ ///
+ /// Resolves the private static method once for all tests
+ ///
+ private static Func CountSubstrings { get; } = (Func)Delegate.CreateDelegate(
+ typeof(Func), typeof(Transform).GetMethod(nameof(CountSubstrings), BindingFlags.NonPublic | BindingFlags.Static)
+ ?? throw new MissingMethodException(nameof(Transform), nameof(CountSubstrings)));
+
+ ///
+ /// Verifies that an empty text returns zero regardless of the substring.
+ ///
+ [Test]
+ public void CountSubstringsEmptyTextReturnsZero()
+ => Assert.That(CountSubstrings("", "a"), Is.Zero);
+
+ ///
+ /// Verifies that an empty substring returns zero regardless of the text.
+ ///
+ [Test]
+ public void CountSubstringsEmptySubstringReturnsZero()
+ => Assert.That(CountSubstrings("abc", ""), Is.Zero);
+
+ ///
+ /// Verifies that a substring not present in the text returns zero.
+ ///
+ [Test]
+ public void CountSubstringsNoMatchReturnsZero()
+ => Assert.That(CountSubstrings("hello", "x"), Is.Zero);
+
+ ///
+ /// Verifies that a single-character substring occurring once is counted correctly.
+ ///
+ [Test]
+ public void CountSubstringsSingleCharSingleMatchReturnsOne()
+ => Assert.That(CountSubstrings("hello", "h"), Is.EqualTo(1));
+
+ ///
+ /// Verifies that a single-character substring occurring multiple times is counted correctly.
+ ///
+ [Test]
+ public void CountSubstringsSingleCharMultipleMatchesReturnsCorrectCount()
+ => Assert.That(CountSubstrings("banana", "a"), Is.EqualTo(3));
+
+ ///
+ /// Verifies counting of XML special characters as used by WriteEscapedXmlString.
+ ///
+ [TestCase("", "<", 2)]
+ [TestCase("", ">", 2)]
+ [TestCase("a&&b", "&", 2)]
+ public void CountSubstringsXmlSpecialCharsReturnsCorrectCount(string text, string substring, int expected)
+ => Assert.That(CountSubstrings(text, substring), Is.EqualTo(expected));
+
+ ///
+ /// Verifies that the CDATA end token ]]> is counted correctly as a multi-character substring.
+ ///
+ [Test]
+ public void CountSubstringsCdataEndReturnsCorrectCount()
+ => Assert.That(CountSubstrings("foo]]>bar]]>baz", "]]>"), Is.EqualTo(2));
+
+ ///
+ /// Verifies that non-overlapping occurrences are counted and overlapping ones are not,
+ /// consistent with the method's documented assumption.
+ ///
+ [Test]
+ public void CountSubstringsNonOverlappingReturnsCorrectCount()
+ => Assert.That(CountSubstrings("aaaa", "aa"), Is.EqualTo(2));
+
+ ///
+ /// Verifies that a substring equal to the entire text counts as one match.
+ ///
+ [Test]
+ public void CountSubstringsSubstringEqualsTextReturnsOne()
+ => Assert.That(CountSubstrings("]]>", "]]>"), Is.EqualTo(1));
+ #endregion
}
\ No newline at end of file
diff --git a/src/log4net/Util/Transform.cs b/src/log4net/Util/Transform.cs
index b646974b..843716e6 100644
--- a/src/log4net/Util/Transform.cs
+++ b/src/log4net/Util/Transform.cs
@@ -17,8 +17,9 @@
//
#endregion
-using System.Xml;
+using System;
using System.Text.RegularExpressions;
+using System.Xml;
namespace log4net.Util;
@@ -110,9 +111,15 @@ public static void WriteEscapedXmlString(this XmlWriter writer, string textData,
/// This method replaces any illegal characters in the input string
/// with the mask string specified.
///
+ ///
+ /// Supplementary characters (U+10000U+10FFFF) encoded as valid UTF-16 surrogate
+ /// pairs are preserved; only lone surrogates and other illegal code units are masked.
+ ///
///
- public static string MaskXmlInvalidCharacters(string textData, string mask)
- => _invalidchars.Replace(textData, mask);
+ public static string MaskXmlInvalidCharacters(string textData, string mask)
+ => _invalidChars.Replace(textData, m =>
+ // A valid surrogate pair represents a legal supplementary character preserve it.
+ m.Value.Length == 2 ? m.Value : mask);
///
/// Count the number of times that the substring occurs in the text
@@ -127,29 +134,38 @@ public static string MaskXmlInvalidCharacters(string textData, string mask)
///
private static int CountSubstrings(string text, string substring)
{
- int count = 0;
- int offset = 0;
- int length = text.Length;
- int substringLength = substring.Length;
-
- if (length == 0)
+ if (text.Length == 0 || substring.Length == 0)
{
return 0;
}
- if (substringLength == 0)
+
+ // Use the char overload for single-character substrings avoids string
+ // comparison overhead; all current callers pass single ASCII characters.
+ if (substring.Length == 1)
{
- return 0;
+ int charCount = 0;
+ char target = substring[0];
+ foreach (char c in text)
+ {
+ if (c == target)
+ {
+ charCount++;
+ }
+ }
+ return charCount;
}
- while (offset < length)
+ int count = 0;
+ int offset = 0;
+ int substringLength = substring.Length;
+ while (offset < text.Length)
{
- int index = text.IndexOf(substring, offset);
-
+ // Ordinal avoids culture-sensitive comparison for these ASCII-only tokens.
+ int index = text.IndexOf(substring, offset, StringComparison.Ordinal);
if (index == -1)
{
break;
}
-
count++;
offset = index + substringLength;
}
@@ -160,7 +176,12 @@ private static int CountSubstrings(string text, string substring)
private const string CdataUnescapableToken = "]]";
///
- /// Characters illegal in XML 1.0
+ /// Matches either a valid UTF-16 surrogate pair (preserved) or a single character
+ /// that is illegal in XML 1.0 (replaced). The surrogate-pair alternative must come
+ /// first so the engine consumes both code units together before the single-char
+ /// alternative can match them individually.
///
- private static readonly Regex _invalidchars = new(@"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]", RegexOptions.Compiled);
-}
+ private static readonly Regex _invalidChars = new(
+ @"[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]",
+ RegexOptions.Compiled);
+}
\ No newline at end of file