diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 821bbc95438d6..30c8e9a0404be 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -4512,9 +4512,7 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s // (In the future, we could possibly extend the rm.Analysis to produce a known // lower-bound and compare against that rather than always using 128 as the // pivot point.) - return negate ? - $"((ch = {chExpr}) < 128 || !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" : - $"((ch = {chExpr}) >= 128 && RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; + return EmitContainsNoAscii(); } if (analysis.AllAsciiContained) @@ -4522,9 +4520,7 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s // We determined that every ASCII character is in the class, for example // if the class were the negated example from case 1 above: // [^\p{IsGreek}\p{IsGreekExtended}]. - return negate ? - $"((ch = {chExpr}) >= 128 && !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" : - $"((ch = {chExpr}) < 128 || RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; + return EmitAllAsciiContained(); } // Now, our big hammer is to generate a lookup table that lets us quickly index by character into a yes/no @@ -4554,6 +4550,15 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s } }); + // There's a chance that the class contains either no ASCII characters or all of them + // even though the analysis did not detect it (for example if the class has a subtraction). + // In these trivial cases we skip the bit vector and emit the same check as for the analysis result. 
+ switch (bitVectorString) + { + case "\0\0\0\0\0\0\0\0": return EmitContainsNoAscii(); + case "\uffff\uffff\uffff\uffff\uffff\uffff\uffff\uffff": return EmitAllAsciiContained(); + } + // We determined that the character class may contain ASCII, so we // output the lookup against the lookup table. @@ -4583,6 +4588,20 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s return negate ? $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" : $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; + + string EmitContainsNoAscii() + { + return negate ? + $"((ch = {chExpr}) < 128 || !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" : + $"((ch = {chExpr}) >= 128 && RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; + } + + string EmitAllAsciiContained() + { + return negate ? + $"((ch = {chExpr}) >= 128 && !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" : + $"((ch = {chExpr}) < 128 || RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; + } } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 943b0363ea211..69a87ab534f9a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -5472,14 +5472,8 @@ void EmitCharInClass() Label doneLabel = DefineLabel(); Label comparisonLabel = DefineLabel(); - if (analysis.ContainsNoAscii) + void EmitContainsNoAscii() { - // We determined that the character class contains only non-ASCII, - // for example if the class were [\u1000-\u2000\u3000-\u4000\u5000-\u6000]. 
- // (In the future, we could possibly extend the analysis to produce a known - // lower-bound and compare against that rather than always using 128 as the - // pivot point.) - // ch >= 128 && RegexRunner.CharInClass(ch, "...") Ldloc(tempLocal); Ldc(128); @@ -5491,15 +5485,10 @@ void EmitCharInClass() Stloc(resultLocal); MarkLabel(doneLabel); Ldloc(resultLocal); - return; } - if (analysis.AllAsciiContained) + void EmitAllAsciiContained() { - // We determined that every ASCII character is in the class, for example - // if the class were the negated example from case 1 above: - // [^\p{IsGreek}\p{IsGreekExtended}]. - // ch < 128 || RegexRunner.CharInClass(ch, "...") Ldloc(tempLocal); Ldc(128); @@ -5511,6 +5500,27 @@ void EmitCharInClass() Stloc(resultLocal); MarkLabel(doneLabel); Ldloc(resultLocal); + } + + if (analysis.ContainsNoAscii) + { + // We determined that the character class contains only non-ASCII, + // for example if the class were [\u1000-\u2000\u3000-\u4000\u5000-\u6000]. + // (In the future, we could possibly extend the analysis to produce a known + // lower-bound and compare against that rather than always using 128 as the + // pivot point.) + + EmitContainsNoAscii(); + return; } + + if (analysis.AllAsciiContained) + { + // We determined that every ASCII character is in the class, for example + // if the class were the negated example from case 1 above: + // [^\p{IsGreek}\p{IsGreekExtended}]. + + EmitAllAsciiContained(); return; } @@ -5541,6 +5551,19 @@ void EmitCharInClass() } }); + // There's a chance that the class contains either no ASCII characters or all of them + // even though the analysis did not detect it (for example if the class has a subtraction). + // In these trivial cases we skip the bit vector and emit the same check as for the analysis result. 
+ switch (bitVectorString) + { + case "\0\0\0\0\0\0\0\0": + EmitContainsNoAscii(); + return; + case "\uffff\uffff\uffff\uffff\uffff\uffff\uffff\uffff": + EmitAllAsciiContained(); + return; + } + // We determined that the character class may contain ASCII, so we // output the lookup against the lookup table. diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Count.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Count.Tests.cs index bef3bb13d1873..038851643b9bc 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Count.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Count.Tests.cs @@ -75,6 +75,8 @@ public static IEnumerable Count_ReturnsExpectedCount_TestData() yield return new object[] { engine, @".", "\n\n\n", 0, RegexOptions.None, 0 }; yield return new object[] { engine, @".", "\n\n\n", 0, RegexOptions.Singleline, 3 }; + yield return new object[] { engine, @"[а-я-[аeиоуыэюя]]", "спокойной ночи", 0, RegexOptions.None, 8 }; + if (!RegexHelpers.IsNonBacktracking(engine)) { // Lookbehinds