Skip to content

Commit

Permalink
Do not use the string bit vector in regexes, if it's all ones or all …
Browse files Browse the repository at this point in the history
…zeroes. (#72317)

* Do not use the string bit vector in regexes, if it's all ones or all zeroes.

* Add a test.
  • Loading branch information
teo-tsirpanis authored Jul 17, 2022
1 parent cd5e461 commit 8a709bc
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4512,19 +4512,15 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s
// (In the future, we could possibly extend the rm.Analysis to produce a known
// lower-bound and compare against that rather than always using 128 as the
// pivot point.)
return negate ?
$"((ch = {chExpr}) < 128 || !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" :
$"((ch = {chExpr}) >= 128 && RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";
return EmitContainsNoAscii();
}

if (analysis.AllAsciiContained)
{
// We determined that every ASCII character is in the class, for example
// if the class were the negated example from case 1 above:
// [^\p{IsGreek}\p{IsGreekExtended}].
return negate ?
$"((ch = {chExpr}) >= 128 && !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" :
$"((ch = {chExpr}) < 128 || RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";
return EmitAllAsciiContained();
}

// Now, our big hammer is to generate a lookup table that lets us quickly index by character into a yes/no
Expand Down Expand Up @@ -4554,6 +4550,15 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s
}
});

// There's a chance that the class contains either no ASCII characters or all of them,
// and the analysis could not find it (for example if the class has a subtraction).
// We optimize away the bit vector in these trivial cases.
switch (bitVectorString)
{
case "\0\0\0\0\0\0\0\0": return EmitContainsNoAscii();
case "\uffff\uffff\uffff\uffff\uffff\uffff\uffff\uffff": return EmitAllAsciiContained();
}

// We determined that the character class may contain ASCII, so we
// output the lookup against the lookup table.

Expand Down Expand Up @@ -4583,6 +4588,20 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s
return negate ?
$"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" :
$"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";

// Builds the C# expression text used when the class is known to contain no ASCII characters.
// The emitted expression stores chExpr into `ch` and then only consults
// RegexRunner.CharInClass for values at or above 128.
string EmitContainsNoAscii()
{
    if (negate)
    {
        // Negated match: anything below 128 trivially matches; otherwise the class lookup must fail.
        return $"((ch = {chExpr}) < 128 || !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";
    }

    // Positive match: the character must be at or above 128 and the class lookup must succeed.
    return $"((ch = {chExpr}) >= 128 && RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";
}

// Builds the C# expression text used when every ASCII character is known to be in the class.
// The emitted expression stores chExpr into `ch` and then only consults
// RegexRunner.CharInClass for values at or above 128.
string EmitAllAsciiContained()
{
    if (negate)
    {
        // Negated match: the character must be at or above 128 and the class lookup must fail.
        return $"((ch = {chExpr}) >= 128 && !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";
    }

    // Positive match: anything below 128 trivially matches; otherwise defer to the class lookup.
    return $"((ch = {chExpr}) < 128 || RegexRunner.CharInClass((char)ch, {Literal(charClass)}))";
}
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5472,14 +5472,8 @@ void EmitCharInClass()
Label doneLabel = DefineLabel();
Label comparisonLabel = DefineLabel();

if (analysis.ContainsNoAscii)
void EmitContainsNoAscii()
{
// We determined that the character class contains only non-ASCII,
// for example if the class were [\u1000-\u2000\u3000-\u4000\u5000-\u6000].
// (In the future, we could possibly extend the analysis to produce a known
// lower-bound and compare against that rather than always using 128 as the
// pivot point.)

// ch >= 128 && RegexRunner.CharInClass(ch, "...")
Ldloc(tempLocal);
Ldc(128);
Expand All @@ -5491,15 +5485,10 @@ void EmitCharInClass()
Stloc(resultLocal);
MarkLabel(doneLabel);
Ldloc(resultLocal);
return;
}

if (analysis.AllAsciiContained)
void EmitAllAsciiContained()
{
// We determined that every ASCII character is in the class, for example
// if the class were the negated example from case 1 above:
// [^\p{IsGreek}\p{IsGreekExtended}].

// ch < 128 || RegexRunner.CharInClass(ch, "...")
Ldloc(tempLocal);
Ldc(128);
Expand All @@ -5511,6 +5500,27 @@ void EmitCharInClass()
Stloc(resultLocal);
MarkLabel(doneLabel);
Ldloc(resultLocal);
}

if (analysis.ContainsNoAscii)
{
// We determined that the character class contains only non-ASCII,
// for example if the class were [\u1000-\u2000\u3000-\u4000\u5000-\u6000].
// (In the future, we could possibly extend the analysis to produce a known
// lower-bound and compare against that rather than always using 128 as the
// pivot point.)

EmitContainsNoAscii();
return;
}

if (analysis.AllAsciiContained)
{
// We determined that every ASCII character is in the class, for example
// if the class were the negated example from case 1 above:
// [^\p{IsGreek}\p{IsGreekExtended}].

EmitAllAsciiContained();
return;
}

Expand Down Expand Up @@ -5541,6 +5551,19 @@ void EmitCharInClass()
}
});

// There's a chance that the class contains either no ASCII characters or all of them,
// and the analysis could not find it (for example if the class has a subtraction).
// We optimize away the bit vector in these trivial cases.
switch (bitVectorString)
{
case "\0\0\0\0\0\0\0\0":
EmitContainsNoAscii();
return;
case "\uffff\uffff\uffff\uffff\uffff\uffff\uffff\uffff":
EmitAllAsciiContained();
return;
}

// We determined that the character class may contain ASCII, so we
// output the lookup against the lookup table.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ public static IEnumerable<object[]> Count_ReturnsExpectedCount_TestData()
yield return new object[] { engine, @".", "\n\n\n", 0, RegexOptions.None, 0 };
yield return new object[] { engine, @".", "\n\n\n", 0, RegexOptions.Singleline, 3 };

yield return new object[] { engine, @"[а-я-[аeиоуыэюя]]", "спокойной ночи", 0, RegexOptions.None, 8 };

if (!RegexHelpers.IsNonBacktracking(engine))
{
// Lookbehinds
Expand Down

0 comments on commit 8a709bc

Please sign in to comment.