Skip to content

Commit

Permalink
Implement workarounds for regex parsing known issues (#1603)
Browse files Browse the repository at this point in the history
  • Loading branch information
adams85 authored Jul 31, 2023
1 parent cc63b20 commit 6eddc61
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 49 deletions.
4 changes: 0 additions & 4 deletions Jint.Tests.Test262/Test262Harness.settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,10 @@
"language/literals/regexp/named-groups/forward-reference.js",

// RegExp handling problems
"built-ins/RegExp/match-indices/indices-array-unicode-property-names.js",
"built-ins/RegExp/named-groups/non-unicode-match.js",
"built-ins/RegExp/named-groups/non-unicode-property-names-valid.js",
"built-ins/RegExp/named-groups/non-unicode-property-names.js",
"built-ins/RegExp/named-groups/unicode-match.js",
"built-ins/RegExp/named-groups/unicode-property-names-valid.js",
"built-ins/RegExp/named-groups/unicode-property-names.js",
"built-ins/RegExp/prototype/Symbol.replace/named-groups.js",
"built-ins/RegExp/prototype/exec/S15.10.6.2_A1_T6.js",
"built-ins/String/prototype/split/separator-regexp.js",
"language/literals/regexp/u-case-mapping.js",
Expand Down
39 changes: 38 additions & 1 deletion Jint.Tests/Runtime/RegExpTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public void CanNotBreakEngineWithLongRunningRegExp()
public void PreventsInfiniteLoop()
{
var engine = new Engine();
var result = (ArrayInstance)engine.Evaluate("'x'.match(/|/g);");
var result = (ArrayInstance) engine.Evaluate("'x'.match(/|/g);");
Assert.Equal((uint) 2, result.Length);
Assert.Equal("", result[0]);
Assert.Equal("", result[1]);
Expand Down Expand Up @@ -103,4 +103,41 @@ public void ShouldProduceCorrectSourceForSlashEscapes()
var source = engine.Evaluate(@"/\/\//.source");
Assert.Equal("\\/\\/", source);
}

[Theory]
[InlineData("", "/()/ug", new[] { "" }, new[] { 0 })]
[InlineData("💩", "/()/ug", new[] { "", "" }, new[] { 0, 2 })]
[InlineData("ᴜⁿᵢ𝒸ₒᵈₑ is a 💩", "/i?/ug",
new[] { "", "", "", "", "", "", "", "", "i", "", "", "", "", "", "" },
new[] { 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 })]
public void ShouldNotMatchEmptyStringsWithinSurrogatePairsInUnicodeMode(string input, string pattern, string[] expectedCaptures, int[] expectedIndices)
{
var engine = new Engine();
var matches = engine.Evaluate($"[...'{input}'.matchAll({pattern})]").AsArray();
Assert.Equal((ulong) expectedCaptures.Length, matches.Length);
Assert.Equal(expectedCaptures, matches.Select((m, i) => m.Get(0).AsString()));
Assert.Equal(expectedIndices, matches.Select(m => m.Get("index").AsInteger()));
}

[Fact]
public void ShouldAllowProblematicGroupNames()
{
var engine = new Engine();

var match = engine.Evaluate("'abc'.match(/(?<$group>b)/)").AsArray();
var groups = match.Get("groups").AsObject();
Assert.Equal(new[] { "$group" }, groups.GetOwnPropertyKeys().Select(k => k.AsString()));
Assert.Equal("b", groups["$group"]);

var result = engine.Evaluate("'abc'.replace(/(?<$group>b)/g, '-$<$group>-')").AsString();
Assert.Equal("a-b-c", result);
}

[Fact]
public void Issue506()
{
var engine = new Engine();
var result = engine.Evaluate("/[^]?(:[rp][el]a[\\w-]+)[^]/.test(':reagent-')").AsBoolean();
Assert.True(result);
}
}
94 changes: 50 additions & 44 deletions Jint/Native/RegExp/RegExpPrototype.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Diagnostics.CodeAnalysis;
using System.Text;
using System.Text.RegularExpressions;
using Jint.Collections;
using Jint.Native.Number;
Expand Down Expand Up @@ -824,7 +825,7 @@ internal static JsValue RegExpExec(ObjectInstance r, string s)
var exec = r.Get(PropertyExec);
if (exec is ICallable callable)
{
var result = callable.Call(r, new JsValue[] { s });
var result = callable.Call(r, new JsValue[] { s });
if (!result.IsNull() && !result.IsObject())
{
ExceptionHelper.ThrowTypeError(r.Engine.Realm);
Expand Down Expand Up @@ -902,31 +903,47 @@ private static JsValue RegExpBuiltinExec(JsRegExp R, string s)

// the stateful version
Match match;
while (true)

if (lastIndex > length)
{
if (lastIndex > length)
{
R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true);
return Null;
}
R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true);
return Null;
}

match = R.Value.Match(s, (int) lastIndex);
var success = match.Success && (!sticky || match.Index == (int) lastIndex);
if (!success)
var startAt = (int) lastIndex;
while (true)
{
match = R.Value.Match(s, startAt);

// The conversion of Unicode regex patterns to .NET Regex has some flaws:
// when the pattern may match empty strings, the adapted Regex will return empty string matches
// in the middle of surrogate pairs. As a best effort solution, we remove these fake positive matches.
// (See also: https://github.com/sebastienros/esprima-dotnet/pull/364#issuecomment-1606045259)

if (match.Success
&& fullUnicode
&& match.Length == 0
&& 0 < match.Index && match.Index < s.Length
&& char.IsHighSurrogate(s[match.Index - 1]) && char.IsLowSurrogate(s[match.Index]))
{
R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true);
return Null;
startAt++;
continue;
}

break;
}

var e = match.Index + match.Length;
if (fullUnicode)
var success = match.Success && (!sticky || match.Index == (int) lastIndex);
if (!success)
{
e = GetStringIndex(s, e);
R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true);
return Null;
}

var e = match.Index + match.Length;

// NOTE: Even in Unicode mode, we don't need to translate indices as .NET regexes always return code unit indices.

if (global || sticky)
{
R.Set(JsRegExp.PropertyLastIndex, e, true);
Expand All @@ -935,35 +952,6 @@ private static JsValue RegExpBuiltinExec(JsRegExp R, string s)
return CreateReturnValueArray(R.Engine, matcher, match, s, fullUnicode, hasIndices);
}

/// <summary>
/// https://tc39.es/ecma262/#sec-getstringindex
/// </summary>
private static int GetStringIndex(string s, int codePointIndex)
{
if (s.Length == 0)
{
return 0;
}

var len = s.Length;
var codeUnitCount = 0;
var codePointCount = 0;

while (codeUnitCount < len)
{
if (codePointCount == codePointIndex)
{
return codeUnitCount;
}

var isSurrogatePair = char.IsSurrogatePair(s, codeUnitCount);
codeUnitCount += isSurrogatePair ? 2 : 1;
codePointCount += 1;
}

return len;
}

private static JsArray CreateReturnValueArray(
Engine engine,
Regex regex,
Expand Down Expand Up @@ -1080,6 +1068,24 @@ private static JsValue GetMatchIndexPair(Engine engine, string s, JsNumber[] mat
return null;

}

// The characters allowed in group names differs between the JS and .NET regex engines.
// For example the group name "$group" is valid in JS but invalid in .NET.
// As a workaround for this issue, the parser make an attempt to encode the problematic group names to
// names which are valid in .NET and probably won't collide with other group names present in the pattern
// (https://github.com/sebastienros/esprima-dotnet/blob/v3.0.0-rc-03/src/Esprima/Scanner.RegExpParser.cs#L942).
// We need to decode such group names.
const string encodedGroupNamePrefix = "__utf8_";
if (groupNameFromNumber.StartsWith(encodedGroupNamePrefix, StringComparison.Ordinal))
{
try
{
var bytes = groupNameFromNumber.AsSpan(encodedGroupNamePrefix.Length).BytesFromHexString();
groupNameFromNumber = Encoding.UTF8.GetString(bytes);
}
catch { /* intentional no-op */ }
}

return groupNameFromNumber;
}

Expand Down
41 changes: 41 additions & 0 deletions Jint/Shims.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
namespace Jint;

internal static class Shims
{
public static byte[] BytesFromHexString(this ReadOnlySpan<char> value)
{
#if NET6_0_OR_GREATER
return Convert.FromHexString(value);
#else
if ((value.Length & 1) != 0)
{
throw new FormatException();
}

var byteCount = value.Length >> 1;
var result = new byte[byteCount];
var index = 0;
for (var i = 0; i < byteCount; i++)
{
int hi, lo;
if ((hi = GetDigitValue(value[index++])) < 0
|| (lo = GetDigitValue(value[index++])) < 0)
{
throw new FormatException();
}

result[i] = (byte) (hi << 4 | lo);
}

return result;

static int GetDigitValue(char ch) => ch switch
{
>= '0' and <= '9' => ch - 0x30,
>= 'a' and <= 'f' => ch - 0x57,
>= 'A' and <= 'F' => ch - 0x37,
_ => -1
};
#endif
}
}

0 comments on commit 6eddc61

Please sign in to comment.