From 6eddc61db4ff872d993c034716e142795b922e38 Mon Sep 17 00:00:00 2001
From: adams85 <31276480+adams85@users.noreply.github.com>
Date: Mon, 31 Jul 2023 14:22:54 +0200
Subject: [PATCH] Implement workarounds for regex parsing known issues (#1603)
---
.../Test262Harness.settings.json | 4 -
Jint.Tests/Runtime/RegExpTests.cs | 39 +++++++-
Jint/Native/RegExp/RegExpPrototype.cs | 94 ++++++++++---------
Jint/Shims.cs | 41 ++++++++
4 files changed, 129 insertions(+), 49 deletions(-)
create mode 100644 Jint/Shims.cs
diff --git a/Jint.Tests.Test262/Test262Harness.settings.json b/Jint.Tests.Test262/Test262Harness.settings.json
index 0140de4ee3..54a56f0059 100644
--- a/Jint.Tests.Test262/Test262Harness.settings.json
+++ b/Jint.Tests.Test262/Test262Harness.settings.json
@@ -44,14 +44,10 @@
"language/literals/regexp/named-groups/forward-reference.js",
// RegExp handling problems
- "built-ins/RegExp/match-indices/indices-array-unicode-property-names.js",
"built-ins/RegExp/named-groups/non-unicode-match.js",
"built-ins/RegExp/named-groups/non-unicode-property-names-valid.js",
- "built-ins/RegExp/named-groups/non-unicode-property-names.js",
"built-ins/RegExp/named-groups/unicode-match.js",
"built-ins/RegExp/named-groups/unicode-property-names-valid.js",
- "built-ins/RegExp/named-groups/unicode-property-names.js",
- "built-ins/RegExp/prototype/Symbol.replace/named-groups.js",
"built-ins/RegExp/prototype/exec/S15.10.6.2_A1_T6.js",
"built-ins/String/prototype/split/separator-regexp.js",
"language/literals/regexp/u-case-mapping.js",
diff --git a/Jint.Tests/Runtime/RegExpTests.cs b/Jint.Tests/Runtime/RegExpTests.cs
index b9f08ea3d7..af27f11aa8 100644
--- a/Jint.Tests/Runtime/RegExpTests.cs
+++ b/Jint.Tests/Runtime/RegExpTests.cs
@@ -35,7 +35,7 @@ public void CanNotBreakEngineWithLongRunningRegExp()
public void PreventsInfiniteLoop()
{
var engine = new Engine();
- var result = (ArrayInstance)engine.Evaluate("'x'.match(/|/g);");
+ var result = (ArrayInstance) engine.Evaluate("'x'.match(/|/g);");
Assert.Equal((uint) 2, result.Length);
Assert.Equal("", result[0]);
Assert.Equal("", result[1]);
@@ -103,4 +103,41 @@ public void ShouldProduceCorrectSourceForSlashEscapes()
var source = engine.Evaluate(@"/\/\//.source");
Assert.Equal("\\/\\/", source);
}
+
+ [Theory]
+ [InlineData("", "/()/ug", new[] { "" }, new[] { 0 })]
+ [InlineData("💩", "/()/ug", new[] { "", "" }, new[] { 0, 2 })]
+ [InlineData("ᴜⁿᵢ𝒸ₒᵈₑ is a 💩", "/i?/ug",
+ new[] { "", "", "", "", "", "", "", "", "i", "", "", "", "", "", "" },
+ new[] { 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 })]
+ public void ShouldNotMatchEmptyStringsWithinSurrogatePairsInUnicodeMode(string input, string pattern, string[] expectedCaptures, int[] expectedIndices)
+ {
+ var engine = new Engine();
+ var matches = engine.Evaluate($"[...'{input}'.matchAll({pattern})]").AsArray();
+ Assert.Equal((ulong) expectedCaptures.Length, matches.Length);
+ Assert.Equal(expectedCaptures, matches.Select((m, i) => m.Get(0).AsString()));
+ Assert.Equal(expectedIndices, matches.Select(m => m.Get("index").AsInteger()));
+ }
+
+ [Fact]
+ public void ShouldAllowProblematicGroupNames()
+ {
+ var engine = new Engine();
+
+ var match = engine.Evaluate("'abc'.match(/(?<$group>b)/)").AsArray();
+ var groups = match.Get("groups").AsObject();
+ Assert.Equal(new[] { "$group" }, groups.GetOwnPropertyKeys().Select(k => k.AsString()));
+ Assert.Equal("b", groups["$group"]);
+
+ var result = engine.Evaluate("'abc'.replace(/(?<$group>b)/g, '-$<$group>-')").AsString();
+ Assert.Equal("a-b-c", result);
+ }
+
+ [Fact]
+ public void Issue506()
+ {
+ var engine = new Engine();
+ var result = engine.Evaluate("/[^]?(:[rp][el]a[\\w-]+)[^]/.test(':reagent-')").AsBoolean();
+ Assert.True(result);
+ }
}
diff --git a/Jint/Native/RegExp/RegExpPrototype.cs b/Jint/Native/RegExp/RegExpPrototype.cs
index 265b6fb9f1..65fbe03f07 100644
--- a/Jint/Native/RegExp/RegExpPrototype.cs
+++ b/Jint/Native/RegExp/RegExpPrototype.cs
@@ -1,4 +1,5 @@
using System.Diagnostics.CodeAnalysis;
+using System.Text;
using System.Text.RegularExpressions;
using Jint.Collections;
using Jint.Native.Number;
@@ -824,7 +825,7 @@ internal static JsValue RegExpExec(ObjectInstance r, string s)
var exec = r.Get(PropertyExec);
if (exec is ICallable callable)
{
- var result = callable.Call(r, new JsValue[] { s });
+ var result = callable.Call(r, new JsValue[] { s });
if (!result.IsNull() && !result.IsObject())
{
ExceptionHelper.ThrowTypeError(r.Engine.Realm);
@@ -902,31 +903,47 @@ private static JsValue RegExpBuiltinExec(JsRegExp R, string s)
// the stateful version
Match match;
- while (true)
+
+ if (lastIndex > length)
{
- if (lastIndex > length)
- {
- R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true);
- return Null;
- }
+ R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true);
+ return Null;
+ }
- match = R.Value.Match(s, (int) lastIndex);
- var success = match.Success && (!sticky || match.Index == (int) lastIndex);
- if (!success)
+ var startAt = (int) lastIndex;
+ while (true)
+ {
+ match = R.Value.Match(s, startAt);
+
+ // The conversion of Unicode regex patterns to .NET Regex has some flaws:
+ // when the pattern may match empty strings, the adapted Regex will return empty string matches
+ // in the middle of surrogate pairs. As a best effort solution, we remove these fake positive matches.
+ // (See also: https://github.com/sebastienros/esprima-dotnet/pull/364#issuecomment-1606045259)
+
+ if (match.Success
+ && fullUnicode
+ && match.Length == 0
+ && 0 < match.Index && match.Index < s.Length
+ && char.IsHighSurrogate(s[match.Index - 1]) && char.IsLowSurrogate(s[match.Index]))
{
- R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true);
- return Null;
+ startAt++;
+ continue;
}
break;
}
- var e = match.Index + match.Length;
- if (fullUnicode)
+ var success = match.Success && (!sticky || match.Index == (int) lastIndex);
+ if (!success)
{
- e = GetStringIndex(s, e);
+ R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true);
+ return Null;
}
+ var e = match.Index + match.Length;
+
+ // NOTE: Even in Unicode mode, we don't need to translate indices as .NET regexes always return code unit indices.
+
if (global || sticky)
{
R.Set(JsRegExp.PropertyLastIndex, e, true);
@@ -935,35 +952,6 @@ private static JsValue RegExpBuiltinExec(JsRegExp R, string s)
return CreateReturnValueArray(R.Engine, matcher, match, s, fullUnicode, hasIndices);
}
- ///
- /// https://tc39.es/ecma262/#sec-getstringindex
- ///
- private static int GetStringIndex(string s, int codePointIndex)
- {
- if (s.Length == 0)
- {
- return 0;
- }
-
- var len = s.Length;
- var codeUnitCount = 0;
- var codePointCount = 0;
-
- while (codeUnitCount < len)
- {
- if (codePointCount == codePointIndex)
- {
- return codeUnitCount;
- }
-
- var isSurrogatePair = char.IsSurrogatePair(s, codeUnitCount);
- codeUnitCount += isSurrogatePair ? 2 : 1;
- codePointCount += 1;
- }
-
- return len;
- }
-
private static JsArray CreateReturnValueArray(
Engine engine,
Regex regex,
@@ -1080,6 +1068,24 @@ private static JsValue GetMatchIndexPair(Engine engine, string s, JsNumber[] mat
return null;
}
+
+ // The characters allowed in group names differs between the JS and .NET regex engines.
+ // For example the group name "$group" is valid in JS but invalid in .NET.
+ // As a workaround for this issue, the parser make an attempt to encode the problematic group names to
+ // names which are valid in .NET and probably won't collide with other group names present in the pattern
+ // (https://github.com/sebastienros/esprima-dotnet/blob/v3.0.0-rc-03/src/Esprima/Scanner.RegExpParser.cs#L942).
+ // We need to decode such group names.
+ const string encodedGroupNamePrefix = "__utf8_";
+ if (groupNameFromNumber.StartsWith(encodedGroupNamePrefix, StringComparison.Ordinal))
+ {
+ try
+ {
+ var bytes = groupNameFromNumber.AsSpan(encodedGroupNamePrefix.Length).BytesFromHexString();
+ groupNameFromNumber = Encoding.UTF8.GetString(bytes);
+ }
+ catch { /* intentional no-op */ }
+ }
+
return groupNameFromNumber;
}
diff --git a/Jint/Shims.cs b/Jint/Shims.cs
new file mode 100644
index 0000000000..9327192f20
--- /dev/null
+++ b/Jint/Shims.cs
@@ -0,0 +1,41 @@
+namespace Jint;
+
+internal static class Shims
+{
+ public static byte[] BytesFromHexString(this ReadOnlySpan value)
+ {
+#if NET6_0_OR_GREATER
+ return Convert.FromHexString(value);
+#else
+ if ((value.Length & 1) != 0)
+ {
+ throw new FormatException();
+ }
+
+ var byteCount = value.Length >> 1;
+ var result = new byte[byteCount];
+ var index = 0;
+ for (var i = 0; i < byteCount; i++)
+ {
+ int hi, lo;
+ if ((hi = GetDigitValue(value[index++])) < 0
+ || (lo = GetDigitValue(value[index++])) < 0)
+ {
+ throw new FormatException();
+ }
+
+ result[i] = (byte) (hi << 4 | lo);
+ }
+
+ return result;
+
+ static int GetDigitValue(char ch) => ch switch
+ {
+ >= '0' and <= '9' => ch - 0x30,
+ >= 'a' and <= 'f' => ch - 0x57,
+ >= 'A' and <= 'F' => ch - 0x37,
+ _ => -1
+ };
+#endif
+ }
+}