From 6eddc61db4ff872d993c034716e142795b922e38 Mon Sep 17 00:00:00 2001 From: adams85 <31276480+adams85@users.noreply.github.com> Date: Mon, 31 Jul 2023 14:22:54 +0200 Subject: [PATCH] Implement workarounds for regex parsing known issues (#1603) --- .../Test262Harness.settings.json | 4 - Jint.Tests/Runtime/RegExpTests.cs | 39 +++++++- Jint/Native/RegExp/RegExpPrototype.cs | 94 ++++++++++--------- Jint/Shims.cs | 41 ++++++++ 4 files changed, 129 insertions(+), 49 deletions(-) create mode 100644 Jint/Shims.cs diff --git a/Jint.Tests.Test262/Test262Harness.settings.json b/Jint.Tests.Test262/Test262Harness.settings.json index 0140de4ee3..54a56f0059 100644 --- a/Jint.Tests.Test262/Test262Harness.settings.json +++ b/Jint.Tests.Test262/Test262Harness.settings.json @@ -44,14 +44,10 @@ "language/literals/regexp/named-groups/forward-reference.js", // RegExp handling problems - "built-ins/RegExp/match-indices/indices-array-unicode-property-names.js", "built-ins/RegExp/named-groups/non-unicode-match.js", "built-ins/RegExp/named-groups/non-unicode-property-names-valid.js", - "built-ins/RegExp/named-groups/non-unicode-property-names.js", "built-ins/RegExp/named-groups/unicode-match.js", "built-ins/RegExp/named-groups/unicode-property-names-valid.js", - "built-ins/RegExp/named-groups/unicode-property-names.js", - "built-ins/RegExp/prototype/Symbol.replace/named-groups.js", "built-ins/RegExp/prototype/exec/S15.10.6.2_A1_T6.js", "built-ins/String/prototype/split/separator-regexp.js", "language/literals/regexp/u-case-mapping.js", diff --git a/Jint.Tests/Runtime/RegExpTests.cs b/Jint.Tests/Runtime/RegExpTests.cs index b9f08ea3d7..af27f11aa8 100644 --- a/Jint.Tests/Runtime/RegExpTests.cs +++ b/Jint.Tests/Runtime/RegExpTests.cs @@ -35,7 +35,7 @@ public void CanNotBreakEngineWithLongRunningRegExp() public void PreventsInfiniteLoop() { var engine = new Engine(); - var result = (ArrayInstance)engine.Evaluate("'x'.match(/|/g);"); + var result = (ArrayInstance) engine.Evaluate("'x'.match(/|/g);"); Assert.Equal((uint) 2, result.Length); Assert.Equal("", result[0]); Assert.Equal("", result[1]); @@ -103,4 +103,41 @@ public void ShouldProduceCorrectSourceForSlashEscapes() var source = engine.Evaluate(@"/\/\//.source"); Assert.Equal("\\/\\/", source); } + + [Theory] + [InlineData("", "/()/ug", new[] { "" }, new[] { 0 })] + [InlineData("💩", "/()/ug", new[] { "", "" }, new[] { 0, 2 })] + [InlineData("ᴜⁿᵢ𝒸ₒᵈₑ is a 💩", "/i?/ug", + new[] { "", "", "", "", "", "", "", "", "i", "", "", "", "", "", "" }, + new[] { 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 })] + public void ShouldNotMatchEmptyStringsWithinSurrogatePairsInUnicodeMode(string input, string pattern, string[] expectedCaptures, int[] expectedIndices) + { + var engine = new Engine(); + var matches = engine.Evaluate($"[...'{input}'.matchAll({pattern})]").AsArray(); + Assert.Equal((ulong) expectedCaptures.Length, matches.Length); + Assert.Equal(expectedCaptures, matches.Select((m, i) => m.Get(0).AsString())); + Assert.Equal(expectedIndices, matches.Select(m => m.Get("index").AsInteger())); + } + + [Fact] + public void ShouldAllowProblematicGroupNames() + { + var engine = new Engine(); + + var match = engine.Evaluate("'abc'.match(/(?<$group>b)/)").AsArray(); + var groups = match.Get("groups").AsObject(); + Assert.Equal(new[] { "$group" }, groups.GetOwnPropertyKeys().Select(k => k.AsString())); + Assert.Equal("b", groups["$group"]); + + var result = engine.Evaluate("'abc'.replace(/(?<$group>b)/g, '-$<$group>-')").AsString(); + Assert.Equal("a-b-c", result); + } + + [Fact] + public void Issue506() + { + var engine = new Engine(); + var result = engine.Evaluate("/[^]?(:[rp][el]a[\\w-]+)[^]/.test(':reagent-')").AsBoolean(); + Assert.True(result); + } } diff --git a/Jint/Native/RegExp/RegExpPrototype.cs b/Jint/Native/RegExp/RegExpPrototype.cs index 265b6fb9f1..65fbe03f07 100644 --- a/Jint/Native/RegExp/RegExpPrototype.cs +++ b/Jint/Native/RegExp/RegExpPrototype.cs @@ -1,4 +1,5 @@ using System.Diagnostics.CodeAnalysis; +using System.Text; using System.Text.RegularExpressions; using Jint.Collections; using Jint.Native.Number; @@ -824,7 +825,7 @@ internal static JsValue RegExpExec(ObjectInstance r, string s) var exec = r.Get(PropertyExec); if (exec is ICallable callable) { - var result = callable.Call(r, new JsValue[] { s }); + var result = callable.Call(r, new JsValue[] { s }); if (!result.IsNull() && !result.IsObject()) { ExceptionHelper.ThrowTypeError(r.Engine.Realm); @@ -902,31 +903,47 @@ private static JsValue RegExpBuiltinExec(JsRegExp R, string s) // the stateful version Match match; - while (true) + + if (lastIndex > length) { - if (lastIndex > length) - { - R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true); - return Null; - } + R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true); + return Null; + } - match = R.Value.Match(s, (int) lastIndex); - var success = match.Success && (!sticky || match.Index == (int) lastIndex); - if (!success) + var startAt = (int) lastIndex; + while (true) + { + match = R.Value.Match(s, startAt); + + // The conversion of Unicode regex patterns to .NET Regex has some flaws: + // when the pattern may match empty strings, the adapted Regex will return empty string matches + // in the middle of surrogate pairs. As a best effort solution, we remove these fake positive matches. + // (See also: https://github.com/sebastienros/esprima-dotnet/pull/364#issuecomment-1606045259) + + if (match.Success + && fullUnicode + && match.Length == 0 + && 0 < match.Index && match.Index < s.Length + && char.IsHighSurrogate(s[match.Index - 1]) && char.IsLowSurrogate(s[match.Index])) { - R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true); - return Null; + startAt++; + continue; } break; } - var e = match.Index + match.Length; - if (fullUnicode) + var success = match.Success && (!sticky || match.Index == (int) lastIndex); + if (!success) { - e = GetStringIndex(s, e); + R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true); + return Null; } + var e = match.Index + match.Length; + + // NOTE: Even in Unicode mode, we don't need to translate indices as .NET regexes always return code unit indices. + if (global || sticky) { R.Set(JsRegExp.PropertyLastIndex, e, true); @@ -935,35 +952,6 @@ private static JsValue RegExpBuiltinExec(JsRegExp R, string s) return CreateReturnValueArray(R.Engine, matcher, match, s, fullUnicode, hasIndices); } - /// - /// https://tc39.es/ecma262/#sec-getstringindex - /// - private static int GetStringIndex(string s, int codePointIndex) - { - if (s.Length == 0) - { - return 0; - } - - var len = s.Length; - var codeUnitCount = 0; - var codePointCount = 0; - - while (codeUnitCount < len) - { - if (codePointCount == codePointIndex) - { - return codeUnitCount; - } - - var isSurrogatePair = char.IsSurrogatePair(s, codeUnitCount); - codeUnitCount += isSurrogatePair ? 2 : 1; - codePointCount += 1; - } - - return len; - } - private static JsArray CreateReturnValueArray( Engine engine, Regex regex, @@ -1080,6 +1068,24 @@ private static JsValue GetMatchIndexPair(Engine engine, string s, JsNumber[] mat return null; } + + // The characters allowed in group names differs between the JS and .NET regex engines. + // For example the group name "$group" is valid in JS but invalid in .NET. + // As a workaround for this issue, the parser make an attempt to encode the problematic group names to + // names which are valid in .NET and probably won't collide with other group names present in the pattern + // (https://github.com/sebastienros/esprima-dotnet/blob/v3.0.0-rc-03/src/Esprima/Scanner.RegExpParser.cs#L942). + // We need to decode such group names. + const string encodedGroupNamePrefix = "__utf8_"; + if (groupNameFromNumber.StartsWith(encodedGroupNamePrefix, StringComparison.Ordinal)) + { + try + { + var bytes = groupNameFromNumber.AsSpan(encodedGroupNamePrefix.Length).BytesFromHexString(); + groupNameFromNumber = Encoding.UTF8.GetString(bytes); + } + catch { /* intentional no-op */ } + } + return groupNameFromNumber; } diff --git a/Jint/Shims.cs b/Jint/Shims.cs new file mode 100644 index 0000000000..9327192f20 --- /dev/null +++ b/Jint/Shims.cs @@ -0,0 +1,41 @@ +namespace Jint; + +internal static class Shims +{ + public static byte[] BytesFromHexString(this ReadOnlySpan value) + { +#if NET6_0_OR_GREATER + return Convert.FromHexString(value); +#else + if ((value.Length & 1) != 0) + { + throw new FormatException(); + } + + var byteCount = value.Length >> 1; + var result = new byte[byteCount]; + var index = 0; + for (var i = 0; i < byteCount; i++) + { + int hi, lo; + if ((hi = GetDigitValue(value[index++])) < 0 + || (lo = GetDigitValue(value[index++])) < 0) + { + throw new FormatException(); + } + + result[i] = (byte) (hi << 4 | lo); + } + + return result; + + static int GetDigitValue(char ch) => ch switch + { + >= '0' and <= '9' => ch - 0x30, + >= 'a' and <= 'f' => ch - 0x57, + >= 'A' and <= 'F' => ch - 0x37, + _ => -1 + }; +#endif + } +}