Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correct regular expression flags scanning for non-BMP characters #58612

Merged
merged 5 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 35 additions & 33 deletions src/compiler/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -282,16 +282,16 @@ const textToToken = new Map(Object.entries({
"`": SyntaxKind.BacktickToken,
}));

// Lookup table from a regular-expression flag letter (a one-character string key)
// to its RegularExpressionFlags value. It is read by characterToRegularExpressionFlag
// and inverted through makeReverseMap to build regExpFlagChars.
// NOTE(review): charCodeToRegExpFlag lists the same eight flags keyed by character
// code instead of by string — this looks like the pre-change side of a diff hunk;
// confirm that only one of the two tables is meant to remain.
const charToRegExpFlag = new Map(Object.entries({
d: RegularExpressionFlags.HasIndices,
g: RegularExpressionFlags.Global,
i: RegularExpressionFlags.IgnoreCase,
m: RegularExpressionFlags.Multiline,
s: RegularExpressionFlags.DotAll,
u: RegularExpressionFlags.Unicode,
v: RegularExpressionFlags.UnicodeSets,
y: RegularExpressionFlags.Sticky,
}));
// Lookup table from the character code of a regular-expression flag letter to its
// RegularExpressionFlags value. It is read by characterCodeToRegularExpressionFlag
// and inverted through makeReverseMap to build regExpFlagCharCodes.
// Keys are numeric codes rather than strings, so the flag scanners can pass the value
// returned by codePointChecked(pos) directly; any code that is not listed here yields
// undefined from the lookup, which the scanners report as an unknown flag.
const charCodeToRegExpFlag = new Map<CharacterCodes, RegularExpressionFlags>([
[CharacterCodes.d, RegularExpressionFlags.HasIndices],
[CharacterCodes.g, RegularExpressionFlags.Global],
[CharacterCodes.i, RegularExpressionFlags.IgnoreCase],
[CharacterCodes.m, RegularExpressionFlags.Multiline],
[CharacterCodes.s, RegularExpressionFlags.DotAll],
[CharacterCodes.u, RegularExpressionFlags.Unicode],
[CharacterCodes.v, RegularExpressionFlags.UnicodeSets],
[CharacterCodes.y, RegularExpressionFlags.Sticky],
]);

const regExpFlagToFirstAvailableLanguageVersion = new Map<RegularExpressionFlags, LanguageFeatureMinimumTarget>([
[RegularExpressionFlags.HasIndices, LanguageFeatureMinimumTarget.RegularExpressionFlagsHasIndices],
Expand Down Expand Up @@ -394,8 +394,8 @@ function isUnicodeIdentifierPart(code: number, languageVersion: ScriptTarget | u
lookupInUnicodeMap(code, unicodeES5IdentifierPart);
}

function makeReverseMap(source: Map<string, number>): string[] {
const result: string[] = [];
function makeReverseMap<T>(source: Map<T, number>): T[] {
const result: T[] = [];
source.forEach((value, name) => {
result[value] = name;
});
Expand All @@ -416,16 +416,16 @@ export function stringToToken(s: string): SyntaxKind | undefined {
return textToToken.get(s);
}

const regExpFlagChars = makeReverseMap(charToRegExpFlag);
const regExpFlagCharCodes = makeReverseMap(charCodeToRegExpFlag);

/** @internal */
export function regularExpressionFlagToCharacter(f: RegularExpressionFlags): string | undefined {
return regExpFlagChars[f];
export function regularExpressionFlagToCharacterCode(f: RegularExpressionFlags): CharacterCodes | undefined {
return regExpFlagCharCodes[f];
}

/** @internal */
export function characterToRegularExpressionFlag(c: string): RegularExpressionFlags | undefined {
return charToRegExpFlag.get(c);
export function characterCodeToRegularExpressionFlag(ch: CharacterCodes): RegularExpressionFlags | undefined {
return charCodeToRegExpFlag.get(ch);
}

/** @internal */
Expand Down Expand Up @@ -2558,27 +2558,28 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
pos++;
let regExpFlags = RegularExpressionFlags.None;
while (true) {
const ch = charCodeChecked(pos);
const ch = codePointChecked(pos);
if (ch === CharacterCodes.EOF || !isIdentifierPart(ch, languageVersion)) {
break;
}
const size = charSize(ch);
if (reportErrors) {
const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
const flag = characterCodeToRegularExpressionFlag(ch);
if (flag === undefined) {
error(Diagnostics.Unknown_regular_expression_flag, pos, 1);
error(Diagnostics.Unknown_regular_expression_flag, pos, size);
}
else if (regExpFlags & flag) {
error(Diagnostics.Duplicate_regular_expression_flag, pos, 1);
error(Diagnostics.Duplicate_regular_expression_flag, pos, size);
}
else if (((regExpFlags | flag) & RegularExpressionFlags.AnyUnicodeMode) === RegularExpressionFlags.AnyUnicodeMode) {
error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, pos, 1);
error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, pos, size);
}
else {
regExpFlags |= flag;
checkRegularExpressionFlagAvailable(flag, pos);
checkRegularExpressionFlagAvailability(flag, size);
}
}
pos++;
pos += size;
}
if (reportErrors) {
scanRange(startOfRegExpBody, endOfRegExpBody - startOfRegExpBody, () => {
Expand Down Expand Up @@ -2843,25 +2844,26 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean

function scanPatternModifiers(currFlags: RegularExpressionFlags): RegularExpressionFlags {
while (true) {
const ch = charCodeChecked(pos);
const ch = codePointChecked(pos);
if (ch === CharacterCodes.EOF || !isIdentifierPart(ch, languageVersion)) {
break;
}
const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
const size = charSize(ch);
const flag = characterCodeToRegularExpressionFlag(ch);
if (flag === undefined) {
error(Diagnostics.Unknown_regular_expression_flag, pos, 1);
error(Diagnostics.Unknown_regular_expression_flag, pos, size);
}
else if (currFlags & flag) {
error(Diagnostics.Duplicate_regular_expression_flag, pos, 1);
error(Diagnostics.Duplicate_regular_expression_flag, pos, size);
}
else if (!(flag & RegularExpressionFlags.Modifiers)) {
error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, 1);
error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, size);
}
else {
currFlags |= flag;
checkRegularExpressionFlagAvailable(flag, pos);
checkRegularExpressionFlagAvailability(flag, size);
}
pos++;
pos += size;
}
return currFlags;
}
Expand Down Expand Up @@ -3583,10 +3585,10 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
});
}

function checkRegularExpressionFlagAvailable(flag: RegularExpressionFlags, pos: number) {
function checkRegularExpressionFlagAvailability(flag: RegularExpressionFlags, size: number) {
const availableFrom = regExpFlagToFirstAvailableLanguageVersion.get(flag) as ScriptTarget | undefined;
if (availableFrom && languageVersion < availableFrom) {
error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, 1, getNameOfScriptTarget(availableFrom));
error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, size, getNameOfScriptTarget(availableFrom));
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
regularExpressionWithNonBMPFlags.ts(7,23): error TS1499: Unknown regular expression flag.
regularExpressionWithNonBMPFlags.ts(7,25): error TS1499: Unknown regular expression flag.
regularExpressionWithNonBMPFlags.ts(7,28): error TS1499: Unknown regular expression flag.
regularExpressionWithNonBMPFlags.ts(7,41): error TS1499: Unknown regular expression flag.
regularExpressionWithNonBMPFlags.ts(7,43): error TS1499: Unknown regular expression flag.
regularExpressionWithNonBMPFlags.ts(7,45): error TS1499: Unknown regular expression flag.


==== regularExpressionWithNonBMPFlags.ts (6 errors) ====
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
//
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
~~
!!! error TS1499: Unknown regular expression flag.
~~
!!! error TS1499: Unknown regular expression flag.
~~
!!! error TS1499: Unknown regular expression flag.
~~
!!! error TS1499: Unknown regular expression flag.
~~
!!! error TS1499: Unknown regular expression flag.
~~
!!! error TS1499: Unknown regular expression flag.

20 changes: 20 additions & 0 deletions tests/baselines/reference/regularExpressionWithNonBMPFlags.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] ////

//// [regularExpressionWithNonBMPFlags.ts]
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
//
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;


//// [regularExpressionWithNonBMPFlags.js]
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
//
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
12 changes: 12 additions & 0 deletions tests/baselines/reference/regularExpressionWithNonBMPFlags.symbols
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
//// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] ////

=== regularExpressionWithNonBMPFlags.ts ===
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
//
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
>𝘳𝘦𝘨𝘦𝘹 : Symbol(𝘳𝘦𝘨𝘦𝘹, Decl(regularExpressionWithNonBMPFlags.ts, 6, 5))

15 changes: 15 additions & 0 deletions tests/baselines/reference/regularExpressionWithNonBMPFlags.types
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
//// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] ////

=== regularExpressionWithNonBMPFlags.ts ===
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
//
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
>𝘳𝘦𝘨𝘦𝘹 : RegExp
> : ^^^^^^
>/(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶 : RegExp
> : ^^^^^^

9 changes: 9 additions & 0 deletions tests/cases/compiler/regularExpressionWithNonBMPFlags.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// @target: esnext

// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
//
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
graphemecluster marked this conversation as resolved.
Show resolved Hide resolved