Skip to content

Commit

Permalink
RFC: Support full Unicode in lexer
Browse files Browse the repository at this point in the history
Depends on #3115

Implements the RFC at graphql/graphql-spec#849.

* Replaces `isSourceCharacter` with `isUnicodeScalarValue`
* Adds `isSupplementaryCodePoint`, used when lexing Strings, BlockStrings, and Comments so that supplementary code points (surrogate pairs) in JavaScript's UTF-16 source strings are lexed correctly.
* Updates `printCodePointAt` to correctly print supplementary code points.
* Adds variable-width Unicode escape sequences
* Adds explicit support for legacy JSON-style fixed-width Unicode escape sequence surrogate pairs.
* Adds `printString` so that printing strings no longer relies on `JSON.stringify`. Borrows some implementation details from Node.js internals for string printing.

  Implements:

  > When producing a {StringValue}, implementations should use escape sequences to
  > represent non-printable control characters (U+0000 to U+001F and U+007F to
  > U+009F). Other escape sequences are not necessary, however an implementation may
  > use escape sequences to represent any other range of code points.

Closes #2449

Co-authored-by: Andreas Marek <[email protected]>
  • Loading branch information
leebyron and andimarek committed Jun 3, 2021
1 parent fd3d8c9 commit 9cdbb19
Show file tree
Hide file tree
Showing 6 changed files with 530 additions and 50 deletions.
259 changes: 226 additions & 33 deletions src/language/__tests__/lexer-test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,6 @@ function expectSyntaxError(text: string) {
}

describe('Lexer', () => {
it('disallows uncommon control characters', () => {
expectSyntaxError('\u0007').to.deep.equal({
message: 'Syntax Error: Invalid character: U+0007.',
locations: [{ line: 1, column: 1 }],
});
});

it('ignores BOM header', () => {
expect(lexOne('\uFEFF foo')).to.contain({
kind: TokenKind.NAME,
Expand Down Expand Up @@ -264,11 +257,97 @@ describe('Lexer', () => {
value: 'slashes \\ /',
});

expect(lexOne('"unicode \\u1234\\u5678\\u90AB\\uCDEF"')).to.contain({
expect(lexOne('"unescaped unicode outside BMP \u{1f600}"')).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 34,
value: 'unescaped unicode outside BMP \u{1f600}',
});

expect(
lexOne('"unescaped maximal unicode outside BMP \u{10ffff}"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 42,
value: 'unescaped maximal unicode outside BMP \u{10ffff}',
});

expect(lexOne('"unicode \\u1234\\u5678\\u90AB"')).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 34,
value: 'unicode \u1234\u5678\u90AB\uCDEF',
value: 'unicode \u1234\u5678\u90AB',
});

expect(lexOne('"unicode \\u{1234}\\u{5678}\\u{90AB}"')).to.contain(
{
kind: TokenKind.STRING,
start: 0,
end: 42,
value: 'unicode \u1234\u5678\u90AB',
},
);

expect(
lexOne('"string with unicode escape outside BMP \\u{1F600}"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 50,
value: 'string with unicode escape outside BMP \u{1f600}',
});

expect(lexOne('"string with minimal unicode escape \\u{0}"')).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 42,
value: 'string with minimal unicode escape \u{0}',
});

expect(
lexOne('"string with maximal unicode escape \\u{10FFFF}"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 47,
value: 'string with maximal unicode escape \u{10FFFF}',
});

expect(
lexOne('"string with maximal minimal unicode escape \\u{00000000}"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 57,
value: 'string with maximal minimal unicode escape \u{0}',
});

expect(
lexOne('"string with unicode surrogate pair escape \\uD83D\\uDE00"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 56,
value: 'string with unicode surrogate pair escape \u{1f600}',
});

expect(
lexOne('"string with minimal surrogate pair escape \\uD800\\uDC00"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 56,
value: 'string with minimal surrogate pair escape \u{10000}',
});

expect(
lexOne('"string with maximal surrogate pair escape \\uDBFF\\uDFFF"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 56,
value: 'string with maximal surrogate pair escape \u{10FFFF}',
});
});

Expand Down Expand Up @@ -299,16 +378,19 @@ describe('Lexer', () => {
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('"contains unescaped \u0007 control char"').to.deep.equal(
{
message: 'Syntax Error: Invalid character within String: U+0007.',
locations: [{ line: 1, column: 21 }],
},
);
expectSyntaxError('"bad surrogate \uDEAD"').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+DEAD.',
locations: [{ line: 1, column: 16 }],
});

expectSyntaxError('"bad high surrogate pair \uDEAD\uDEAD"').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+DEAD.',
locations: [{ line: 1, column: 26 }],
});

expectSyntaxError('"null-byte is not \u0000 end of file"').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+0000.',
locations: [{ line: 1, column: 19 }],
expectSyntaxError('"bad low surrogate pair \uD800\uD800"').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+D800.',
locations: [{ line: 1, column: 25 }],
});

expectSyntaxError('"multi\nline"').to.deep.equal({
Expand Down Expand Up @@ -355,6 +437,93 @@ describe('Lexer', () => {
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uXXXF".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"bad \\u{} esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{}".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"bad \\u{FXXX} esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FX".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"bad \\u{FFFF esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF ".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"bad \\u{FFFF"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF"".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"too high \\u{110000} esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{110000}".',
locations: [{ line: 1, column: 11 }],
});

expectSyntaxError('"way too high \\u{12345678} esc"').to.deep.equal({
message:
'Syntax Error: Invalid Unicode escape sequence: "\\u{12345678}".',
locations: [{ line: 1, column: 15 }],
});

expectSyntaxError('"too long \\u{000000000} esc"').to.deep.equal({
message:
'Syntax Error: Invalid Unicode escape sequence: "\\u{000000000".',
locations: [{ line: 1, column: 11 }],
});

expectSyntaxError('"bad surrogate \\uDEAD esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
locations: [{ line: 1, column: 16 }],
});

expectSyntaxError('"bad surrogate \\u{DEAD} esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{DEAD}".',
locations: [{ line: 1, column: 16 }],
});

expectSyntaxError(
'"cannot use braces for surrogate pair \\u{D83D}\\u{DE00} esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{D83D}".',
locations: [{ line: 1, column: 39 }],
});

expectSyntaxError(
'"bad high surrogate pair \\uDEAD\\uDEAD esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
locations: [{ line: 1, column: 26 }],
});

expectSyntaxError(
'"bad low surrogate pair \\uD800\\uD800 esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD800".',
locations: [{ line: 1, column: 25 }],
});

expectSyntaxError(
'"cannot escape half a pair \uD83D\\uDE00 esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+D83D.',
locations: [{ line: 1, column: 28 }],
});

expectSyntaxError(
'"cannot escape half a pair \\uD83D\uDE00 esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
locations: [{ line: 1, column: 28 }],
});

expectSyntaxError('"bad \\uD83D\\not an escape"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
locations: [{ line: 1, column: 6 }],
});
});

it('lexes block strings', () => {
Expand Down Expand Up @@ -414,6 +583,13 @@ describe('Lexer', () => {
value: 'unescaped \\n\\r\\b\\t\\f\\u1234',
});

expect(lexOne('"""unescaped unicode outside BMP \u{1f600}"""')).to.contain({
kind: TokenKind.BLOCK_STRING,
start: 0,
end: 38,
value: 'unescaped unicode outside BMP \u{1f600}',
});

expect(lexOne('"""slashes \\\\ \\/"""')).to.contain({
kind: TokenKind.BLOCK_STRING,
start: 0,
Expand Down Expand Up @@ -486,18 +662,9 @@ describe('Lexer', () => {
locations: [{ line: 1, column: 16 }],
});

expectSyntaxError(
'"""contains unescaped \u0007 control char"""',
).to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+0007.',
locations: [{ line: 1, column: 23 }],
});

expectSyntaxError(
'"""null-byte is not \u0000 end of file"""',
).to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+0000.',
locations: [{ line: 1, column: 21 }],
expectSyntaxError('"""contains invalid surrogate \uDEAD"""').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+DEAD.',
locations: [{ line: 1, column: 31 }],
});
});

Expand Down Expand Up @@ -837,6 +1004,16 @@ describe('Lexer', () => {
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\x00').to.deep.equal({
message: 'Syntax Error: Unexpected character: U+0000.',
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\b').to.deep.equal({
message: 'Syntax Error: Unexpected character: U+0008.',
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\u00AA').to.deep.equal({
message: 'Syntax Error: Unexpected character: U+00AA.',
locations: [{ line: 1, column: 1 }],
Expand All @@ -851,6 +1028,16 @@ describe('Lexer', () => {
message: 'Syntax Error: Unexpected character: U+203B.',
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\u{1f600}').to.deep.equal({
message: 'Syntax Error: Unexpected character: U+1F600.',
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\uDEAD').to.deep.equal({
message: 'Syntax Error: Invalid character: U+DEAD.',
locations: [{ line: 1, column: 1 }],
});
});

it('lex reports useful information for dashes in names', () => {
Expand Down Expand Up @@ -931,9 +1118,15 @@ describe('Lexer', () => {
end: 9,
value: ' Comment',
});
expectSyntaxError('# \u0007').to.deep.equal({
message: 'Syntax Error: Invalid character: U+0007.',
locations: [{ line: 1, column: 3 }],
expect(lexOne('# Comment \u{1f600}').prev).to.contain({
kind: TokenKind.COMMENT,
start: 0,
end: 12,
value: ' Comment \u{1f600}',
});
expectSyntaxError('# Invalid surrogate \uDEAD').to.deep.equal({
message: 'Syntax Error: Invalid character: U+DEAD.',
locations: [{ line: 1, column: 21 }],
});
});
});
Expand Down
Loading

0 comments on commit 9cdbb19

Please sign in to comment.