Skip to content
38 changes: 38 additions & 0 deletions scripts/unicode-identifier.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// see https://github.com/microsoft/TypeScript/blob/main/scripts/regenerate-unicode-identifier-parts.js

const MAX_UNICODE_CODEPOINT = 0x10FFFF;

// Patterns live at module level so the scan below, which visits more than a
// million code points, reuses the same two RegExp objects. Neither pattern
// carries the `g`/`y` flag, so `.test()` is stateless.
const ID_START_PATTERN = /[\p{ID_Start}\u{2118}\u{212E}\u{309B}\u{309C}]/u; // Other_ID_Start explicitly included for back compat - see http://www.unicode.org/reports/tr31/#Introduction
const ID_CONTINUE_PATTERN = /[\p{ID_Continue}\u{00B7}\u{0387}\u{19DA}\u{1369}\u{136A}\u{136B}\u{136C}\u{136D}\u{136E}\u{136F}\u{1370}\u{1371}]/u; // Likewise for Other_ID_Continue

/** Tests whether the string `c` matches the identifier-start pattern. */
const isStart = c => ID_START_PATTERN.test(c);
/** Tests whether the string `c` matches the identifier-continue pattern alone. */
const isContinue = c => ID_CONTINUE_PATTERN.test(c);
/** Tests whether the string `c` is an identifier part, i.e. a continue or a start character. */
const isPart = c => isContinue(c) || isStart(c);

// Both arrays are flat lists of inclusive [from, to] pairs: from0, to0, from1, to1, ...
const starts = [];
const parts = [];
// Whether the scan is currently inside an open range of the respective table.
let startsActive = false;
let partsActive = false;

// Skip 0-9 (48..57), A-Z (65..90), a-z (97..122) - checked otherwise
for (let cp = 123; cp <= MAX_UNICODE_CODEPOINT; cp++) {
  // Convert once and evaluate the start pattern once; every start character
  // is also a part, so the continue pattern is only consulted for non-starts.
  const ch = String.fromCodePoint(cp);
  const start = isStart(ch);
  const part = start || isContinue(ch);

  if (start !== startsActive) {
    // Opening a range records `cp` itself; closing one records the previous
    // code point, which was the last member of the range.
    starts.push(startsActive ? cp - 1 : cp);
    startsActive = start;
  }
  if (part !== partsActive) {
    parts.push(partsActive ? cp - 1 : cp);
    partsActive = part;
  }
}
// Close a range that is still open at the last code point.
if (startsActive) starts.push(MAX_UNICODE_CODEPOINT);
if (partsActive) parts.push(MAX_UNICODE_CODEPOINT);

/**
 * Renders a flat list of code points as the body of an array literal.
 *
 * The result starts with a block comment labelling the columns, places eight
 * values on each row (a row break plus one space precedes every eighth value),
 * pads every value to at least six characters, follows each value with a
 * comma and a space, and ends with a newline.
 *
 * @param {number[]} cps - Values to render, in output order.
 * @returns {string} The formatted text.
 */
function tablify(cps) {
  let table = "/*\n| from ... to | from ... to | from ... to | from ... to |*/";
  for (const [index, cp] of cps.entries()) {
    if (index % 8 === 0) table += "\n ";
    table += `${cp.toString().padEnd(6)}, `;
  }
  return `${table}\n`;
}

// Print both tables to stdout as `i32[]` constant declarations, each preceded
// by a doc comment naming the Unicode version reported by the running Node.js.
const unicodeVersion = process.versions.unicode;
const outputLines = [
  `/** Unicode ${unicodeVersion} ID_Start/Other_ID_Start ranges */`,
  // The trailing "\n" leaves a blank line between the two declarations.
  `const unicodeIdentifierStart: i32[] = [${tablify(starts)}];\n`,
  `/** Unicode ${unicodeVersion} ID_Continue/Other_ID_Continue + ID_Start/Other_ID_Start ranges*/`,
  `const unicodeIdentifierPart: i32[] = [${tablify(parts)}];`,
];
for (const line of outputLines) console.log(line);
25 changes: 12 additions & 13 deletions src/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ import {
isDecimal,
isOctal,
isHexBase,
isHighSurrogate,
isLowSurrogate
numCodeUnits
} from "./util";

/** Named token types. */
Expand Down Expand Up @@ -520,7 +519,7 @@ export class Tokenizer extends DiagnosticEmitter {
var pos = this.pos;
while (pos < end) {
this.tokenPos = pos;
let c = text.charCodeAt(pos);
let c = <i32>text.codePointAt(pos);
switch (c) {
case CharCode.CARRIAGERETURN: {
if (!(
Expand Down Expand Up @@ -913,11 +912,12 @@ export class Tokenizer extends DiagnosticEmitter {
return Token.AT;
}
default: {
// Unicode-aware from here on. Is a pair of two code units if `c > 0xffff`.
if (isIdentifierStart(c)) {
let posBefore = pos;
while (
++pos < end &&
isIdentifierPart(c = text.charCodeAt(pos))
(pos += numCodeUnits(c)) < end &&
isIdentifierPart(c = <i32>text.codePointAt(pos))
) { /* nop */ }
if (identifierHandling != IdentifierHandling.ALWAYS) {
let maybeKeywordToken = tokenFromKeyword(text.substring(posBefore, pos));
Expand All @@ -935,14 +935,11 @@ export class Tokenizer extends DiagnosticEmitter {
this.pos = posBefore;
return Token.IDENTIFIER;
} else if (isWhiteSpace(c)) {
++pos;
pos += numCodeUnits(c);
break;
}
let start = pos++;
if (
isHighSurrogate(c) && pos < end &&
isLowSurrogate(text.charCodeAt(pos))
) ++pos;
let start = pos;
pos += numCodeUnits(c);
this.error(
DiagnosticCode.Invalid_character,
this.range(start, pos)
Expand Down Expand Up @@ -1055,9 +1052,11 @@ export class Tokenizer extends DiagnosticEmitter {
var end = this.end;
var pos = this.pos;
var start = pos;
var c = <i32>text.codePointAt(pos);
assert(isIdentifierStart(c));
while (
++pos < end &&
isIdentifierPart(text.charCodeAt(pos))
(pos += numCodeUnits(c)) < end &&
isIdentifierPart(c = <i32>text.codePointAt(pos))
);
this.pos = pos;
return text.substring(start, pos);
Expand Down
Loading