AssemblyScript · dcodeIO · Sep 26, 2022 · Sep 25, 2022 · Sep 25, 2022 · Sep 26, 2022
diff --git a/scripts/unicode-identifier.js b/scripts/unicode-identifier.js
@@ -0,0 +1,38 @@
+// see https://github.com/microsoft/TypeScript/blob/main/scripts/regenerate-unicode-identifier-parts.js
+
+const MAX_UNICODE_CODEPOINT = 0x10FFFF; // inclusive upper bound of the code point scan below
+const isStart = c => /[\p{ID_Start}\u{2118}\u{212E}\u{309B}\u{309C}]/u.test(c); // Other_ID_Start explicitly included for back compat - see http://www.unicode.org/reports/tr31/#Introduction
+const isPart = c => /[\p{ID_Continue}\u{00B7}\u{0387}\u{19DA}\u{1369}\u{136A}\u{136B}\u{136C}\u{136D}\u{136E}\u{136F}\u{1370}\u{1371}]/u.test(c) || isStart(c); // Likewise for Other_ID_Continue
+const parts = []; // flat list of inclusive range bounds for isPart: [from, to, from, to, ...]
+let partsActive = false; // true while the scan is inside a run of isPart code points
+let startsActive = false; // true while the scan is inside a run of isStart code points
+const starts = []; // flat list of inclusive range bounds for isStart: [from, to, from, to, ...]
+
+// The scan begins at 123, so every code point up to 'z' (122) is skipped, incl. 0-9 (48..57), A-Z (65..90), a-z (97..122) - checked otherwise
+for (let cp = 123; cp <= MAX_UNICODE_CODEPOINT; cp++) {
+  if (isStart(String.fromCodePoint(cp)) !== startsActive) { // membership flipped at cp: a range opens or closes here
+    starts.push(cp - +startsActive); // opening pushes cp; closing pushes cp - 1, the last code point inside the range
+    startsActive = !startsActive;
+  }
+  if (isPart(String.fromCodePoint(cp)) !== partsActive) { // same boundary detection for the isPart ranges
+    parts.push(cp - +partsActive); // opening pushes cp; closing pushes cp - 1
+    partsActive = !partsActive;
+  }
+}
+if (startsActive) starts.push(MAX_UNICODE_CODEPOINT); // close a range that is still open when the scan ends
+if (partsActive) parts.push(MAX_UNICODE_CODEPOINT); // likewise for the isPart ranges
+
+function tablify(cps) { // formats the flat list of range bounds `cps` as text: a comment header, then rows of 8 values (4 from/to pairs)
+  let sb = ["/*\n| from  ...  to | from  ...  to | from  ...  to | from  ...  to |*/"]; // pieces of the output, joined at the end; starts with the column header comment
+  let i = 0;
+  while (i < cps.length) {
+    if (!(i % 8)) sb.push("\n  "); // begin a new indented row before every 8th value (including the first)
+    sb.push(`${cps[i++].toString().padEnd(6)}, `); // pads to at least 6 characters; longer values are not truncated, so they widen their column
+  }
+  return sb.join("") + "\n"; // every value, including the last, is followed by ", "; the result ends with a newline
+}
+
+console.log(`/** Unicode ${process.versions.unicode} ID_Start/Other_ID_Start ranges */`); // doc comment for the first table, stamped with the Unicode version reported by the running Node.js
+console.log(`const unicodeIdentifierStart: i32[] = [${tablify(starts)}];\n`); // prints the declaration text for the `starts` table, followed by a blank line
+console.log(`/** Unicode ${process.versions.unicode} ID_Continue/Other_ID_Continue + ID_Start/Other_ID_Start ranges*/`); // doc comment for the second table
+console.log(`const unicodeIdentifierPart: i32[] = [${tablify(parts)}];`); // prints the declaration text for the `parts` table
diff --git a/src/tokenizer.ts b/src/tokenizer.ts
@@ -32,8 +32,7 @@ import {
   isDecimal,
   isOctal,
   isHexBase,
-  isHighSurrogate,
-  isLowSurrogate
+  numCodeUnits
 } from "./util";
 
 /** Named token types. */
@@ -520,7 +519,7 @@ export class Tokenizer extends DiagnosticEmitter {
     var pos = this.pos;
     while (pos < end) {
       this.tokenPos = pos;
-      let c = text.charCodeAt(pos);
+      let c = <i32>text.codePointAt(pos);
       switch (c) {
         case CharCode.CARRIAGERETURN: {
           if (!(
@@ -913,11 +912,12 @@ export class Tokenizer extends DiagnosticEmitter {
           return Token.AT;
         }
         default: {
+          // Unicode-aware from here on: `c` is a full code point, i.e. a pair of two code units if `c > 0xffff`.
           if (isIdentifierStart(c)) {
             let posBefore = pos;
             while (
-              ++pos < end &&
-              isIdentifierPart(c = text.charCodeAt(pos))
+              (pos += numCodeUnits(c)) < end &&
+              isIdentifierPart(c = <i32>text.codePointAt(pos))
             ) { /* nop */ }
             if (identifierHandling != IdentifierHandling.ALWAYS) {
               let maybeKeywordToken = tokenFromKeyword(text.substring(posBefore, pos));
@@ -935,14 +935,11 @@ export class Tokenizer extends DiagnosticEmitter {
             this.pos = posBefore;
             return Token.IDENTIFIER;
           } else if (isWhiteSpace(c)) {
-            ++pos;
+            pos += numCodeUnits(c);
             break;
           }
-          let start = pos++;
-          if (
-            isHighSurrogate(c) && pos < end &&
-            isLowSurrogate(text.charCodeAt(pos))
-          ) ++pos;
+          let start = pos;
+          pos += numCodeUnits(c);
           this.error(
             DiagnosticCode.Invalid_character,
             this.range(start, pos)
@@ -1055,9 +1052,11 @@ export class Tokenizer extends DiagnosticEmitter {
     var end = this.end;
     var pos = this.pos;
     var start = pos;
+    var c = <i32>text.codePointAt(pos);
+    assert(isIdentifierStart(c));
     while (
-      ++pos < end &&
-      isIdentifierPart(text.charCodeAt(pos))
+      (pos += numCodeUnits(c)) < end &&
+      isIdentifierPart(c = <i32>text.codePointAt(pos))
     );
     this.pos = pos;
     return text.substring(start, pos);