From e943275a3b5eb876b7759f79c7dce26b39c48e66 Mon Sep 17 00:00:00 2001
From: Yosuke Ota
Date: Sun, 27 Oct 2024 20:19:25 +0900
Subject: [PATCH] Add missing unicode properties

FIX: Support some missing Unicode properties.
---
 .eslintrc.js                                  |  2 +-
 .../generated/scriptValuesAddedInUnicode.js   |  2 +
 acorn/src/unicode-property-data.js            |  3 +-
 bin/generate-unicode-script-values.js         | 79 +++++++++++++++++++
 bin/test262.whitelist                         | 36 ++-------
 package.json                                  |  4 +-
 6 files changed, 91 insertions(+), 35 deletions(-)
 create mode 100644 acorn/src/generated/scriptValuesAddedInUnicode.js
 create mode 100644 bin/generate-unicode-script-values.js

diff --git a/.eslintrc.js b/.eslintrc.js
index e9ab078c5..1dcbf606e 100644
--- a/.eslintrc.js
+++ b/.eslintrc.js
@@ -13,7 +13,7 @@ module.exports = {
   },
   overrides: [
     {
-      files: ["acorn/src/bin/*.js", "bin/generate-identifier-regex.js"],
+      files: ["acorn/src/bin/*.js", "bin/generate-identifier-regex.js", "bin/generate-unicode-script-values.js"],
       rules: {
         "no-console": "off"
       }
diff --git a/acorn/src/generated/scriptValuesAddedInUnicode.js b/acorn/src/generated/scriptValuesAddedInUnicode.js
new file mode 100644
index 000000000..168967e98
--- /dev/null
+++ b/acorn/src/generated/scriptValuesAddedInUnicode.js
@@ -0,0 +1,2 @@
+// This file was generated by "bin/generate-unicode-script-values.js". Do not modify manually!
+export default "Gara Garay Gukh Gurung_Khema Hrkt Katakana_Or_Hiragana Kawi Kirat_Rai Krai Nag_Mundari Nagm Ol_Onal Onao Sunu Sunuwar Todhri Todr Tulu_Tigalari Tutg Unknown Zzzz"
diff --git a/acorn/src/unicode-property-data.js b/acorn/src/unicode-property-data.js
index 7fdbf43e1..99cd28723 100644
--- a/acorn/src/unicode-property-data.js
+++ b/acorn/src/unicode-property-data.js
@@ -1,4 +1,5 @@
 import {wordsRegexp} from "./util.js"
+import scriptValuesAddedInUnicode from "./generated/scriptValuesAddedInUnicode.js"
 
 // This file contains Unicode properties extracted from the ECMAScript specification.
 // The lists are extracted like so:
@@ -42,7 +43,7 @@ const ecma10ScriptValues = ecma9ScriptValues + " Dogra Dogr Gunjala_Gondi Gong H
 const ecma11ScriptValues = ecma10ScriptValues + " Elymaic Elym Nandinagari Nand Nyiakeng_Puachue_Hmong Hmnp Wancho Wcho"
 const ecma12ScriptValues = ecma11ScriptValues + " Chorasmian Chrs Diak Dives_Akuru Khitan_Small_Script Kits Yezi Yezidi"
 const ecma13ScriptValues = ecma12ScriptValues + " Cypro_Minoan Cpmn Old_Uyghur Ougr Tangsa Tnsa Toto Vithkuqi Vith"
-const ecma14ScriptValues = ecma13ScriptValues + " Hrkt Katakana_Or_Hiragana Kawi Nag_Mundari Nagm Unknown Zzzz"
+const ecma14ScriptValues = ecma13ScriptValues + " " + scriptValuesAddedInUnicode
 
 const unicodeScriptValues = {
   9: ecma9ScriptValues,
diff --git a/bin/generate-unicode-script-values.js b/bin/generate-unicode-script-values.js
new file mode 100644
index 000000000..e49f8585b
--- /dev/null
+++ b/bin/generate-unicode-script-values.js
@@ -0,0 +1,79 @@
+"use strict"
+
+const fs = require("fs")
+const path = require("path")
+
+const UNICODE_PROPERTY_VALUE_ALIASES_URL = "https://unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt"
+
+// As of ES2023 the valid Unicode script values follow the Unicode spec,
+// but up to ES2022 they were listed in the ECMAScript spec itself.
+// The generated file lists all the Unicode script values except those already listed up to ES2022.
+import("../acorn/src/unicode-property-data.js")
+  .then(m => m.default[13].nonBinary.Script)
+  .then(async(reScriptValuesAddedInES) => {
+    const scriptValues = new Set()
+    for await (const value of getLatestUnicodeScriptValues()) {
+      scriptValues.add(value)
+    }
+    if (scriptValues.size === 0) {
+      // Never overwrite the committed file with an empty list.
+      throw new Error("No script values were found in PropertyValueAliases.txt")
+    }
+    const scriptValuesAddedInUnicode = "export default " +
+      JSON.stringify(
+        [...scriptValues]
+          .filter(value => !reScriptValuesAddedInES.test(value))
+          .sort()
+          .join(" ")
+      )
+
+    writeGeneratedFile("scriptValuesAddedInUnicode", scriptValuesAddedInUnicode)
+
+    console.log("Done. The generated files must be committed.")
+  })
+  .catch(error => {
+    // Report the failure and make the npm script exit with a non-zero status.
+    console.error(error)
+    process.exitCode = 1
+  })
+
+/**
+ * Writes a generated module into "acorn/src/generated", prefixed with a do-not-edit banner.
+ * @param {string} filename Name of the generated file, without the ".js" extension.
+ * @param {string} content Source text of the generated module.
+ */
+function writeGeneratedFile(filename, content) {
+  const comment = "// This file was generated by \"bin/" + path.basename(__filename) + "\". Do not modify manually!"
+  // Resolve against this script's directory so that the output path does not depend on the current working directory.
+  const outputPath = path.resolve(__dirname, "../acorn/src/generated", filename + ".js")
+  fs.writeFileSync(outputPath, comment + "\n" + content + "\n", "utf8")
+}
+
+/**
+ * Yields every value alias of the Script ("sc") property listed in the latest PropertyValueAliases.txt.
+ * @throws {Error} If the file cannot be downloaded.
+ */
+async function * getLatestUnicodeScriptValues() {
+  const response = await fetch(UNICODE_PROPERTY_VALUE_ALIASES_URL)
+  if (!response.ok) {
+    // Fail loudly instead of generating the file from an HTTP error page.
+    throw new Error("Failed to fetch " + UNICODE_PROPERTY_VALUE_ALIASES_URL + ": " + response.status + " " + response.statusText)
+  }
+  const lines = (await response.text()).split(/\r?\n/)
+  for (const line of lines) {
+    if (!line || line.startsWith("#")) {
+      continue
+    }
+    const [propertyAlias, ...valueAliases] = line
+      .split("#")[0] // strip comments
+      .split(";") // split by semicolon
+      .map((x) => x.trim()) // trim
+
+    if (propertyAlias !== "sc") {
+      continue
+    }
+
+    // Skip empty fields so that a malformed line cannot contribute an empty value.
+    yield * valueAliases.filter(Boolean)
+  }
+}
diff --git a/bin/test262.whitelist b/bin/test262.whitelist
index a5e3a296e..50dfe4f25 100644
--- a/bin/test262.whitelist
+++ b/bin/test262.whitelist
@@ -1,32 +1,4 @@
-built-ins/RegExp/property-escapes/generated/Script_-_Garay.js (default)
-built-ins/RegExp/property-escapes/generated/Script_-_Garay.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_-_Gurung_Khema.js (default)
-built-ins/RegExp/property-escapes/generated/Script_-_Gurung_Khema.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_-_Kirat_Rai.js (default)
-built-ins/RegExp/property-escapes/generated/Script_-_Kirat_Rai.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_-_Ol_Onal.js (default)
-built-ins/RegExp/property-escapes/generated/Script_-_Ol_Onal.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_-_Sunuwar.js (default)
-built-ins/RegExp/property-escapes/generated/Script_-_Sunuwar.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_-_Todhri.js (default)
-built-ins/RegExp/property-escapes/generated/Script_-_Todhri.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_-_Tulu_Tigalari.js (default)
-built-ins/RegExp/property-escapes/generated/Script_-_Tulu_Tigalari.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Garay.js (default)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Garay.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Gurung_Khema.js (default)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Gurung_Khema.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Kirat_Rai.js (default)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Kirat_Rai.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Ol_Onal.js (default)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Ol_Onal.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Sunuwar.js (default)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Sunuwar.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Todhri.js (default)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Todhri.js (strict mode)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Tulu_Tigalari.js (default)
-built-ins/RegExp/property-escapes/generated/Script_Extensions_-_Tulu_Tigalari.js (strict mode)
-language/import/import-attributes/json-invalid.js (default)
-language/import/import-attributes/json-invalid.js (strict mode)
-language/import/import-attributes/json-named-bindings.js (default)
-language/import/import-attributes/json-named-bindings.js (strict mode)
+language/import/import-attributes/json-invalid.js (default)
+language/import/import-attributes/json-invalid.js (strict mode)
+language/import/import-attributes/json-named-bindings.js (default)
+language/import/import-attributes/json-named-bindings.js (strict mode)
diff --git a/package.json b/package.json
index a3372a5a4..7be2a52a8 100644
--- a/package.json
+++ b/package.json
@@ -25,7 +25,9 @@
     "build:loose": "rollup -c acorn-loose/rollup.config.mjs",
     "build:main": "rollup -c acorn/rollup.config.mjs",
     "build:walk": "rollup -c acorn-walk/rollup.config.mjs",
-    "generate": "node bin/generate-identifier-regex.js",
+    "generate": "npm run generate:identifier-regex && npm run generate:unicode-script-values",
+    "generate:identifier-regex": "node bin/generate-identifier-regex.js",
+    "generate:unicode-script-values": "node bin/generate-unicode-script-values.js",
     "lint": "eslint .",
     "prepare": "npm run test",
     "pretest": "npm run build:main && npm run build:loose",