Make generated test cases more compact

With the previous approach of hardcoding the generated strings, each test file was ~13 MB in size. The total size of the `output` folder was 4.5 GB. Generating the full test suite took ~18 minutes on my laptop. The new approach outputs code point ranges, which are then used to generate the string at run-time. The resulting test files are much smaller in size — the entire `output` folder now takes up only 1.3 MB. Generating the full test suite takes only a second.
mathiasbynens · Apr 12, 2017 · b7221f8
1 parent d839855
commit b7221f8
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 42 deletions.
diff --git a/build.js b/build.js
@@ -1,78 +1,84 @@
-// The maximum number of code points in the string used for the `\p{…}` test.
-const MAX_MATCH_LENGTH = 0x10FFFF;
-// The maximum number of code points in the string used for the `\P{…}` test.
-const MAX_NON_MATCH_LENGTH = 0x10FFFF;
-// Higher values result in more accurate, but larger (and slower) tests.
-// Lower values result in less accurate, but smaller (and faster) tests.
-
 const fs = require('fs');
 const jsesc = require('jsesc');
 
+const UNICODE_VERSION = require(
+	'regenerate-unicode-properties/unicode-version.js'
+);
+
 const _template = require('lodash.template');
 const TEST_TEMPLATE = fs.readFileSync('templates/test.template', 'utf8');
 const template = _template(TEST_TEMPLATE);
 
 const escape = (value) => {
 	return jsesc(value, {
+		'compact': false, // NOTE(review): the only visible calls to escape() are on removed lines of this diff - confirm it is still used.
+		'numbers': 'hexadecimal', // presumably makes jsesc print numbers as hex - verify against the jsesc docs.
 		'quotes': 'double',
 		'wrap': true,
 	});
 };
 
 const regenerate = require('regenerate');
 const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);
-const findInverse = (set) => {
-	const codePoints = UNICODE_SET.clone()
-		.remove(set)
-		.toArray()
-		.slice(0, MAX_NON_MATCH_LENGTH);
-	const chunkSize = 0xFFFF;
-	let result = '';
-	for (let index = 0; index < codePoints.length; index += chunkSize) {
-		const chunk = codePoints.slice(index, index + chunkSize);
-		result += String.fromCodePoint.apply(null, chunk);
+
+const codePointToString = (codePoint) => { // Formats a code point as a 0x-prefixed, uppercase, six-digit hex literal (0x41 becomes 0x000041).
+	return '0x' + ('00000' + codePoint.toString(16).toUpperCase()).slice(-6); // Left-pad with zeros, then keep only the last six hex digits.
+};
+
+regenerate.prototype.toTestData = function() { // Serializes this set as source text for an object literal with loneCodePoints and ranges.
+	const data = this.data; // Read below as a flat list of (start, end) pairs where end is exclusive.
+	// Iterate over the data per `(start, end)` pair.
+	let index = 0;
+	const length = data.length;
+	const loneCodePoints = [];
+	const ranges = [];
+	while (index < length) {
+		let start = data[index];
+		let end = data[index + 1] - 1; // Note: the `- 1` makes `end` inclusive.
+		if (start == end) { // A pair covering exactly one code point is emitted as a lone value, not a range.
+			loneCodePoints.push(codePointToString(start));
+		} else {
+			ranges.push(`[${ codePointToString(start) }, ${ codePointToString(end) }]`); // Inclusive [start, end] pair of hex literals.
+		}
+		index += 2;
 	}
-	return result;
+	const loneCodePointsOutput = loneCodePoints.length ? // Non-empty lists get one entry per line; empty lists collapse to [].
+		`[\n\t\t${ loneCodePoints.join(',\n\t\t') }\n\t]` :
+		`[]`;
+	const rangesOutput = ranges.length ? // Same layout rule as for the lone code points.
+		`[\n\t\t${ ranges.join(',\n\t\t') }\n\t]` :
+		`[]`;
+	return `{\n\tloneCodePoints: ${ loneCodePointsOutput },\n\tranges: ${ rangesOutput }\n}`; // The result is a string of code, not an object.
 };
 
 const generateExpressions = require('./generate-expressions.js');
 
-const handleExpression = (property, value, symbols, set) => {
+const handleExpression = (property, value, symbols) => { // symbols must provide toTestData() and be accepted by remove(); both are used below.
 	const expressions = generateExpressions(property, value);
 	const mainExpression = expressions[0];
-	const outputFile = mainExpression.replace('=', '_-_');
 	console.log(`Handling \`\\p{${ mainExpression }}\`…`);
-	const nonMatchSymbols = mainExpression == 'Any' ? '' : findInverse(set);
-	symbols = symbols.join('');
+	const outputFile = mainExpression.replace('=', '_-_'); // Output file stem: the first = in the expression is replaced with _-_.
+	const matchSymbols = symbols.toTestData(); // Source text describing every code point in the set.
+	const nonMatchSymbols = mainExpression == 'Any' ? // For Any an empty string is used, which the template condition treats as false.
+			'' :
+			UNICODE_SET.clone().remove(symbols).toTestData(); // Complement: every code point in 0x0-0x10FFFF that is not in symbols.
 	const output = template({
 		'mainExpression': mainExpression,
 		'expressions': expressions,
-		'matchSymbols': escape(symbols.slice(0, MAX_MATCH_LENGTH)),
-		'nonMatchSymbols': escape(nonMatchSymbols),
-		'unicodeVersion': unicodeVersion,
+		'matchSymbols': matchSymbols, // Passed as already-formatted source text, no longer run through escape().
+		'nonMatchSymbols': nonMatchSymbols,
+		'unicodeVersion': UNICODE_VERSION, // Read from regenerate-unicode-properties/unicode-version.js instead of being derived from package.json.
 	}).replace(/\n{3,}/g, '\n\n').trim() + '\n';
 	fs.writeFileSync(`output/${ outputFile }.js`, output);
 };
 
-const package = require('./package.json');
-const dependencies = Object.keys(package.devDependencies);
-const unicodePackage = dependencies.find((name) =>/^unicode-\d/.test(name));
-const unicodeVersion = unicodePackage.replace(/^unicode-/g, '');
-
 const properties = require('regenerate-unicode-properties');
 for (const [property, values] of properties) {
 	for (const value of values) {
 		const expression = `${ property }=${ value }`;
-		const symbols = (() => {
-			try {
-				return require(`${ unicodePackage }/${ property }/${ value }/symbols.js`);
-			} catch (exception) {
-				return require(`unicode-tr51/${ value }.js`);
-			}
-		})();
 		const set = require(
 			`regenerate-unicode-properties/${ property }/${ value }.js`
 		);
-		handleExpression(property, value, symbols, set);
+		handleExpression(property, value, set); // The set is passed as the symbols argument. NOTE(review): expression above looks unused in the visible lines - confirm.
 	}
 }
diff --git a/package.json b/package.json
@@ -9,7 +9,7 @@
     "jsesc": "^2.5.0",
     "lodash.template": "^4.4.0",
     "regenerate": "^1.3.2",
-    "regenerate-unicode-properties": "^5.0.0",
+    "regenerate-unicode-properties": "^5.0.1",
     "unicode-9.0.0": "^0.7.2",
     "unicode-property-aliases": "^1.1.1",
     "unicode-property-value-aliases": "^1.2.2"

diff --git a/templates/test.template b/templates/test.template
@@ -12,14 +12,24 @@ esid: sec-static-semantics-unicodematchproperty-p
 features: [regexp-unicode-property-escapes]
 ---*/
 
-const matchSymbols = <%= matchSymbols %>;
+const buildString = ({ loneCodePoints, ranges }) => {
+	let result = String.fromCodePoint(...loneCodePoints);
+	for (const [start, end] of ranges) {
+		for (let codePoint = start; codePoint <= end; codePoint++) {
+			result += String.fromCodePoint(codePoint);
+		}
+	}
+	return result;
+};
+
+const matchSymbols = buildString(<%= matchSymbols %>);
 <% for (const expression of expressions) { %>assert(
 	/^\p{<%= expression %>}+$/u.test(matchSymbols),
 	"\\p{<%= expression %>} matches all proper symbols"
 );
 <% } %>
-<% if (nonMatchSymbols.length > 2) { %>
-const nonMatchSymbols = <%= nonMatchSymbols %>;
+<% if (nonMatchSymbols) { %>
+const nonMatchSymbols = buildString(<%= nonMatchSymbols %>);
 <%
 	for (const expression of expressions) {
 %>assert(