Skip to content

Commit

Permalink
Make generated test cases more compact
Browse files Browse the repository at this point in the history
With the previous approach of hardcoding the generated strings, each test file was ~13 MB in size. The total size of the `output` folder was 4.5 GB. Generating the full test suite took ~18 minutes on my laptop.

The new approach outputs code point ranges, which are then used to generate the string at run-time. The resulting test files are much smaller in size — the entire `output` folder now takes up only 1.3 MB. Generating the full test suite takes only a second.
  • Loading branch information
mathiasbynens committed Apr 12, 2017
1 parent d839855 commit b7221f8
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 42 deletions.
82 changes: 44 additions & 38 deletions build.js
Original file line number Diff line number Diff line change
@@ -1,78 +1,84 @@
// The maximum number of code points in the string used for the `\p{…}` test.
const MAX_MATCH_LENGTH = 0x10FFFF;
// The maximum number of code points in the string used for the `\P{…}` test.
const MAX_NON_MATCH_LENGTH = 0x10FFFF;
// Both limits above trade accuracy for size and speed:
// higher values result in more accurate, but larger (and slower) tests;
// lower values result in less accurate, but smaller (and faster) tests.

const fs = require('fs');
const jsesc = require('jsesc');

const UNICODE_VERSION = require(
'regenerate-unicode-properties/unicode-version.js'
);

const _template = require('lodash.template');
const TEST_TEMPLATE = fs.readFileSync('templates/test.template', 'utf8');
const template = _template(TEST_TEMPLATE);

// Escapes `value` through jsesc with `wrap` enabled, double quotes,
// hexadecimal number formatting, and non-compact output.
const escape = (value) => jsesc(value, {
	'wrap': true,
	'quotes': 'double',
	'numbers': 'hexadecimal',
	'compact': false,
});

const regenerate = require('regenerate');
// The set of every code point from U+0000 through U+10FFFF. It is cloned and
// reduced with `.remove(…)` to compute the complement of another set.
const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);
const findInverse = (set) => {
const codePoints = UNICODE_SET.clone()
.remove(set)
.toArray()
.slice(0, MAX_NON_MATCH_LENGTH);
const chunkSize = 0xFFFF;
let result = '';
for (let index = 0; index < codePoints.length; index += chunkSize) {
const chunk = codePoints.slice(index, index + chunkSize);
result += String.fromCodePoint.apply(null, chunk);

// Formats a code point as `0x` followed by its uppercase hexadecimal digits,
// left-padded with zeros to six characters (e.g. 0x41 becomes `0x000041`).
const codePointToString = (codePoint) => {
	const hex = codePoint.toString(16).toUpperCase();
	const padded = `00000${ hex }`;
	return `0x${ padded.slice(-6) }`;
};

/**
 * Serializes this set as the source text of an object literal of the form
 * `{ loneCodePoints: […], ranges: […] }`.
 *
 * Single code points are listed under `loneCodePoints`; every longer run is
 * listed under `ranges` as an inclusive `[start, end]` pair. All values are
 * formatted by `codePointToString`.
 *
 * @returns {string} The generated source text (tab-indented, one entry per line).
 */
regenerate.prototype.toTestData = function() {
	// `this.data` is read as a flat list of `(start, end)` pairs.
	const data = this.data;
	const loneCodePoints = [];
	const ranges = [];
	for (let index = 0; index < data.length; index += 2) {
		const start = data[index];
		const end = data[index + 1] - 1; // Note: the `- 1` makes `end` inclusive.
		if (start === end) {
			loneCodePoints.push(codePointToString(start));
		} else {
			ranges.push(`[${ codePointToString(start) }, ${ codePointToString(end) }]`);
		}
	}
	// Both lists share one layout: `[]` when empty, otherwise one entry per line.
	const formatList = (items) => {
		return items.length ?
			`[\n\t\t${ items.join(',\n\t\t') }\n\t]` :
			`[]`;
	};
	return `{\n\tloneCodePoints: ${ formatList(loneCodePoints) },\n\tranges: ${ formatList(ranges) }\n}`;
};

const generateExpressions = require('./generate-expressions.js');

/**
 * Generates the test file for a single property/value pair and writes it to
 * `output/<expression>.js`, where `=` in the expression is replaced by `_-_`.
 *
 * Two call shapes are accepted:
 *   - `handleExpression(property, value, set)`: the third argument is the
 *     regenerate set of matching code points.
 *   - `handleExpression(property, value, symbols, set)`: the set is the
 *     fourth argument and the third argument is ignored.
 *
 * @param property Passed through to `generateExpressions`.
 * @param value Passed through to `generateExpressions`.
 * @param symbols The regenerate set of matching code points (three-argument form).
 * @param [set=symbols] The regenerate set of matching code points (four-argument form).
 */
const handleExpression = (property, value, symbols, set = symbols) => {
	const expressions = generateExpressions(property, value);
	const mainExpression = expressions[0];
	console.log(`Handling \`\\p{${ mainExpression }}\`…`);
	const outputFile = mainExpression.replace('=', '_-_');
	const matchSymbols = set.toTestData();
	// For `Any` no complement is computed; the empty string is handed to the
	// template instead (presumably because nothing falls outside `Any`).
	const nonMatchSymbols = mainExpression === 'Any' ?
		'' :
		UNICODE_SET.clone().remove(set).toTestData();
	const output = template({
		'mainExpression': mainExpression,
		'expressions': expressions,
		'matchSymbols': matchSymbols,
		'nonMatchSymbols': nonMatchSymbols,
		'unicodeVersion': UNICODE_VERSION,
	}).replace(/\n{3,}/g, '\n\n').trim() + '\n'; // Collapse runs of blank lines.
	fs.writeFileSync(`output/${ outputFile }.js`, output);
};

// Derive the Unicode version from the name of the first `unicode-<digit…>`
// package listed under `devDependencies` in `package.json`.
const package = require('./package.json');
const dependencies = Object.keys(package.devDependencies);
const unicodePackage = dependencies.find((name) =>/^unicode-\d/.test(name));
// Strip the `unicode-` prefix so only the version part of the name remains.
const unicodeVersion = unicodePackage.replace(/^unicode-/g, '');

// Generate one test file per supported `property=value` combination.
const properties = require('regenerate-unicode-properties');
for (const [property, values] of properties) {
	for (const value of values) {
		// The set of code points for this property/value pair.
		const set = require(
			`regenerate-unicode-properties/${ property }/${ value }.js`
		);
		// Call the handler exactly once per value, using the three-argument form.
		handleExpression(property, value, set);
	}
}
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"jsesc": "^2.5.0",
"lodash.template": "^4.4.0",
"regenerate": "^1.3.2",
"regenerate-unicode-properties": "^5.0.0",
"regenerate-unicode-properties": "^5.0.1",
"unicode-9.0.0": "^0.7.2",
"unicode-property-aliases": "^1.1.1",
"unicode-property-value-aliases": "^1.2.2"
Expand Down
16 changes: 13 additions & 3 deletions templates/test.template
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,24 @@ esid: sec-static-semantics-unicodematchproperty-p
features: [regexp-unicode-property-escapes]
---*/

const matchSymbols = <%= matchSymbols %>;
// Builds a string containing every lone code point, followed by every code
// point of each inclusive [start, end] range, in the order given.
const buildString = ({ loneCodePoints, ranges }) => {
	let output = String.fromCodePoint(...loneCodePoints);
	for (const [first, last] of ranges) {
		let current = first;
		while (current <= last) {
			output += String.fromCodePoint(current);
			current += 1;
		}
	}
	return output;
};

const matchSymbols = buildString(<%= matchSymbols %>);
<% for (const expression of expressions) { %>assert(
/^\p{<%= expression %>}+$/u.test(matchSymbols),
"\\p{<%= expression %>} matches all proper symbols"
);
<% } %>
<% if (nonMatchSymbols.length > 2) { %>
const nonMatchSymbols = <%= nonMatchSymbols %>;
<% if (nonMatchSymbols) { %>
const nonMatchSymbols = buildString(<%= nonMatchSymbols %>);
<%
for (const expression of expressions) {
%>assert(
Expand Down

0 comments on commit b7221f8

Please sign in to comment.