Skip to content

Commit

Permalink
grapheme data import
Browse files Browse the repository at this point in the history
  • Loading branch information
jerch committed May 30, 2018
1 parent b43ee08 commit 7f6d12b
Show file tree
Hide file tree
Showing 5 changed files with 323 additions and 1 deletion.
124 changes: 124 additions & 0 deletions bin/create-graphemedata.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/env node
'use strict';

// Source of the grapheme break property definitions (Unicode 10.0.0).
const URL = 'https://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakProperty.txt';
// Output file written by createGraphemeDataFile.
const PATH = __dirname + '/../src/GraphemeData.ts';

// Matches one definition line of the property file:
//   group 1 - first codepoint (upper case hex)
//   group 2 - optional last codepoint of a `first..last` range
//   group 3 - property name (key into TYPES)
// NOTE: the regex carries the `g` flag and therefore keeps `lastIndex` state
// between exec() calls.
const GRAPHEME_REX = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm;

// Numeric id for every property name, 'Other' is used for codepoints without
// a definition. The ids are what gets packed into the generated data, they
// must stay identical to the `Types` enum in src/Grapheme.ts.
// NOTE: createPackedBMP stores every id in a 4 bit slot, ids above 15
// (E_Base_GAZ, Regional_Indicator) do not fit into that format.
const TYPES = {
Other: 0,
L: 1,
V: 2,
T: 3,
LV: 4,
LVT: 5,
CR: 6,
LF: 7,
ZWJ: 8,
Prepend: 9,
Control: 10,
Extend: 11,
SpacingMark: 12,
E_Base: 13,
Glue_After_Zwj: 14,
E_Modifier: 15,
E_Base_GAZ: 16,
Regional_Indicator: 17
};

/**
 * Extract the codepoint definitions from the text of a grapheme break
 * property file.
 *
 * @param {string} data - file content to scan with GRAPHEME_REX
 * @returns {Object} prototype-less map: codepoint -> property name, every
 *     codepoint of a `first..last` range gets its own entry
 */
function parseDefinitions(data) {
  const table = Object.create(null);
  for (let m = GRAPHEME_REX.exec(data); m !== null; m = GRAPHEME_REX.exec(data)) {
    const first = parseInt(m[1], 16);
    // single codepoint lines have no second group (NaN), fall back to first
    const last = parseInt(m[2], 16) || first;
    let cp = first;
    while (cp <= last) {
      table[cp] = m[3];
      cp++;
    }
  }
  return table;
}


/**
 * Run-length encode the grapheme types of the codepoints in [start, end)
 * and return the result as base64 string.
 *
 * Binary layout before base64 encoding:
 *   - one byte per run holding the run length (1..255), runs longer than 255
 *     are split into several runs of the same type
 *   - a single 0 byte terminating the run lengths
 *   - the run types, two per byte: the first type of a pair in the high
 *     nibble, the second in the low nibble (padded with 0 for an odd count)
 *
 * A run length of 0 is never emitted, since the decoder treats the first
 * 0 byte as the end of the lengths section.
 *
 * @param {Object} codepoints - map codepoint -> property name, missing
 *     entries count as 'Other'
 * @param {number} start - first codepoint (inclusive)
 * @param {number} end - last codepoint (exclusive)
 * @returns {string} base64 encoded packed data
 * @throws {Error} for a property name without an id in TYPES or with an id
 *     that does not fit into 4 bits
 */
function createPackedBMP(codepoints, start, end) {
  const MAX_RUN = 255;  // a run length must fit into one byte
  const MAX_TYPE = 15;  // a type must fit into one nibble
  const lengths = [];
  const types = [];
  let type = -1;
  let count = 0;

  // store the pending run, empty runs are skipped to keep 0 bytes
  // out of the lengths section
  const flush = () => {
    if (count > 0) {
      lengths.push(count);
      types.push(type);
    }
    count = 0;
  };

  for (let i = start; i < end; ++i) {
    const name = codepoints[i] || 'Other';
    const t = TYPES[name];
    if (!Number.isInteger(t) || t < 0 || t > MAX_TYPE) {
      throw new Error(
        `cannot pack grapheme type "${name}" of codepoint 0x${i.toString(16)}`);
    }
    if (t !== type) {
      flush();
      type = t;
    }
    count++;
    if (count === MAX_RUN) flush();
  }
  flush();

  // pad to an even number of types to fill the last byte
  if (types.length & 1) types.push(0);

  const finalTypes = [];
  for (let i = 0; i < types.length; i += 2) {
    finalTypes.push((types[i] << 4) | types[i + 1]);
  }

  // null terminate length values
  lengths.push(0);
  return Buffer.from(lengths.concat(finalTypes)).toString('base64');
}


/**
 * Download the grapheme break property file from `url`, pack the BMP
 * ranges with createPackedBMP and write them as TypeScript module to `path`.
 *
 * The output file is only written after a successful download that
 * contained at least one definition. On any failure the error is reported
 * on stderr and the process exit code is set to 1.
 *
 * @param {string} url - https url of GraphemeBreakProperty.txt
 * @param {string} path - target path of the generated .ts file
 */
function createGraphemeDataFile(url, path) {
  const fail = (message) => {
    console.error('error', message);
    process.exitCode = 1;
  };

  require('https').get(url, (resp) => {
    if (resp.statusCode !== 200) {
      // drain the response to free the socket, never parse an error page
      resp.resume();
      fail(`unexpected HTTP status ${resp.statusCode} for ${url}`);
      return;
    }
    // decode as a stream to keep multi-byte sequences at chunk borders intact
    resp.setEncoding('utf8');
    let data = '';
    resp.on('data', (chunk) => {
      data += chunk;
    });
    resp.on('end', () => {
      const codepoints = parseDefinitions(data);
      if (Object.keys(codepoints).length === 0) {
        fail(`no grapheme definitions found in ${url}`);
        return;
      }

      // 0 <= codepoint < 12443
      const first = createPackedBMP(codepoints, 0, 12443);
      // 42606 <= codepoint < 65536
      // (the gap 12443 <= codepoint < 42606 is not stored)
      const second = createPackedBMP(codepoints, 42606, 65536);
      // codepoint >= 65536
      // TODO: not packed yet, needs the highest defined codepoint as end
      const third = '';

      // write to ts file
      const final = [
        `// FIRST: 0 <= codepoint < 12443`,
        `export const FIRST: string = '${first}';`,
        `// SECOND: 42606 <= codepoint < 65536`,
        `export const SECOND: string = '${second}';`,
        `// THIRD: codepoint >= 65536`,
        `export const THIRD: string = '${third}';`
      ].join('\n') + '\n';
      require('fs').writeFileSync(path, final);
    });
  }).on('error', (err) => {
    fail(err.message);
  });
}

// script entry point: fetch URL and (re)generate the data file at PATH
createGraphemeDataFile(URL, PATH);
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
"prepublish": "npm run build",
"coveralls": "gulp coveralls",
"webpack": "gulp webpack",
"watch": "gulp watch"
"watch": "gulp watch",
"graphemedata": "node bin/create-graphemedata"
}
}
103 changes: 103 additions & 0 deletions src/Grapheme.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/**
* Copyright (c) 2018 The xterm.js authors. All rights reserved.
* @license MIT
*/

import { FIRST, SECOND } from './GraphemeData';
import { loadFromPackedBMP, graphemeType } from './Grapheme';
import * as chai from 'chai';

// Numeric id for every property name of the unicode file. Used to translate
// the names returned by parseDefinitions into the values expected from the
// packed tables and from graphemeType (missing names are compared as 0).
const TYPES = {
Other: 0,
L: 1,
V: 2,
T: 3,
LV: 4,
LVT: 5,
CR: 6,
LF: 7,
ZWJ: 8,
Prepend: 9,
Control: 10,
Extend: 11,
SpacingMark: 12,
E_Base: 13,
Glue_After_Zwj: 14,
E_Modifier: 15,
E_Base_GAZ: 16,
Regional_Indicator: 17
};

// Reference data the tests compare against (Unicode 10.0.0).
const URL = 'https://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakProperty.txt';
// Matches one definition line: first codepoint (hex), optional last codepoint
// of a `first..last` range and the property name.
const GRAPHEME_REX = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm;

// Map codepoint -> property name, set by loadUnicodeData.
// Tests return early without any assertion as long as this is null.
let CODEPOINTS = null;

/**
 * Extract the codepoint definitions from the text of a grapheme break
 * property file.
 *
 * @param data file content to scan with GRAPHEME_REX
 * @returns prototype-less map: codepoint -> property name (a key of TYPES),
 *     every codepoint of a `first..last` range gets its own entry
 */
function parseDefinitions(data: string): {[key: number]: string} {
  const codepoints: {[key: number]: string} = Object.create(null);
  let match = null;
  while (match = GRAPHEME_REX.exec(data)) {
    const start = parseInt(match[1], 16);
    // single codepoint lines have no second group (NaN), fall back to start
    const end = parseInt(match[2], 16) || start;
    for (let i = start; i <= end; ++i) codepoints[i] = match[3];
  }
  return codepoints;
}

/**
 * Download the unicode grapheme definitions from URL and store the parsed
 * result in CODEPOINTS.
 *
 * @param done completion callback, called without arguments on success and
 *     with an Error if the download failed. Failures are reported through
 *     the callback, since an exception thrown from within the async response
 *     handlers cannot be caught by the caller.
 */
function loadUnicodeData(done: Function): void {
  require('https').get(URL, (resp): any => {
    if (resp.statusCode !== 200) {
      // drain the response to free the socket, never parse an error page
      resp.resume();
      done(new Error('error fetching unicode data: HTTP status ' + resp.statusCode));
      return;
    }
    // decode as a stream to keep multi-byte sequences at chunk borders intact
    resp.setEncoding('utf8');
    let data = '';
    resp.on('data', (chunk): void => {
      data += chunk;
    });
    resp.on('end', () => {
      CODEPOINTS = parseDefinitions(data);
      done();
    });
  }).on('error', (err) => {
    done(new Error('error fetching unicode data: ' + err.message));
  });
}

// Compares the packed tables and graphemeType against the definitions of the
// unicode file loaded in the before hook. Every assertion names the checked
// codepoint and follows expect(actual).equals(expected), so a failure report
// shows which codepoint is wrong and which value was expected.
describe('grapheme cluster', function (): void {
  before(function(done: Function): void {
    loadUnicodeData(done);
  });
  describe('correct GraphemeData', function(): void {
    it('FIRST', function(): void {
      if (!CODEPOINTS) return;
      const table = loadFromPackedBMP(FIRST, 0, 12443);
      for (let cp = 0; cp < 12443; ++cp) {
        const expected = TYPES[CODEPOINTS[cp]] || 0;
        // two types per table entry: even codepoints in the low nibble
        const actual = (cp & 1) ? table[cp >> 1] >> 4 : table[cp >> 1] & 15;
        chai.expect(actual, 'codepoint ' + cp).equals(expected);
      }
    });
    it('SECOND', function(): void {
      if (!CODEPOINTS) return;
      const table = loadFromPackedBMP(SECOND, 42606, 65536);
      for (let cp = 42606; cp < 65536; ++cp) {
        const expected = TYPES[CODEPOINTS[cp]] || 0;
        // table positions are relative to the start of the range
        const idx = cp - 42606;
        const actual = (idx & 1) ? table[idx >> 1] >> 4 : table[idx >> 1] & 15;
        chai.expect(actual, 'codepoint ' + cp).equals(expected);
      }
    });
    it('THIRD', function(): void {
      if (!CODEPOINTS) return;
      // TODO
    });
  });
  describe('graphemeType', function(): void {
    it('BMP', function(): void {
      if (!CODEPOINTS) return;
      for (let cp = 0; cp < 65536; ++cp) {
        chai.expect(graphemeType(cp), 'codepoint ' + cp).equals(TYPES[CODEPOINTS[cp]] || 0);
      }
    });
    it('HIGH', function(): void {
      if (!CODEPOINTS) return;
      // TODO
    });
  });
});
88 changes: 88 additions & 0 deletions src/Grapheme.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/**
* Copyright (c) 2018 The xterm.js authors. All rights reserved.
* @license MIT
*/
import { FIRST, SECOND } from './GraphemeData';

/**
 * Decode a packed grapheme type string into a lookup table.
 *
 * Expected layout of the base64 decoded data:
 *   - one byte per run holding the run length
 *   - a 0 byte terminating the run lengths
 *   - the run types, two per byte (first type of a pair in the high nibble)
 *
 * The returned table holds two types per entry. The type of the codepoint
 * at position `pos = codepoint - start` is stored at index `pos >> 1`, in
 * the low nibble for even and in the high nibble for odd positions.
 *
 * @param data base64 encoded packed data
 * @param start first codepoint of the packed range (inclusive)
 * @param end last codepoint of the packed range (exclusive)
 * @returns zero initialized table with `((end - start) >> 1) + 1` entries,
 *     a Uint8Array where available, a plain array otherwise
 */
export function loadFromPackedBMP(data: string, start: number, end: number): number[] | Uint8Array {
  // decode base64 into a binary string (one char per byte)
  const raw = (typeof atob === 'undefined')
    // nodejs
    ? Buffer.from(data, 'base64').toString('binary')
    // browser - FIXME: how to test this?
    : atob(data);

  // first occurrence of 0x0 marks end of lengths (null terminated)
  const terminator = raw.indexOf('\x00');
  const lengths = raw.substring(0, terminator);
  const types = raw.substring(terminator + 1);

  // lookup table with 2 type entries per index position
  const size = ((end - start) >> 1) + 1;
  let table: number[] | Uint8Array;
  if (typeof Uint8Array === 'undefined') {
    // zero fill to behave like the typed array for codepoints without a run
    const zeroed: number[] = [];
    for (let i = 0; i < size; ++i) zeroed.push(0);
    table = zeroed;
  } else {
    table = new Uint8Array(size);
  }

  // load data into lookup table
  let position = 0;
  for (let chunkIdx = 0; chunkIdx < lengths.length; ++chunkIdx) {
    const runLength = lengths.charCodeAt(chunkIdx);
    // the type is the same for the whole run, decode it only once
    const packed = types.charCodeAt(chunkIdx >> 1);
    const type = (chunkIdx & 1) ? packed & 15 : packed >> 4;
    for (const runEnd = position + runLength; position < runEnd; ++position) {
      table[position >> 1] |= (position & 1) ? type << 4 : type;
    }
  }
  return table;
}


/**
 * Grapheme type ids as returned by graphemeType.
 * The numeric values are the ones stored in the packed data of
 * GraphemeData.ts and must not be changed independently of the generator.
 */
// NOTE: Types must be identical to bin/create-graphemedata.js#TYPES
const enum Types {
// default for codepoints without a packed type
OTHER = 0,
L = 1,
V = 2,
T = 3,
LV = 4,
LVT = 5,
CR = 6,
LF = 7,
ZWJ = 8,
PREPEND = 9,
CONTROL = 10,
EXTEND = 11,
SPACINGMARK = 12,
E_BASE = 13,
GLUE_AFTER_ZWJ = 14,
E_MODIFIER = 15,
// NOTE: the BMP lookup tables read 4 bit values, ids above 15 can
// never be returned from them
E_BASE_GAZ = 16,
REGIONAL_INDICATOR = 17
}

/**
 * Get the grapheme type of a codepoint.
 *
 * The lookup tables are decoded from the packed data on first use and kept
 * in the closure afterwards. Codepoints outside of the packed ranges are
 * reported as Types.OTHER.
 *
 * @param codepoint unicode codepoint to look up
 * @returns grapheme type of the codepoint
 */
export const graphemeType = (function(): (codepoint: number) => Types {
  // lazily decoded tables for the two packed BMP ranges
  let lowTable: number[] | Uint8Array | null = null;
  let highTable: number[] | Uint8Array | null = null;

  // two types per entry: even positions in the low, odd in the high nibble
  const typeAt = (table: number[] | Uint8Array, pos: number): Types => {
    const entry = table[pos >> 1];
    if (pos & 1) {
      return entry >> 4;
    }
    return entry & 15;
  };

  return (codepoint: number): Types => {
    // shortcut for printable ASCII
    if (codepoint > 31 && codepoint < 127) {
      return Types.OTHER;
    }
    // first packed range: 0 <= codepoint < 12443
    if (codepoint < 12443) {
      if (!lowTable) {
        lowTable = loadFromPackedBMP(FIRST, 0, 12443);
      }
      return typeAt(lowTable, codepoint);
    }
    // nothing packed for 12443 <= codepoint < 42606, always Other
    if (codepoint < 42606) {
      return Types.OTHER;
    }
    // second packed range: 42606 <= codepoint < 65536
    if (codepoint < 65536) {
      if (!highTable) {
        highTable = loadFromPackedBMP(SECOND, 42606, 65536);
      }
      return typeAt(highTable, codepoint - 42606);
    }
    // TODO codepoint >= 65536
    return Types.OTHER;
  };
})();
6 changes: 6 additions & 0 deletions src/GraphemeData.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Generated by bin/create-graphemedata.js (npm run graphemedata) - do not edit by hand.
// FIRST: 0 <= codepoint < 12443
export const FIRST: string = 'CgECARJfIQ0B//9UcP8UB/8ILQEBAQIBAgEBOAYKCwEBLhUQAWUHAQEGAgIBBCEBAQEeG1sLOgkiBAEJAQMBBSsDeA4BIAE2AQEBAQMIBAECAQcKAh0BAjgBAQECBAICAgIBCQEKAh0CATgBAQMCBAICAwMBHgIDAQsCATgBAQMFAQIBAQIBFAIWBgEBAjgBAQIBBAICAgIBCAIKAh4BOwEBAQIDAwEDAQkBKAEDOgMEAQMBBAcCCwIdAQI4AQEBAQIBAgEBAgECAgcCCwIcAgI3AgEBAgQBAwEDAQEIAQoCHgJGAQQBAgMBAQEHARICPQEBAQcMCGIBAQEGAQILBkoCGwEBAQEBBAIxDgEFAQIFCwEkCQFmBAEGAQICAhcCAgQDEAQNAQEBAgYBDwFiYEhY/14D////tQMdAx0CHgJAAgEHCAECCwkBLQMBdgIiAXYDBAIDBAIBBgPbAgIBOQEBAQcBAQEBAggGCgIBMA9BBAEvAQEFAQEFAQImCQwCAR4BBAICAQM4AQECAwEBAwIwCAgCApgDAQ0BBwQBBAIBAgECxjoBBf//DQEBAQIYBzEQYCH//////zEBIgEBAVICYgEOAQEEVgH//////48DjQFgIP//LAZpAgCnpqCgoACwCwCwsLCwsJCwoLCwuQsLCwkLCwsLCwsLCwsLm8C8sMvLwLCwvAsLywwMsLCwvAsMsLCwsLCwvAsMsLwMsLCwvAsLywwMsLCwsLy8DAywsLwLwLCwsLC8Cwy8vAvAywsLC8CwvLDAy5CwsMCwvLCwywwLDLCwsMsLCwsLCwsMC8sLCwsLC8sLywywsLCwywsLASMAsAALCwsLC8vLywsLoLCwvLwMvLC8sMvLCwsLywsLC8C8vLy8CwvAy8vLC8vLy8DLywsLywsMsMsLCwAKuKCgoLAAAA0ODg4NDg0OAAAAsLCwALCw';
// SECOND: 42606 <= codepoint < 65536
export const SECOND: string = 'AQQBCiACUAL/EQEDAQQBFwICAVgCMhACGhI0CBkLAgwdAwMBLwECBAIBBCQBQwYCAgICDAEIAQEuATMBAQMCAgUCAQEpAQICBQEB7AIBAgECAQEBEgEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwwXBDEE//////////8I////////////////////////////////////OQH//+MQEBDPAZ4CUAwEAAsLCwsAsLCwy8DAywsLC8AQvAvLy8CwvLywsLwLCwsLCwsMvAywy8vAywRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFAgMKqqqqqgAAAAAAAAAAAAAAAAAAsACwsKCwoA';
// THIRD: codepoint >= 65536
export const THIRD: string = '';

1 comment on commit 7f6d12b

@TK101518
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.