From 7f6d12b428f942a1225af9d22c603adf9e4d619d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Breitbart?= Date: Thu, 31 May 2018 00:27:38 +0200 Subject: [PATCH] grapheme data import --- bin/create-graphemedata.js | 124 +++++++++++++++++++++++++++++++++++++ package.json | 3 +- src/Grapheme.test.ts | 103 ++++++++++++++++++++++++++++++ src/Grapheme.ts | 88 ++++++++++++++++++++++++++ src/GraphemeData.ts | 6 ++ 5 files changed, 323 insertions(+), 1 deletion(-) create mode 100755 bin/create-graphemedata.js create mode 100644 src/Grapheme.test.ts create mode 100644 src/Grapheme.ts create mode 100644 src/GraphemeData.ts diff --git a/bin/create-graphemedata.js b/bin/create-graphemedata.js new file mode 100755 index 0000000000..772d7476a0 --- /dev/null +++ b/bin/create-graphemedata.js @@ -0,0 +1,124 @@ +#!/usr/bin/env node +'use strict'; + +const URL = 'https://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakProperty.txt'; +const PATH = __dirname + '/../src/GraphemeData.ts'; + +const GRAPHEME_REX = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm; + +const TYPES = { + Other: 0, + L: 1, + V: 2, + T: 3, + LV: 4, + LVT: 5, + CR: 6, + LF: 7, + ZWJ: 8, + Prepend: 9, + Control: 10, + Extend: 11, + SpacingMark: 12, + E_Base: 13, + Glue_After_Zwj: 14, + E_Modifier: 15, + E_Base_GAZ: 16, + Regional_Indicator: 17 +}; + +function parseDefinitions(data) { + let codepoints = Object.create(null); + let match = null; + while (match = GRAPHEME_REX.exec(data)) { + let start = parseInt(match[1], 16); + let end = parseInt(match[2], 16) || start; + for (let i = start; i < end + 1; ++i) + codepoints[i] = match[3]; + } + return codepoints; +} + + +function createPackedBMP(codepoints, start, end) { + let type = -1; + let count = 0; + let lengths = []; + let types = []; + for (let i = start; i < end; ++i) { + let t = parseInt(TYPES[codepoints[i] || 'Other']); + if (t !== type) { + lengths.push(count); + types.push(type); + type = t; + count = 0; + } + count++; + if (count === 
255) { + lengths.push(count); + types.push(type); + count = 0; + } + } + lengths.push(count); + types.push(type); + + // remove start entries + lengths.shift(); + types.shift(); + + if (types.length & 1) + types.push(0); + + let accu = 0; + let finalTypes = []; + for (let i = 0; i < types.length; ++i) { + accu <<= 4; + accu |= types[i]; + if (i & 1) { + finalTypes.push(accu); + accu = 0; + } + } + + // null terminate length values + lengths.push(0); + return new Buffer(lengths.concat(finalTypes)).toString('base64'); +} + + +function createGraphemeDataFile(url, path) { + require('https').get(url, (resp) => { + let data = ''; + resp.on('data', (chunk) => { + data += chunk; + }); + resp.on('end', () => { + const codepoints = parseDefinitions(data); + let highest = 0; + for (let el in codepoints) + highest = Math.max(highest, parseInt(el)); + + // codepoint < 12443 + const first = createPackedBMP(codepoints, 0, 12443); + // 42606 <= codepoint < 65536 + const second = createPackedBMP(codepoints, 42606, 65536); + // codepoint <= 65536 + const third = ''; //createPackedHIGH(codepoints, 65536, highest); + + // write to ts file + let final = ''; + final += `// FIRST: 0 <= codepoint < 12443\n`; + final += `export const FIRST: string = '${first}';\n`; + final += `// SECOND: 42606 <= codepoint < 65536\n`; + final += `export const SECOND: string = '${second}';\n`; + final += `// THIRD: codepoint >= 65536\n`; + final += `export const THIRD: string = '${third}';\n`; + require('fs').writeFileSync(path, final); + }); + }).on('error', (err) => { + console.log('error', err.message); + }); +} + +createGraphemeDataFile(URL, PATH); diff --git a/package.json b/package.json index 44cc2e88d2..3a62dc5029 100644 --- a/package.json +++ b/package.json @@ -55,6 +55,7 @@ "prepublish": "npm run build", "coveralls": "gulp coveralls", "webpack": "gulp webpack", - "watch": "gulp watch" + "watch": "gulp watch", + "graphemedata": "node bin/create-graphemedata" } } diff --git a/src/Grapheme.test.ts 
/**
 * Parse the text of a GraphemeBreakProperty.txt style file.
 *
 * Every line of the form `XXXX ; Name` or `XXXX..YYYY ; Name` (hexadecimal
 * codepoints) assigns the property name to the codepoint resp. to every
 * codepoint of the inclusive range. Lines that do not match (comments,
 * blank lines) are skipped.
 *
 * @param data  content of the property file
 * @returns prototype-less map of codepoint -> property name
 */
function parseDefinitions(data: string): {[key: number]: string} {
  // Local regex: a shared /g regex keeps its lastIndex between calls.
  const rex = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm;
  const codepoints: {[key: number]: string} = Object.create(null);
  let match: RegExpExecArray | null;
  while ((match = rex.exec(data)) !== null) {
    const first = parseInt(match[1], 16);
    // the second capture group is only set for ranges
    const last = (match[2] === undefined) ? first : parseInt(match[2], 16);
    for (let cp = first; cp <= last; ++cp) codepoints[cp] = match[3];
  }
  return codepoints;
}
/**
 * Decode packed grapheme type data into a lookup table.
 *
 * Expected layout of the base64 decoded data:
 *   - one byte per run holding the run length,
 *   - a 0x00 byte terminating the run lengths,
 *   - the run types, two per byte (even run index in the high nibble,
 *     odd run index in the low nibble).
 *
 * The returned table stores two codepoints per entry: for the offset
 * `o = codepoint - start` the type is in the low nibble of `table[o >> 1]`
 * if `o` is even and in the high nibble if `o` is odd.
 *
 * Data without a 0x00 terminator contains no runs and yields a zeroed table.
 *
 * @param data   base64 encoded packed data
 * @param start  first codepoint covered by the data (inclusive)
 * @param end    last codepoint covered by the data (exclusive)
 * @returns table with `((end - start) >> 1) + 1` entries
 */
export function loadFromPackedBMP(data: string, start: number, end: number): number[] | Uint8Array {
  // decode base64 into a binary string (one char per byte)
  const raw = (typeof atob === 'undefined')
    // nodejs without atob
    ? Buffer.from(data, 'base64').toString('binary')
    // browser / nodejs with atob
    : atob(data);

  // first occurrence of 0x0 marks the end of the lengths (null terminated)
  const terminator = raw.indexOf('\x00');
  const lengths = (terminator === -1) ? '' : raw.substring(0, terminator);
  const types = raw.substring(terminator + 1);

  // lookup table with 2 type entries per index position
  const size = ((end - start) >> 1) + 1;
  let table: number[] | Uint8Array;
  if (typeof Uint8Array === 'undefined') {
    // zero fill, so both table flavours contain numbers only
    table = new Array(size);
    for (let i = 0; i < size; ++i) table[i] = 0;
  } else {
    table = new Uint8Array(size);
  }

  // expand the runs into the lookup table
  let offset = 0;
  for (let run = 0; run < lengths.length; ++run) {
    // the type is constant for the whole run
    const packed = types.charCodeAt(run >> 1);
    const type = (run & 1) ? packed & 15 : packed >> 4;
    const runEnd = offset + lengths.charCodeAt(run);
    for (; offset < runEnd; ++offset) {
      const slot = offset >> 1;
      // never write beyond the table (a plain array would silently grow)
      if (slot >= size) break;
      table[slot] |= (offset & 1) ? type << 4 : type;
    }
    offset = runEnd;
  }
  return table;
}
table[codepoint >> 1] >> 4 : table[codepoint >> 1] & 15; + } + // TODO codepoint > 65536 + return Types.OTHER; + }; +})(); diff --git a/src/GraphemeData.ts b/src/GraphemeData.ts new file mode 100644 index 0000000000..c9cb30d3ef --- /dev/null +++ b/src/GraphemeData.ts @@ -0,0 +1,6 @@ +// FIRST: 0 <= codepoint < 12443 +export const FIRST: string = 'CgECARJfIQ0B//9UcP8UB/8ILQEBAQIBAgEBOAYKCwEBLhUQAWUHAQEGAgIBBCEBAQEeG1sLOgkiBAEJAQMBBSsDeA4BIAE2AQEBAQMIBAECAQcKAh0BAjgBAQECBAICAgIBCQEKAh0CATgBAQMCBAICAwMBHgIDAQsCATgBAQMFAQIBAQIBFAIWBgEBAjgBAQIBBAICAgIBCAIKAh4BOwEBAQIDAwEDAQkBKAEDOgMEAQMBBAcCCwIdAQI4AQEBAQIBAgEBAgECAgcCCwIcAgI3AgEBAgQBAwEDAQEIAQoCHgJGAQQBAgMBAQEHARICPQEBAQcMCGIBAQEGAQILBkoCGwEBAQEBBAIxDgEFAQIFCwEkCQFmBAEGAQICAhcCAgQDEAQNAQEBAgYBDwFiYEhY/14D////tQMdAx0CHgJAAgEHCAECCwkBLQMBdgIiAXYDBAIDBAIBBgPbAgIBOQEBAQcBAQEBAggGCgIBMA9BBAEvAQEFAQEFAQImCQwCAR4BBAICAQM4AQECAwEBAwIwCAgCApgDAQ0BBwQBBAIBAgECxjoBBf//DQEBAQIYBzEQYCH//////zEBIgEBAVICYgEOAQEEVgH//////48DjQFgIP//LAZpAgCnpqCgoACwCwCwsLCwsJCwoLCwuQsLCwkLCwsLCwsLCwsLm8C8sMvLwLCwvAsLywwMsLCwvAsMsLCwsLCwvAsMsLwMsLCwvAsLywwMsLCwsLy8DAywsLwLwLCwsLC8Cwy8vAvAywsLC8CwvLDAy5CwsMCwvLCwywwLDLCwsMsLCwsLCwsMC8sLCwsLC8sLywywsLCwywsLASMAsAALCwsLC8vLywsLoLCwvLwMvLC8sMvLCwsLywsLC8C8vLy8CwvAy8vLC8vLy8DLywsLywsMsMsLCwAKuKCgoLAAAA0ODg4NDg0OAAAAsLCwALCw'; +// SECOND: 42606 <= codepoint < 65536 +export const SECOND: string = 
'AQQBCiACUAL/EQEDAQQBFwICAVgCMhACGhI0CBkLAgwdAwMBLwECBAIBBCQBQwYCAgICDAEIAQEuATMBAQMCAgUCAQEpAQICBQEB7AIBAgECAQEBEgEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwwXBDEE//////////8I////////////////////////////////////OQH//+MQEBDPAZ4CUAwEAAsLCwsAsLCwy8DAywsLC8AQvAvLy8CwvLywsLwLCwsLCwsMvAywy8vAywRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFAgMKqqqqqgAAAAAAAAAAAAAAAAAAsACwsKCwoA'; +// THIRD: codepoint >= 65536 +export const THIRD: string = '';