-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
323 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#!/usr/bin/env node | ||
'use strict'; | ||
|
||
// Unicode 10.0.0 grapheme break property table; handed to createGraphemeDataFile as download URL. | ||
const URL = 'https://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakProperty.txt'; | ||
// Location of the generated TypeScript module, relative to this script's directory; | ||
// handed to createGraphemeDataFile as output path. | ||
const PATH = __dirname + '/../src/GraphemeData.ts'; | ||
|
||
// Matches one data line: a hex codepoint or a `first..last` hex range (groups 1 and 2), | ||
// a semicolon and the property name (group 3). The `g` and `m` flags let parseDefinitions | ||
// step through a whole multi-line text with repeated exec() calls. | ||
const GRAPHEME_REX = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm; | ||
|
||
// Numeric code for each grapheme break property name. createPackedBMP stores a code | ||
// in 4 bits, so only the codes 0..15 fit into a packed table; 16 and 17 do not. | ||
const TYPES = { | ||
Other: 0, | ||
L: 1, | ||
V: 2, | ||
T: 3, | ||
LV: 4, | ||
LVT: 5, | ||
CR: 6, | ||
LF: 7, | ||
ZWJ: 8, | ||
Prepend: 9, | ||
Control: 10, | ||
Extend: 11, | ||
SpacingMark: 12, | ||
E_Base: 13, | ||
Glue_After_Zwj: 14, | ||
E_Modifier: 15, | ||
E_Base_GAZ: 16, | ||
Regional_Indicator: 17 | ||
}; | ||
|
||
function parseDefinitions(data) { | ||
let codepoints = Object.create(null); | ||
let match = null; | ||
while (match = GRAPHEME_REX.exec(data)) { | ||
let start = parseInt(match[1], 16); | ||
let end = parseInt(match[2], 16) || start; | ||
for (let i = start; i < end + 1; ++i) | ||
codepoints[i] = match[3]; | ||
} | ||
return codepoints; | ||
} | ||
|
||
|
||
function createPackedBMP(codepoints, start, end) { | ||
let type = -1; | ||
let count = 0; | ||
let lengths = []; | ||
let types = []; | ||
for (let i = start; i < end; ++i) { | ||
let t = parseInt(TYPES[codepoints[i] || 'Other']); | ||
if (t !== type) { | ||
lengths.push(count); | ||
types.push(type); | ||
type = t; | ||
count = 0; | ||
} | ||
count++; | ||
if (count === 255) { | ||
lengths.push(count); | ||
types.push(type); | ||
count = 0; | ||
} | ||
} | ||
lengths.push(count); | ||
types.push(type); | ||
|
||
// remove start entries | ||
lengths.shift(); | ||
types.shift(); | ||
|
||
if (types.length & 1) | ||
types.push(0); | ||
|
||
let accu = 0; | ||
let finalTypes = []; | ||
for (let i = 0; i < types.length; ++i) { | ||
accu <<= 4; | ||
accu |= types[i]; | ||
if (i & 1) { | ||
finalTypes.push(accu); | ||
accu = 0; | ||
} | ||
} | ||
|
||
// null terminate length values | ||
lengths.push(0); | ||
return new Buffer(lengths.concat(finalTypes)).toString('base64'); | ||
} | ||
|
||
|
||
/**
 * Download the grapheme break property table from `url`, pack the two BMP
 * ranges that contain non-'Other' codepoints and write them as a TypeScript
 * module to `path`.
 *
 * The output file is only written after a successful (HTTP 200) download
 * that yielded at least one definition. Any failure is reported on stderr
 * and sets a non-zero process exit code; the existing file is left alone.
 *
 * @param url location of GraphemeBreakProperty.txt
 * @param path destination of the generated module
 */
function createGraphemeDataFile(url, path) {
  const fail = (message) => {
    console.error('error', message);
    process.exitCode = 1;
  };

  require('https').get(url, (resp) => {
    if (resp.statusCode !== 200) {
      // drain the response so the socket is released
      resp.resume();
      fail(`unexpected HTTP status ${resp.statusCode} for ${url}`);
      return;
    }
    // decode as text here so multi-byte characters are never split across chunks
    resp.setEncoding('utf8');
    let data = '';
    resp.on('data', (chunk) => {
      data += chunk;
    });
    resp.on('error', (err) => fail(err.message));
    resp.on('end', () => {
      const codepoints = parseDefinitions(data);
      if (Object.keys(codepoints).length === 0) {
        fail(`no grapheme definitions found in ${url}`);
        return;
      }

      // 0 <= codepoint < 12443
      const first = createPackedBMP(codepoints, 0, 12443);
      // 42606 <= codepoint < 65536
      const second = createPackedBMP(codepoints, 42606, 65536);
      // codepoint >= 65536 - not packed yet, emitted as empty string
      const third = '';

      // write to ts file
      let final = '';
      final += `// FIRST: 0 <= codepoint < 12443\n`;
      final += `export const FIRST: string = '${first}';\n`;
      final += `// SECOND: 42606 <= codepoint < 65536\n`;
      final += `export const SECOND: string = '${second}';\n`;
      final += `// THIRD: codepoint >= 65536\n`;
      final += `export const THIRD: string = '${third}';\n`;
      require('fs').writeFileSync(path, final);
    });
  }).on('error', (err) => fail(err.message));
}

createGraphemeDataFile(URL, PATH);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
/** | ||
* Copyright (c) 2018 The xterm.js authors. All rights reserved. | ||
* @license MIT | ||
*/ | ||
|
||
import { FIRST, SECOND } from './GraphemeData'; | ||
import { loadFromPackedBMP, graphemeType } from './Grapheme'; | ||
import * as chai from 'chai'; | ||
|
||
// Numeric code for each grapheme break property name; the tests use it to translate | ||
// the names stored in CODEPOINTS into the codes expected from the lookup tables. | ||
const TYPES = { | ||
Other: 0, | ||
L: 1, | ||
V: 2, | ||
T: 3, | ||
LV: 4, | ||
LVT: 5, | ||
CR: 6, | ||
LF: 7, | ||
ZWJ: 8, | ||
Prepend: 9, | ||
Control: 10, | ||
Extend: 11, | ||
SpacingMark: 12, | ||
E_Base: 13, | ||
Glue_After_Zwj: 14, | ||
E_Modifier: 15, | ||
E_Base_GAZ: 16, | ||
Regional_Indicator: 17 | ||
}; | ||
|
||
// Reference data (Unicode 10.0.0) downloaded by loadUnicodeData. | ||
const URL = 'https://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakProperty.txt'; | ||
// Matches one data line: hex codepoint or `first..last` range (groups 1 and 2), | ||
// a semicolon and the property name (group 3). | ||
const GRAPHEME_REX = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm; | ||
|
||
// Parsed reference data (codepoint -> property name). Stays null until loadUnicodeData | ||
// has received the file; every test returns early while it is null. | ||
let CODEPOINTS = null; | ||
|
||
/**
 * Parse the text of a grapheme break property file into a lookup table.
 *
 * Each line matched by GRAPHEME_REX names a single codepoint or an inclusive
 * `first..last` range together with its property name.
 *
 * @param data file content as one string
 * @returns prototype-less object mapping codepoint -> property name
 */
function parseDefinitions(data: string): {[key: number]: string} {
  const codepoints: {[key: number]: string} = Object.create(null);
  // GRAPHEME_REX is a shared global regex: always scan from the beginning,
  // independent of where a previous use left off
  GRAPHEME_REX.lastIndex = 0;
  let match: RegExpExecArray | null = null;
  while ((match = GRAPHEME_REX.exec(data)) !== null) {
    const start = parseInt(match[1], 16);
    // single codepoint lines have no second group: NaN falls back to start
    const end = parseInt(match[2], 16) || start;
    for (let i = start; i <= end; ++i) codepoints[i] = match[3];
  }
  return codepoints;
}
|
||
/**
 * Download the reference grapheme break data from URL and store the parsed
 * result in CODEPOINTS.
 *
 * @param done completion callback; called without arguments on success and
 *             with an Error when the download fails or does not answer
 *             with HTTP 200, so the failure is reported by the caller
 *             instead of surfacing as an uncaught exception
 */
function loadUnicodeData(done: Function): void {
  // make sure `done` is reported exactly once
  let settled = false;
  const finish = (err?: Error): void => {
    if (settled) return;
    settled = true;
    if (err) {
      done(err);
    } else {
      done();
    }
  };
  const fail = (reason: string): void => finish(new Error('error fetching unicode data: ' + reason));

  require('https').get(URL, (resp): void => {
    if (resp.statusCode !== 200) {
      // drain the response so the socket is released
      resp.resume();
      fail('HTTP status ' + resp.statusCode);
      return;
    }
    // decode as text here so multi-byte characters are never split across chunks
    resp.setEncoding('utf8');
    let data = '';
    resp.on('data', (chunk): void => {
      data += chunk;
    });
    resp.on('error', (err): void => fail(err.message));
    resp.on('end', (): void => {
      CODEPOINTS = parseDefinitions(data);
      finish();
    });
  }).on('error', (err): void => fail(err.message));
}
|
||
describe('grapheme cluster', function (): void {
  // the reference table is fetched once for the whole suite
  before(function(done: Function): void {
    loadUnicodeData(done);
  });

  /**
   * Check every codepoint in [start, end) of an unpacked lookup table against
   * the reference data. The table holds two 4-bit types per entry: even
   * offsets in the low nibble, odd offsets in the high nibble.
   */
  function expectTableMatchesReference(table: number[] | Uint8Array, start: number, end: number): void {
    for (let cp = start; cp < end; ++cp) {
      const expected = TYPES[CODEPOINTS[cp]] || 0;
      const offset = cp - start;
      const entry = table[offset >> 1];
      const actual = (offset & 1) ? entry >> 4 : entry & 15;
      chai.expect(expected).equals(actual);
    }
  }

  describe('correct GraphemeData', function(): void {
    it('FIRST', function(): void {
      if (!CODEPOINTS) return;
      expectTableMatchesReference(loadFromPackedBMP(FIRST, 0, 12443), 0, 12443);
    });
    it('SECOND', function(): void {
      if (!CODEPOINTS) return;
      expectTableMatchesReference(loadFromPackedBMP(SECOND, 42606, 65536), 42606, 65536);
    });
    it('THIRD', function(): void {
      if (!CODEPOINTS) return;
      // TODO
    });
  });

  describe('graphemeType', function(): void {
    it('BMP', function(): void {
      if (!CODEPOINTS) return;
      for (let cp = 0; cp < 65536; ++cp) {
        const expected = TYPES[CODEPOINTS[cp]] || 0;
        chai.expect(graphemeType(cp)).equals(expected);
      }
    });
    it('HIGH', function(): void {
      if (!CODEPOINTS) return;
      // TODO
    });
  });
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
/** | ||
* Copyright (c) 2018 The xterm.js authors. All rights reserved. | ||
* @license MIT | ||
*/ | ||
import { FIRST, SECOND } from './GraphemeData'; | ||
|
||
/**
 * Decode a packed codepoint range into a lookup table.
 *
 * Expected layout of the base64 decoded bytes:
 *   1. run lengths, one byte each
 *   2. a 0x00 byte terminating the run lengths
 *   3. the types of the runs, two per byte (even run index in the high
 *      nibble, odd run index in the low nibble)
 *
 * The returned table stores two 4-bit types per entry: the type of offset
 * `i` (codepoint - start) lives in entry `i >> 1`, in the low nibble for
 * even offsets and in the high nibble for odd offsets.
 *
 * @param data base64 encoded packed data
 * @param start first codepoint of the range (inclusive)
 * @param end end of the range (exclusive)
 * @returns table with ((end - start) >> 1) + 1 entries; a Uint8Array where
 *          available, otherwise a zero filled array
 */
export function loadFromPackedBMP(data: string, start: number, end: number): number[] | Uint8Array {
  // decode base64 into a binary string (one char per byte)
  const raw = (typeof atob === 'undefined')
    // nodejs without global atob
    ? Buffer.from(data, 'base64').toString('binary')
    // browser - FIXME: how to test this?
    : atob(data);
  // first occurrence of 0x0 marks end of lengths (null terminated)
  const terminator = raw.indexOf('\x00');
  const lengths = raw.substring(0, terminator);
  const types = raw.substring(terminator + 1);

  // lookup table with 2 type entries per index position
  const size = ((end - start) >> 1) + 1;
  let table: number[] | Uint8Array;
  if (typeof Uint8Array === 'undefined') {
    // zero fill so untouched entries read the same as in a typed array
    const fallback: number[] = [];
    for (let i = 0; i < size; ++i) fallback.push(0);
    table = fallback;
  } else {
    table = new Uint8Array(size);
  }

  // load data into lookup table
  let codepointOffset = 0;
  for (let chunkIdx = 0; chunkIdx < lengths.length; ++chunkIdx) {
    // the type is constant for the whole run, decode it once
    const tcode = types.charCodeAt(chunkIdx >> 1);
    const type = (chunkIdx & 1) ? tcode & 15 : tcode >> 4;
    const chunkEnd = codepointOffset + lengths.charCodeAt(chunkIdx);
    // type 0 would only OR zeros into the zero initialized table
    if (type !== 0) {
      for (let pos = codepointOffset; pos < chunkEnd; ++pos) {
        table[pos >> 1] |= (pos & 1) ? type << 4 : type;
      }
    }
    codepointOffset = chunkEnd;
  }
  return table;
}
|
||
|
||
// Grapheme break types as returned by graphemeType. | ||
// The lookup tables built by loadFromPackedBMP hold two types per byte (4 bits each), | ||
// so a table lookup can only yield the values 0..15. | ||
// NOTE: Types must be identical to bin/create-graphemedata.js#TYPES | ||
const enum Types { | ||
OTHER = 0, | ||
L = 1, | ||
V = 2, | ||
T = 3, | ||
LV = 4, | ||
LVT = 5, | ||
CR = 6, | ||
LF = 7, | ||
ZWJ = 8, | ||
PREPEND = 9, | ||
CONTROL = 10, | ||
EXTEND = 11, | ||
SPACINGMARK = 12, | ||
E_BASE = 13, | ||
GLUE_AFTER_ZWJ = 14, | ||
E_MODIFIER = 15, | ||
E_BASE_GAZ = 16, | ||
REGIONAL_INDICATOR = 17 | ||
} | ||
|
||
/**
 * Look up the grapheme break type of a codepoint.
 *
 * The two BMP lookup tables are decoded from the packed data on first use
 * and kept for later calls. Codepoints from 65536 upwards are not covered
 * yet and are reported as OTHER.
 */
export const graphemeType = (function(): (codepoint: number) => Types {
  // decoded lazily, see below
  let lowTable = null;
  let highTable = null;

  // read the 4-bit type at `offset` from a table with two types per entry
  // (even offsets in the low nibble, odd offsets in the high nibble)
  function typeAt(table: number[] | Uint8Array, offset: number): number {
    const entry = table[offset >> 1];
    return (offset & 1) ? entry >> 4 : entry & 15;
  }

  return (codepoint: number): Types => {
    // ASCII printable shortcut
    if (codepoint > 31 && codepoint < 127) {
      return Types.OTHER;
    }
    // low BMP table: 0 <= codepoint < 12443
    if (codepoint < 12443) {
      if (!lowTable) {
        lowTable = loadFromPackedBMP(FIRST, 0, 12443);
      }
      return typeAt(lowTable, codepoint);
    }
    // always Other: 12443 <= codepoint < 42606
    if (codepoint < 42606) {
      return Types.OTHER;
    }
    // high BMP table: 42606 <= codepoint < 65536
    if (codepoint < 65536) {
      if (!highTable) {
        highTable = loadFromPackedBMP(SECOND, 42606, 65536);
      }
      return typeAt(highTable, codepoint - 42606);
    }
    // TODO codepoint >= 65536
    return Types.OTHER;
  };
})();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
// FIRST: 0 <= codepoint < 12443 | ||
export const FIRST: string = 'CgECARJfIQ0B//9UcP8UB/8ILQEBAQIBAgEBOAYKCwEBLhUQAWUHAQEGAgIBBCEBAQEeG1sLOgkiBAEJAQMBBSsDeA4BIAE2AQEBAQMIBAECAQcKAh0BAjgBAQECBAICAgIBCQEKAh0CATgBAQMCBAICAwMBHgIDAQsCATgBAQMFAQIBAQIBFAIWBgEBAjgBAQIBBAICAgIBCAIKAh4BOwEBAQIDAwEDAQkBKAEDOgMEAQMBBAcCCwIdAQI4AQEBAQIBAgEBAgECAgcCCwIcAgI3AgEBAgQBAwEDAQEIAQoCHgJGAQQBAgMBAQEHARICPQEBAQcMCGIBAQEGAQILBkoCGwEBAQEBBAIxDgEFAQIFCwEkCQFmBAEGAQICAhcCAgQDEAQNAQEBAgYBDwFiYEhY/14D////tQMdAx0CHgJAAgEHCAECCwkBLQMBdgIiAXYDBAIDBAIBBgPbAgIBOQEBAQcBAQEBAggGCgIBMA9BBAEvAQEFAQEFAQImCQwCAR4BBAICAQM4AQECAwEBAwIwCAgCApgDAQ0BBwQBBAIBAgECxjoBBf//DQEBAQIYBzEQYCH//////zEBIgEBAVICYgEOAQEEVgH//////48DjQFgIP//LAZpAgCnpqCgoACwCwCwsLCwsJCwoLCwuQsLCwkLCwsLCwsLCwsLm8C8sMvLwLCwvAsLywwMsLCwvAsMsLCwsLCwvAsMsLwMsLCwvAsLywwMsLCwsLy8DAywsLwLwLCwsLC8Cwy8vAvAywsLC8CwvLDAy5CwsMCwvLCwywwLDLCwsMsLCwsLCwsMC8sLCwsLC8sLywywsLCwywsLASMAsAALCwsLC8vLywsLoLCwvLwMvLC8sMvLCwsLywsLC8C8vLy8CwvAy8vLC8vLy8DLywsLywsMsMsLCwAKuKCgoLAAAA0ODg4NDg0OAAAAsLCwALCw'; | ||
// SECOND: 42606 <= codepoint < 65536 | ||
export const SECOND: string = 'AQQBCiACUAL/EQEDAQQBFwICAVgCMhACGhI0CBkLAgwdAwMBLwECBAIBBCQBQwYCAgICDAEIAQEuATMBAQMCAgUCAQEpAQICBQEB7AIBAgECAQEBEgEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwwXBDEE//////////8I////////////////////////////////////OQH//+MQEBDPAZ4CUAwEAAsLCwsAsLCwy8DAywsLC8AQvAvLy8CwvLywsLwLCwsLCwsMvAywy8vAywRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFAgMKqqqqqgAAAAAAAAAAAAAAAAAAsACwsKCwoA'; | ||
// THIRD: codepoint >= 65536 | ||
export const THIRD: string = ''; |
7f6d12b
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
7f6d12b