-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
grapheme support #1478
Closed
Closed
grapheme support #1478
Changes from 2 commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#!/usr/bin/env node | ||
'use strict'; | ||
|
||
const URL = 'https://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakProperty.txt'; | ||
const PATH = __dirname + '/../src/GraphemeData.ts'; | ||
|
||
const GRAPHEME_REX = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm; | ||
|
||
// Numeric code assigned to each Grapheme_Cluster_Break property name that | ||
// GRAPHEME_REX captures from the data file; unlisted codepoints are treated as 'Other'. | ||
// createPackedBMP packs two codes per byte (4 bits each), so only codes 0..15 | ||
// survive packing - E_Base_GAZ (16) and Regional_Indicator (17) do not fit. | ||
const TYPES = { | ||
Other: 0, | ||
L: 1, | ||
V: 2, | ||
T: 3, | ||
LV: 4, | ||
LVT: 5, | ||
CR: 6, | ||
LF: 7, | ||
ZWJ: 8, | ||
Prepend: 9, | ||
Control: 10, | ||
Extend: 11, | ||
SpacingMark: 12, | ||
E_Base: 13, | ||
Glue_After_Zwj: 14, | ||
E_Modifier: 15, | ||
E_Base_GAZ: 16, | ||
Regional_Indicator: 17 | ||
}; | ||
|
||
/**
 * Turn the text of a Unicode break property file into a lookup object.
 *
 * Every line matched by GRAPHEME_REX contributes either a single codepoint
 * ("XXXX ; Name") or an inclusive range ("XXXX..YYYY ; Name").
 *
 * @param data full text of the property file
 * @returns prototype-less object mapping codepoint -> property name
 */
function parseDefinitions(data) {
  const table = Object.create(null);
  for (let hit = GRAPHEME_REX.exec(data); hit; hit = GRAPHEME_REX.exec(data)) {
    const first = parseInt(hit[1], 16);
    // entries without a "..YYYY" part parse to NaN and collapse to one codepoint
    const parsedLast = parseInt(hit[2], 16);
    const last = parsedLast ? parsedLast : first;
    let cp = first;
    while (cp <= last) {
      table[cp] = hit[3];
      cp++;
    }
  }
  return table;
}
|
||
|
||
/**
 * Run-length encode the grapheme break types of all codepoints in
 * [start, end) and return the result as a base64 string.
 *
 * Decoded byte layout:
 *   - one byte per run holding the run length (1..255); runs longer than 255
 *     are split into several entries of the same type
 *   - a single 0x00 byte terminating the length list
 *   - the run types, two per byte: the earlier run in the high nibble, the
 *     later one in the low nibble (padded with 0 for an odd number of runs)
 *
 * A length byte is never 0, because the decoder treats the first 0x00 as the
 * end of the length list.
 *
 * @param codepoints lookup of codepoint -> property name; missing entries count as 'Other'
 * @param start first codepoint (inclusive)
 * @param end last codepoint (exclusive)
 * @returns base64 encoded packed table
 * @throws {RangeError} if a property name is unknown to TYPES or its code needs more than 4 bits
 */
function createPackedBMP(codepoints, start, end) {
  const lengths = [];
  const types = [];
  let type = -1;
  let count = 0;

  // close the current run; empty runs are dropped so no 0 ends up in the length list
  const flush = () => {
    if (count > 0) {
      lengths.push(count);
      types.push(type);
    }
    count = 0;
  };

  for (let i = start; i < end; ++i) {
    const name = codepoints[i] || 'Other';
    const t = TYPES[name];
    if (!Number.isInteger(t) || t < 0 || t > 15) {
      throw new RangeError(
        `grapheme type "${name}" of codepoint ${i} cannot be stored in 4 bits`);
    }
    // start a new run on a type change or when the length byte is full
    if (t !== type || count === 255) {
      flush();
      type = t;
    }
    count++;
  }
  flush();

  // pad to an even number of types so every byte carries two nibbles
  if (types.length & 1)
    types.push(0);

  const finalTypes = [];
  for (let i = 0; i < types.length; i += 2)
    finalTypes.push((types[i] << 4) | types[i + 1]);

  // null terminate length values
  lengths.push(0);
  return Buffer.from(lengths.concat(finalTypes)).toString('base64');
}
|
||
|
||
/**
 * Download the Unicode grapheme break property file from `url`, pack the BMP
 * ranges with createPackedBMP and write them as a TypeScript module to `path`.
 *
 * Nothing is written unless the download succeeded with HTTP status 200, so a
 * server error page can never be turned into an (all 'Other') data file.
 * On failure the error is logged and the process exit code is set to 1.
 *
 * @param url location of GraphemeBreakProperty.txt
 * @param path target path of the generated .ts file
 */
function createGraphemeDataFile(url, path) {
  const fail = (message) => {
    console.log('error', message);
    process.exitCode = 1;
  };

  require('https').get(url, (resp) => {
    if (resp.statusCode !== 200) {
      // drain the response so the socket is released
      resp.resume();
      fail(`unexpected HTTP status ${resp.statusCode} for ${url}`);
      return;
    }

    // decode as text up front so multi-byte sequences split across chunks stay intact
    resp.setEncoding('utf8');
    let data = '';
    resp.on('data', (chunk) => {
      data += chunk;
    });
    resp.on('end', () => {
      const codepoints = parseDefinitions(data);

      // 0 <= codepoint < 12443
      const first = createPackedBMP(codepoints, 0, 12443);
      // 42606 <= codepoint < 65536
      const second = createPackedBMP(codepoints, 42606, 65536);
      // codepoint >= 65536
      // TODO: pack the non-BMP range up to the highest defined codepoint
      const third = '';

      // write to ts file
      let final = '';
      final += `// FIRST: 0 <= codepoint < 12443\n`;
      final += `export const FIRST: string = '${first}';\n`;
      final += `// SECOND: 42606 <= codepoint < 65536\n`;
      final += `export const SECOND: string = '${second}';\n`;
      final += `// THIRD: codepoint >= 65536\n`;
      final += `export const THIRD: string = '${third}';\n`;
      require('fs').writeFileSync(path, final);
    });
  }).on('error', (err) => {
    fail(err.message);
  });
}
|
||
createGraphemeDataFile(URL, PATH); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
/** | ||
* Copyright (c) 2018 The xterm.js authors. All rights reserved. | ||
* @license MIT | ||
*/ | ||
|
||
import { FIRST, SECOND } from './GraphemeData'; | ||
import { loadFromPackedBMP, graphemeType } from './Grapheme'; | ||
import * as chai from 'chai'; | ||
|
||
// Expected numeric code per Grapheme_Cluster_Break property name. | ||
// The tests look up TYPES[CODEPOINTS[cp]] and fall back to 0 (Other) for | ||
// codepoints without an entry. | ||
const TYPES = { | ||
Other: 0, | ||
L: 1, | ||
V: 2, | ||
T: 3, | ||
LV: 4, | ||
LVT: 5, | ||
CR: 6, | ||
LF: 7, | ||
ZWJ: 8, | ||
Prepend: 9, | ||
Control: 10, | ||
Extend: 11, | ||
SpacingMark: 12, | ||
E_Base: 13, | ||
Glue_After_Zwj: 14, | ||
E_Modifier: 15, | ||
E_Base_GAZ: 16, | ||
Regional_Indicator: 17 | ||
}; | ||
|
||
const URL = 'https://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakProperty.txt'; | ||
const GRAPHEME_REX = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm; | ||
|
||
let CODEPOINTS = null; | ||
|
||
/**
 * Parse the text of a Unicode break property file into a codepoint lookup.
 *
 * Each line matched by GRAPHEME_REX defines either one codepoint
 * ("XXXX ; Name") or an inclusive range ("XXXX..YYYY ; Name").
 *
 * @param data full text of the property file
 * @returns prototype-less map of codepoint -> property name (e.g. 'Extend')
 */
function parseDefinitions(data: string): {[key: number]: string} {
  const codepoints: {[key: number]: string} = Object.create(null);
  let match: RegExpExecArray | null = null;
  while ((match = GRAPHEME_REX.exec(data)) !== null) {
    const start = parseInt(match[1], 16);
    // single codepoint entries carry no "..YYYY" part (NaN) and collapse to start
    const end = parseInt(match[2], 16) || start;
    for (let i = start; i <= end; ++i) codepoints[i] = match[3];
  }
  return codepoints;
}
|
||
/**
 * Fetch the Unicode break property file from URL, parse it into CODEPOINTS
 * and signal completion through `done`.
 *
 * Failures (network error or a non-200 response) are reported by calling
 * `done` with an Error instead of throwing from inside an event handler, so
 * the calling hook fails with the actual cause.
 *
 * @param done completion callback; called without arguments on success and
 *             with an Error on failure
 */
function loadUnicodeData(done: Function): void {
  require('https').get(URL, (resp: any): void => {
    if (resp.statusCode !== 200) {
      // drain the response so the socket is released
      resp.resume();
      done(new Error('error fetching unicode data: HTTP status ' + resp.statusCode));
      return;
    }
    // decode as text up front so multi-byte sequences split across chunks stay intact
    resp.setEncoding('utf8');
    let data = '';
    resp.on('data', (chunk: string): void => {
      data += chunk;
    });
    resp.on('end', (): void => {
      CODEPOINTS = parseDefinitions(data);
      done();
    });
  }).on('error', (err: Error): void => {
    done(new Error('error fetching unicode data: ' + err.message));
  });
}
|
||
describe('grapheme cluster', function (): void {
  // the reference data is downloaded once for the whole suite
  before(function(done: Function): void {
    this.timeout(5000);
    loadUnicodeData(done);
  });

  // the packed tables must reproduce the reference data entry by entry
  describe('correct GraphemeData', function(): void {
    it('FIRST', function(): void {
      if (CODEPOINTS) {
        const table = loadFromPackedBMP(FIRST, 0, 12443);
        for (let cp = 0; cp < 12443; ++cp) {
          const expected = TYPES[CODEPOINTS[cp]] || 0;
          // two entries per slot: odd index -> high nibble, even index -> low nibble
          const slot = table[cp >> 1];
          const stored = (cp & 1) ? slot >> 4 : slot & 15;
          chai.expect(expected).equals(stored);
        }
      }
    });
    it('SECOND', function(): void {
      if (CODEPOINTS) {
        const table = loadFromPackedBMP(SECOND, 42606, 65536);
        for (let cp = 42606; cp < 65536; ++cp) {
          const expected = TYPES[CODEPOINTS[cp]] || 0;
          // the table is indexed relative to the start of its range
          const offset = cp - 42606;
          const slot = table[offset >> 1];
          const stored = (offset & 1) ? slot >> 4 : slot & 15;
          chai.expect(expected).equals(stored);
        }
      }
    });
    it('THIRD', function(): void {
      if (CODEPOINTS) {
        // TODO
      }
    });
  });

  // the public lookup must agree with the reference data
  describe('graphemeType', function(): void {
    it('BMP', function(): void {
      if (CODEPOINTS) {
        let cp = 0;
        while (cp < 65536) {
          chai.expect(graphemeType(cp)).equals(TYPES[CODEPOINTS[cp]] || 0);
          ++cp;
        }
      }
    });
    it('HIGH', function(): void {
      if (CODEPOINTS) {
        // TODO
      }
    });
  });
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
/** | ||
* Copyright (c) 2018 The xterm.js authors. All rights reserved. | ||
* @license MIT | ||
*/ | ||
import { FIRST, SECOND } from './GraphemeData'; | ||
|
||
/**
 * Decode a packed grapheme type table into a lookup table with two 4-bit
 * type entries per slot.
 *
 * Expected layout of the base64 decoded input:
 *   - run lengths, one byte each, terminated by the first 0x00 byte
 *   - run types, two per byte (even run index -> high nibble, odd -> low nibble)
 *
 * Layout of the result: the entry for codepoint `cp` lives in slot
 * `(cp - start) >> 1`; even offsets use the low nibble, odd offsets the high
 * nibble.
 *
 * @param data base64 encoded packed table
 * @param start first codepoint covered by the table (inclusive)
 * @param end last codepoint covered by the table (exclusive)
 * @returns zero initialised table of `((end - start) >> 1) + 1` slots filled from `data`
 */
export function loadFromPackedBMP(data: string, start: number, end: number): number[] | Uint8Array {
  // decode base64 into a binary string (one char per byte)
  const raw = (typeof atob === 'undefined')
    // nodejs without atob
    ? Buffer.from(data, 'base64').toString('binary')
    // browser - FIXME: how to test this?
    : atob(data);

  // first occurrence of 0x0 marks end of lengths (null terminated)
  const terminator = raw.indexOf('\x00');
  const lengths = raw.substring(0, terminator);
  const types = raw.substring(terminator + 1);

  // lookup table with 2 type entries per index position
  const size = ((end - start) >> 1) + 1;
  let table: number[] | Uint8Array;
  if (typeof Uint8Array === 'undefined') {
    // plain array fallback: fill explicitly so no slot is left as a hole
    table = [];
    for (let i = 0; i < size; ++i) table.push(0);
  } else {
    table = new Uint8Array(size);
  }

  // load data into lookup table
  let codepointOffset = 0;
  for (let chunkIdx = 0; chunkIdx < lengths.length; ++chunkIdx) {
    const currentLength = lengths.charCodeAt(chunkIdx);
    // the type is the same for the whole run, decode it once
    const tcode = types.charCodeAt(chunkIdx >> 1);
    const type = (chunkIdx & 1) ? tcode & 15 : tcode >> 4;
    // runs of type 0 leave the zero initialised slots untouched
    if (type !== 0) {
      for (let chunkPos = 0; chunkPos < currentLength; ++chunkPos) {
        const offset = codepointOffset + chunkPos;
        table[offset >> 1] |= (offset & 1) ? type << 4 : type;
      }
    }
    codepointOffset += currentLength;
  }
  return table;
}
|
||
|
||
// NOTE: Types must be identical to bin/create-graphemedata.js#TYPES | ||
// loadFromPackedBMP and graphemeType read table entries as 4-bit values | ||
// (`& 15` / `>> 4`), so values above 15 (E_BASE_GAZ, REGIONAL_INDICATOR) | ||
// cannot be returned from the packed BMP tables. | ||
const enum Types { | ||
OTHER = 0, | ||
L = 1, | ||
V = 2, | ||
T = 3, | ||
LV = 4, | ||
LVT = 5, | ||
CR = 6, | ||
LF = 7, | ||
ZWJ = 8, | ||
PREPEND = 9, | ||
CONTROL = 10, | ||
EXTEND = 11, | ||
SPACINGMARK = 12, | ||
E_BASE = 13, | ||
GLUE_AFTER_ZWJ = 14, | ||
E_MODIFIER = 15, | ||
E_BASE_GAZ = 16, | ||
REGIONAL_INDICATOR = 17 | ||
} | ||
|
||
/**
 * Look up the grapheme break type of a codepoint.
 *
 * The two BMP tables are decoded from FIRST / SECOND on first use and kept
 * for later calls. Codepoints in 12443..42605 and at or above 65536 are
 * reported as Types.OTHER without a table lookup.
 */
export const graphemeType = (function(): (codepoint: number) => Types {
  // decoded tables, created lazily
  let lowTable: number[] | Uint8Array | null = null;
  let highTable: number[] | Uint8Array | null = null;

  // two entries per slot: odd index -> high nibble, even index -> low nibble
  const entryAt = (table: number[] | Uint8Array, index: number): Types => {
    const slot = table[index >> 1];
    if (index & 1) {
      return slot >> 4;
    }
    return slot & 15;
  };

  return (codepoint: number): Types => {
    // ASCII printable shortcut
    if (codepoint > 31 && codepoint < 127) {
      return Types.OTHER;
    }

    // low BMP table: 0 <= codepoint < 12443
    if (codepoint < 12443) {
      if (!lowTable) {
        lowTable = loadFromPackedBMP(FIRST, 0, 12443);
      }
      return entryAt(lowTable, codepoint);
    }

    // always Other: 12443 <= codepoint < 42606
    if (codepoint < 42606) {
      return Types.OTHER;
    }

    // high BMP table: 42606 <= codepoint < 65536, indexed relative to 42606
    if (codepoint < 65536) {
      if (!highTable) {
        highTable = loadFromPackedBMP(SECOND, 42606, 65536);
      }
      return entryAt(highTable, codepoint - 42606);
    }

    // TODO codepoint > 65536
    return Types.OTHER;
  };
})();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
// FIRST: 0 <= codepoint < 12443 | ||
export const FIRST: string = 'CgECARJfIQ0B//9UcP8UB/8ILQEBAQIBAgEBOAYKCwEBLhUQAWUHAQEGAgIBBCEBAQEeG1sLOgkiBAEJAQMBBSsDeA4BIAE2AQEBAQMIBAECAQcKAh0BAjgBAQECBAICAgIBCQEKAh0CATgBAQMCBAICAwMBHgIDAQsCATgBAQMFAQIBAQIBFAIWBgEBAjgBAQIBBAICAgIBCAIKAh4BOwEBAQIDAwEDAQkBKAEDOgMEAQMBBAcCCwIdAQI4AQEBAQIBAgEBAgECAgcCCwIcAgI3AgEBAgQBAwEDAQEIAQoCHgJGAQQBAgMBAQEHARICPQEBAQcMCGIBAQEGAQILBkoCGwEBAQEBBAIxDgEFAQIFCwEkCQFmBAEGAQICAhcCAgQDEAQNAQEBAgYBDwFiYEhY/14D////tQMdAx0CHgJAAgEHCAECCwkBLQMBdgIiAXYDBAIDBAIBBgPbAgIBOQEBAQcBAQEBAggGCgIBMA9BBAEvAQEFAQEFAQImCQwCAR4BBAICAQM4AQECAwEBAwIwCAgCApgDAQ0BBwQBBAIBAgECxjoBBf//DQEBAQIYBzEQYCH//////zEBIgEBAVICYgEOAQEEVgH//////48DjQFgIP//LAZpAgCnpqCgoACwCwCwsLCwsJCwoLCwuQsLCwkLCwsLCwsLCwsLm8C8sMvLwLCwvAsLywwMsLCwvAsMsLCwsLCwvAsMsLwMsLCwvAsLywwMsLCwsLy8DAywsLwLwLCwsLC8Cwy8vAvAywsLC8CwvLDAy5CwsMCwvLCwywwLDLCwsMsLCwsLCwsMC8sLCwsLC8sLywywsLCwywsLASMAsAALCwsLC8vLywsLoLCwvLwMvLC8sMvLCwsLywsLC8C8vLy8CwvAy8vLC8vLy8DLywsLywsMsMsLCwAKuKCgoLAAAA0ODg4NDg0OAAAAsLCwALCw'; | ||
// SECOND: 42606 <= codepoint < 65536 | ||
export const SECOND: string = 'AQQBCiACUAL/EQEDAQQBFwICAVgCMhACGhI0CBkLAgwdAwMBLwECBAIBBCQBQwYCAgICDAEIAQEuATMBAQMCAgUCAQEpAQICBQEB7AIBAgECAQEBEgEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwwXBDEE//////////8I////////////////////////////////////OQH//+MQEBDPAZ4CUAwEAAsLCwsAsLCwy8DAywsLC8AQvAvLy8CwvLywsLwLCwsLCwsMvAywy8vAywRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFAgMKqqqqqgAAAAAAAAAAAAAAAAAAsACwsKCwoA'; | ||
// THIRD: codepoint >= 65536 | ||
export const THIRD: string = ''; |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we expect the data file to ever change? If not I'd prefer to do a one time conversion and not include conversion scripts/data files in the repo
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It changes at least with every major release and the consortium updates the spec in a yearly fashion. In 2 months we should see unicode v11.0.
The changes address mostly new chars and updates of drafted specs, it seems the older a char the less likely is an update.
I am not a fan of the code generation script either. It may introduce several problems due to unexpected changes in the spec. One example is Regional_Indicator from 9.0 to 10.0, where they even gave up the previous-next rule; that will not work correctly until it is explicitly handled in the corresponding code. Maybe we should tag a supported Unicode version and stick to it for some time?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Edit: The data generator could be parked in some dev module to avoid rewriting it once we decide to upgrade to a newer unicode version in future.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
xtermjs/node-grapheme-cluster-break
?