xtermjs · jerch · May 30, 2018 · May 30, 2018 · Jun 3, 2018 · Jun 3, 2018
diff --git a/bin/create-graphemedata.js b/bin/create-graphemedata.js
@@ -0,0 +1,124 @@
+#!/usr/bin/env node
+'use strict';
+
+const URL = 'https://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakProperty.txt';
+const PATH = __dirname + '/../src/GraphemeData.ts'; // output file, overwritten by createGraphemeDataFile
+// matches lines like "0600..0605 ; Prepend": group 1 = first codepoint (hex), 2 = optional last codepoint, 3 = property name
+const GRAPHEME_REX = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm;
+// numeric code per property name; createPackedBMP stores a code in 4 bits, which only holds 0..15
+const TYPES  = {
+    Other: 0,
+    L: 1,
+    V: 2,
+    T: 3,
+    LV: 4,
+    LVT: 5,
+    CR: 6,
+    LF: 7,
+    ZWJ: 8,
+    Prepend: 9,
+    Control: 10,
+    Extend: 11,
+    SpacingMark: 12,
+    E_Base: 13,
+    Glue_After_Zwj: 14,
+    E_Modifier: 15,
+    E_Base_GAZ: 16,
+    Regional_Indicator: 17
+};
+
+// Parses the unicode data text into a map of codepoint --> property name.
+function parseDefinitions(data) {
+    const codepoints = Object.create(null);
+    for (let m = GRAPHEME_REX.exec(data); m; m = GRAPHEME_REX.exec(data)) {
+        const first = parseInt(m[1], 16);
+        // entries without a "..END" part parse to NaN and cover a single codepoint
+        const last = parseInt(m[2], 16) || first;
+        for (let cp = first; cp <= last; ++cp) codepoints[cp] = m[3];
+    }
+    return codepoints;
+}
+
+
+/**
+ * Packs the types of the codepoints in [start, end) into a base64 string.
+ *
+ * Layout of the encoded bytes: the lengths of consecutive runs of equal type
+ * (one byte each, 1..255), a 0 byte as terminator, then the type of every run
+ * with 4 bits each (first run in the high nibble, padded with 'Other').
+ * Codepoints missing in `codepoints` count as 'Other'.
+ * Throws if a type cannot be stored in 4 bits.
+ */
+function createPackedBMP(codepoints, start, end) {
+    const lengths = [];
+    const types = [];
+    let type = -1;
+    let count = 0;
+    // Stores the pending run. Empty runs are skipped, a 0 length would be
+    // taken for the terminator and break the length/type pairing.
+    const flush = () => {
+        if (!count) return;
+        lengths.push(count);
+        types.push(type);
+        count = 0;
+    };
+    for (let i = start; i < end; ++i) {
+        const t = TYPES[codepoints[i] || 'Other'];
+        // unknown property names and codes above 15 do not fit into 4 bits
+        if (!(t >= 0 && t <= 15))
+            throw new Error('cannot pack type "' + codepoints[i] + '" of codepoint ' + i);
+        if (t !== type) {
+            flush();
+            type = t;
+        }
+        // a length has to fit into one byte, longer runs get split
+        if (++count === 255) flush();
+    }
+    flush();
+
+    // two types per byte, an odd number of runs gets padded with 'Other' (0)
+    if (types.length & 1) types.push(0);
+    const finalTypes = [];
+    for (let i = 0; i < types.length; i += 2) finalTypes.push(types[i] << 4 | types[i + 1]);
+
+    // null terminate length values
+    lengths.push(0);
+    return Buffer.from(lengths.concat(finalTypes)).toString('base64');
+}
+
+
+// Downloads the grapheme break properties from `url` and writes the packed BMP
+// lookup tables as a TypeScript module to `path`; sets a failing exit code on errors.
+function createGraphemeDataFile(url, path) {
+    const fail = (reason) => {
+        console.log('error', reason);
+        process.exitCode = 1;
+    };
+    require('https').get(url, (resp) => {
+        if (resp.statusCode !== 200) {
+            // never turn an error page into an all 'Other' data file
+            resp.resume();
+            return fail('unexpected HTTP status ' + resp.statusCode);
+        }
+        let data = '';
+        resp.setEncoding('utf8');
+        resp.on('data', (chunk) => { data += chunk; });
+        resp.on('end', () => {
+            const codepoints = parseDefinitions(data);
+            // only two BMP ranges get packed: 12443 <= codepoint < 42606 has no table
+            // and codepoint >= 65536 is not implemented yet, thus THIRD stays empty
+            const first = createPackedBMP(codepoints, 0, 12443);
+            const second = createPackedBMP(codepoints, 42606, 65536);
+            const third = ''; // TODO: pack codepoint >= 65536
+            const final = `// FIRST: 0 <= codepoint < 12443\n`
+                + `export const FIRST: string = '${first}';\n`
+                + `// SECOND: 42606 <= codepoint < 65536\n`
+                + `export const SECOND: string = '${second}';\n`
+                + `// THIRD: codepoint >= 65536\n`
+                + `export const THIRD: string = '${third}';\n`;
+            require('fs').writeFileSync(path, final);
+        });
+    }).on('error', (err) => fail(err.message));
+}
+
+createGraphemeDataFile(URL, PATH); // script entry point: regenerate src/GraphemeData.ts from the unicode data at URL
diff --git a/package.json b/package.json
@@ -55,6 +55,7 @@
     "prepublish": "npm run build",
     "coveralls": "gulp coveralls",
     "webpack": "gulp webpack",
-    "watch": "gulp watch"
+    "watch": "gulp watch",
+    "graphemedata": "node bin/create-graphemedata"
   }
 }
diff --git a/src/Grapheme.test.ts b/src/Grapheme.test.ts
@@ -0,0 +1,104 @@
+/**
+ * Copyright (c) 2018 The xterm.js authors. All rights reserved.
+ * @license MIT
+ */
+
+import { FIRST, SECOND } from './GraphemeData';
+import { loadFromPackedBMP, graphemeType } from './Grapheme';
+import * as chai from 'chai';
+
+const TYPES  = { // same names and values as TYPES in bin/create-graphemedata.js
+  Other: 0,
+  L: 1,
+  V: 2,
+  T: 3,
+  LV: 4,
+  LVT: 5,
+  CR: 6,
+  LF: 7,
+  ZWJ: 8,
+  Prepend: 9,
+  Control: 10,
+  Extend: 11,
+  SpacingMark: 12,
+  E_Base: 13,
+  Glue_After_Zwj: 14,
+  E_Modifier: 15,
+  E_Base_GAZ: 16,
+  Regional_Indicator: 17
+};
+// unicode data the tests compare against, and the pattern for its "XXXX[..YYYY] ; Name" lines
+const URL = 'https://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakProperty.txt';
+const GRAPHEME_REX = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm;
+// codepoint --> property name, assigned by loadUnicodeData; null until the download has finished
+let CODEPOINTS = null;
+
+/** Maps every codepoint listed in the unicode data text to its property name, e.g. 'Extend'. */
+function parseDefinitions(data: string): {[key: number]: string} {
+  const codepoints: {[key: number]: string} = Object.create(null);
+  for (let m = GRAPHEME_REX.exec(data); m; m = GRAPHEME_REX.exec(data)) {
+    const first = parseInt(m[1], 16);
+    const last = parseInt(m[2], 16) || first; // no "..END" part: NaN, single codepoint
+    for (let cp = first; cp <= last; ++cp) codepoints[cp] = m[3];
+  }
+  return codepoints;
+}
+
+/** Loads the unicode data into CODEPOINTS; failures are passed to `done` as Error (a throw inside the event handlers would be uncaught). */
+function loadUnicodeData(done: Function): void {
+  const fail = (reason: string): void => done(new Error('error fetching unicode data: ' + reason));
+  require('https').get(URL, (resp): any => {
+    // an error page must not be parsed as (empty) unicode data
+    if (resp.statusCode !== 200) { resp.resume(); return fail('HTTP status ' + resp.statusCode); }
+    let data = '';
+    resp.on('data', (chunk): void => { data += chunk; });
+    resp.on('end', () => {
+      CODEPOINTS = parseDefinitions(data);
+      done();
+    });
+  }).on('error', (err) => fail(err.message));
+}
+
+// Verifies the generated GraphemeData tables and graphemeType against the
+// downloaded unicode data. Every test returns early while CODEPOINTS is unset.
+describe('grapheme cluster', function (): void {
+  // type listed by the unicode data, unlisted codepoints are Other (0)
+  const expectedType = (cp: number): number => TYPES[CODEPOINTS[cp]] || 0;
+
+  // compares every type of a packed table covering [start, end) with the unicode data
+  const checkTable = (data: string, start: number, end: number): void => {
+    const table = loadFromPackedBMP(data, start, end);
+    for (let cp = start; cp < end; ++cp) {
+      const idx = cp - start;
+      const stored = (idx & 1) ? table[idx >> 1] >> 4 : table[idx >> 1] & 15;
+      chai.expect(expectedType(cp)).equals(stored);
+    }
+  };
+
+  before(function(done: Function): void {
+    this.timeout(5000);
+    loadUnicodeData(done);
+  });
+  describe('correct GraphemeData', function(): void {
+    it('FIRST', function(): void {
+      if (CODEPOINTS) checkTable(FIRST, 0, 12443);
+    });
+    it('SECOND', function(): void {
+      if (CODEPOINTS) checkTable(SECOND, 42606, 65536);
+    });
+    it('THIRD', function(): void {
+      if (!CODEPOINTS) return;
+      // TODO
+    });
+  });
+  describe('graphemeType', function(): void {
+    it('BMP', function(): void {
+      if (!CODEPOINTS) return;
+      for (let cp = 0; cp < 65536; ++cp) chai.expect(graphemeType(cp)).equals(expectedType(cp));
+    });
+    it('HIGH', function(): void {
+      if (!CODEPOINTS) return;
+      // TODO
+    });
+  });
+});
diff --git a/src/Grapheme.ts b/src/Grapheme.ts
@@ -0,0 +1,88 @@
+/**
+ * Copyright (c) 2018 The xterm.js authors. All rights reserved.
+ * @license MIT
+ */
+import { FIRST, SECOND } from './GraphemeData';
+
+/**
+ * Decodes a packed type table as created by bin/create-graphemedata.js.
+ * `data` holds, base64 encoded, the run lengths (one byte each), a 0 byte as
+ * terminator and the type of every run (4 bits each, first run in the high nibble).
+ * The returned table covers the codepoints [start, end) with two types per entry:
+ * an even offset (codepoint - start) sits in the low nibble, an odd one in the high nibble.
+ */
+export function loadFromPackedBMP(data: string, start: number, end: number): number[] | Uint8Array {
+  // decode base64 into a binary string with one char per byte
+  const raw = (typeof atob === 'undefined')
+    // nodejs - Buffer.from replaces the deprecated `new Buffer(string, encoding)`
+    ? Buffer.from(data, 'base64').toString('binary')
+    // browser - FIXME: how to test this?
+    : atob(data);
+  // first occurrence of 0x0 marks end of lengths (null terminated), the types follow
+  const terminator = raw.indexOf('\x00');
+
+  // lookup table with 2 type entries per index position
+  const size = ((end - start) >> 1) + 1;
+  const table = (typeof Uint8Array === 'undefined') ? new Array(size) : new Uint8Array(size);
+  // expand every run into the table, `offset` is the position relative to `start`
+  let offset = 0;
+  for (let run = 0; run < terminator; ++run) {
+    const packed = raw.charCodeAt(terminator + 1 + (run >> 1));
+    const type = (run & 1) ? packed & 15 : packed >> 4;
+    for (const stop = offset + raw.charCodeAt(run); offset < stop; ++offset) table[offset >> 1] |= (offset & 1) ? type << 4 : type;
+  }
+  return table;
+}
+
+
+// Grapheme break property types as returned by graphemeType. NOTE: Types must be identical to bin/create-graphemedata.js#TYPES
+const enum Types {
+  OTHER = 0,
+  L = 1,
+  V = 2,
+  T = 3,
+  LV = 4,
+  LVT = 5,
+  CR = 6,
+  LF = 7,
+  ZWJ = 8,
+  PREPEND = 9,
+  CONTROL = 10,
+  EXTEND = 11,
+  SPACINGMARK = 12,
+  E_BASE = 13,
+  GLUE_AFTER_ZWJ = 14,
+  E_MODIFIER = 15, // highest value that fits the 4 bit slots read by graphemeType
+  E_BASE_GAZ = 16, // NOTE(review): needs more than 4 bits, cannot come from the nibble packed BMP tables
+  REGIONAL_INDICATOR = 17 // NOTE(review): needs more than 4 bits as well
+}
+
+/**
+ * Grapheme break property of a codepoint. The packed BMP tables get decoded
+ * lazily on first access, codepoints >= 65536 are not supported yet (always OTHER).
+ */
+export const graphemeType = (function(): (codepoint: number) => Types {
+  let lowTable = null;
+  let highTable = null;
+  // every table entry carries two types: even index in the low nibble, odd index in the high nibble
+  const pick = (table: number[] | Uint8Array, idx: number): Types => {
+    const packed = table[idx >> 1];
+    return (idx & 1) ? packed >> 4 : packed & 15;
+  };
+  return (codepoint: number): Types => {
+    // printable ASCII needs no table lookup
+    if (codepoint > 31 && codepoint < 127) return Types.OTHER;
+    if (codepoint < 12443) {
+      if (!lowTable) lowTable = loadFromPackedBMP(FIRST, 0, 12443);
+      return pick(lowTable, codepoint);
+    }
+    // no table for 12443 <= codepoint < 42606, always OTHER
+    if (codepoint < 42606) return Types.OTHER;
+    if (codepoint < 65536) {
+      if (!highTable) highTable = loadFromPackedBMP(SECOND, 42606, 65536);
+      return pick(highTable, codepoint - 42606);
+    }
+    // TODO: codepoint >= 65536
+    return Types.OTHER;
+  };
+})();
diff --git a/src/GraphemeData.ts b/src/GraphemeData.ts
@@ -0,0 +1,6 @@
+// FIRST: 0 <= codepoint < 12443
+export const FIRST: string = 'CgECARJfIQ0B//9UcP8UB/8ILQEBAQIBAgEBOAYKCwEBLhUQAWUHAQEGAgIBBCEBAQEeG1sLOgkiBAEJAQMBBSsDeA4BIAE2AQEBAQMIBAECAQcKAh0BAjgBAQECBAICAgIBCQEKAh0CATgBAQMCBAICAwMBHgIDAQsCATgBAQMFAQIBAQIBFAIWBgEBAjgBAQIBBAICAgIBCAIKAh4BOwEBAQIDAwEDAQkBKAEDOgMEAQMBBAcCCwIdAQI4AQEBAQIBAgEBAgECAgcCCwIcAgI3AgEBAgQBAwEDAQEIAQoCHgJGAQQBAgMBAQEHARICPQEBAQcMCGIBAQEGAQILBkoCGwEBAQEBBAIxDgEFAQIFCwEkCQFmBAEGAQICAhcCAgQDEAQNAQEBAgYBDwFiYEhY/14D////tQMdAx0CHgJAAgEHCAECCwkBLQMBdgIiAXYDBAIDBAIBBgPbAgIBOQEBAQcBAQEBAggGCgIBMA9BBAEvAQEFAQEFAQImCQwCAR4BBAICAQM4AQECAwEBAwIwCAgCApgDAQ0BBwQBBAIBAgECxjoBBf//DQEBAQIYBzEQYCH//////zEBIgEBAVICYgEOAQEEVgH//////48DjQFgIP//LAZpAgCnpqCgoACwCwCwsLCwsJCwoLCwuQsLCwkLCwsLCwsLCwsLm8C8sMvLwLCwvAsLywwMsLCwvAsMsLCwsLCwvAsMsLwMsLCwvAsLywwMsLCwsLy8DAywsLwLwLCwsLC8Cwy8vAvAywsLC8CwvLDAy5CwsMCwvLCwywwLDLCwsMsLCwsLCwsMC8sLCwsLC8sLywywsLCwywsLASMAsAALCwsLC8vLywsLoLCwvLwMvLC8sMvLCwsLywsLC8C8vLy8CwvAy8vLC8vLy8DLywsLywsMsMsLCwAKuKCgoLAAAA0ODg4NDg0OAAAAsLCwALCw';
+// SECOND: 42606 <= codepoint < 65536
+export const SECOND: string = 'AQQBCiACUAL/EQEDAQQBFwICAVgCMhACGhI0CBkLAgwdAwMBLwECBAIBBCQBQwYCAgICDAEIAQEuATMBAQMCAgUCAQEpAQICBQEB7AIBAgECAQEBEgEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwEbARsBGwwXBDEE//////////8I////////////////////////////////////OQH//+MQEBDPAZ4CUAwEAAsLCwsAsLCwy8DAywsLC8AQvAvLy8CwvLywsLwLCwsLCwsMvAywy8vAywRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFRUVFAgMKqqqqqgAAAAAAAAAAAAAAAAAAsACwsKCwoA';
+// THIRD: codepoint >= 65536
+export const THIRD: string = '';