Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Efficient trie lookup for boolean Unicode properties #33098

Merged
merged 3 commits into from
May 23, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 134 additions & 4 deletions src/etc/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@

import fileinput, re, os, sys, operator

bytes_old = 0
bytes_new = 0

preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
Expand Down Expand Up @@ -307,12 +310,137 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
format_table_content(f, data, 8)
f.write("\n ];\n\n")

def emit_trie_lookup_range_table(f):
f.write("""

// BoolTrie is a trie for representing a set of Unicode codepoints. It is
// implemented with postfix compression (sharing of identical child nodes),
// which gives both compact size and fast lookup.
//
// The space of Unicode codepoints is divided into 3 subareas, each
// represented by a trie with different depth. In the first (0..0x800), there
// is no trie structure at all; each u64 entry corresponds to a bitvector
// effectively holding 64 bool values.
//
// In the second (0x800..0x10000), each child of the root node represents a
// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
// the trie stores an 8-bit index into a shared table of leaf values. This
// exploits the fact that in reasonable sets, many such leaves can be shared.
//
// In the third (0x10000..0x110000), each child of the root node represents a
// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
// of a child tree. Each of these 64 bytes represents an index into the table
// of shared 64-bit leaf values. This exploits the sparse structure in the
// non-BMP range of most Unicode sets.
pub struct BoolTrie {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It'd be great to have an high-level overview of what all these fields represent/how it actually represents the data.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, with some comments I think this is good to go though

// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
r1: [u64; 32], // leaves

// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
r2: [u8; 992], // first level
r3: &'static [u64], // leaves

// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
r4: [u8; 256], // first level
r5: &'static [u8], // second level
r6: &'static [u64], // leaves
}

fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
((bitmap_chunk >> (c & 63)) & 1) != 0
}

fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
let c = c as usize;
if c < 0x800 {
trie_range_leaf(c, r.r1[c >> 6])
} else if c < 0x10000 {
let child = r.r2[(c >> 6) - 0x20];
trie_range_leaf(c, r.r3[child as usize])
} else {
let child = r.r4[(c >> 12) - 0x10];
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
trie_range_leaf(c, r.r6[leaf as usize])
}
}\n
""")

def compute_trie(rawdata, chunksize):
root = []
childmap = {}
child_data = []
for i in range(len(rawdata) / chunksize):
data = rawdata[i * chunksize: (i + 1) * chunksize]
child = '|'.join(map(str, data))
if child not in childmap:
childmap[child] = len(childmap)
child_data.extend(data)
root.append(childmap[child])
return (root, child_data)

def emit_bool_trie(f, name, t_data, is_pub=True):
global bytes_old, bytes_new
bytes_old += 8 * len(t_data)
CHUNK = 64
rawdata = [False] * 0x110000;
for (lo, hi) in t_data:
for cp in range(lo, hi + 1):
rawdata[cp] = True

# convert to bitmap chunks of 64 bits each
chunks = []
for i in range(0x110000 / CHUNK):
chunk = 0
for j in range(64):
if rawdata[i * 64 + j]:
chunk |= 1 << j
chunks.append(chunk)

pub_string = ""
if is_pub:
pub_string = "pub "
f.write(" %sconst %s: &'static super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
f.write(" r1: [\n")
data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 / CHUNK])
format_table_content(f, data, 12)
f.write("\n ],\n")

# 0x800..0x10000 trie
(r2, r3) = compute_trie(chunks[0x800 / CHUNK : 0x10000 / CHUNK], 64 / CHUNK)
f.write(" r2: [\n")
data = ','.join(str(node) for node in r2)
format_table_content(f, data, 12)
f.write("\n ],\n")
f.write(" r3: &[\n")
data = ','.join('0x%016x' % chunk for chunk in r3)
format_table_content(f, data, 12)
f.write("\n ],\n")

# 0x10000..0x110000 trie
(mid, r6) = compute_trie(chunks[0x10000 / CHUNK : 0x110000 / CHUNK], 64 / CHUNK)
(r4, r5) = compute_trie(mid, 64)
f.write(" r4: [\n")
data = ','.join(str(node) for node in r4)
format_table_content(f, data, 12)
f.write("\n ],\n")
f.write(" r5: &[\n")
data = ','.join(str(node) for node in r5)
format_table_content(f, data, 12)
f.write("\n ],\n")
f.write(" r6: &[\n")
data = ','.join('0x%016x' % chunk for chunk in r6)
format_table_content(f, data, 12)
f.write("\n ],\n")

f.write(" };\n\n")
bytes_new += 256 + 992 + 256 + 8 * len(r3) + len(r5) + 8 * len(r6)

def emit_property_module(f, mod, tbl, emit):
f.write("pub mod %s {\n" % mod)
for cat in sorted(emit):
emit_table(f, "%s_table" % cat, tbl[cat])
emit_bool_trie(f, "%s_table" % cat, tbl[cat])
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" super::trie_lookup_range_table(c, %s_table)\n" % cat)
f.write(" }\n\n")
f.write("}\n\n")

Expand Down Expand Up @@ -402,8 +530,9 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
norm_props = load_properties("DerivedNormalizationProps.txt",
["Full_Composition_Exclusion"])

# bsearch_range_table is used in all the property modules below
emit_bsearch_range_table(rf)
# trie_lookup_table is used in all the property modules below
emit_trie_lookup_range_table(rf)
# emit_bsearch_range_table(rf)

# category tables
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
Expand All @@ -414,3 +543,4 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
# normalizations and conversions module
emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
emit_conversions_module(rf, to_upper, to_lower, to_title)
#print 'bytes before = %d, bytes after = %d' % (bytes_old, bytes_new)
Loading