Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: charwidth function #27

Merged
merged 1 commit into from
Mar 12, 2015
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -10,10 +10,13 @@
*.dSYM
*.out
data/*.txt
data/*.ttf
data/*.sfd
bench/bench
bench/icu
bench/unistring
normtest
graphemetest
utf8proc_data.c.new
printproperty
charwidth
17 changes: 8 additions & 9 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -4,15 +4,14 @@ compiler:
- clang
notifications:
email: false
before_install:
- sudo add-apt-repository ppa:staticfloat/julia-deps -y
- sudo add-apt-repository ppa:staticfloat/juliareleases -y
- sudo apt-get update -qq -y
- sudo apt-get install libpcre3-dev julia fontforge -y
script:
- make prefix=`pwd`/local install
- make check
- make utf8proc_data.c.new && (diff utf8proc_data.c.new utf8proc_data.c > /dev/null)
- mkdir build_static
- cd build_static
- cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON
- make
- mkdir ../build_shared
- cd ../build_shared
- cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_SHARED_LIBS=ON
- make
- make data && (diff data/utf8proc_data.c.new utf8proc_data.c > /dev/null)
- (mkdir build_static && cd build_static && cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON && make)
- (mkdir build_shared && cd build_shared && cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_SHARED_LIBS=ON && make)
47 changes: 18 additions & 29 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
# libutf8proc Makefile

# programs
CURL=curl
RUBY=ruby
PERL=perl
MAKE=make
AR=ar
INSTALL=install
@@ -37,36 +34,24 @@ includedir=$(prefix)/include

# meta targets

all: c-library
.PHONY: all, clean, update, data

c-library: libutf8proc.a libutf8proc.$(SHLIB_EXT)
all: libutf8proc.a libutf8proc.$(SHLIB_EXT)

clean:
rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest data/UnicodeData.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt data/NormalizationTest.txt data/GraphemeBreakTest.txt
rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest test/printproperty test/charwidth
$(MAKE) -C bench clean
$(MAKE) -C data clean

update: utf8proc_data.c.new
cp -f utf8proc_data.c.new utf8proc_data.c
data: data/utf8proc_data.c.new

# real targets

utf8proc_data.c.new: data/data_generator.rb data/UnicodeData.txt data/GraphemeBreakProperty.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt
(cd data; $(RUBY) data_generator.rb < UnicodeData.txt) > utf8proc_data.c.new

data/UnicodeData.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt

data/GraphemeBreakProperty.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
update: data/utf8proc_data.c.new
cp -f data/utf8proc_data.c.new utf8proc_data.c

data/DerivedCoreProperties.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt

data/CompositionExclusions.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
# real targets

data/CaseFolding.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.rb data/charwidths.jl
$(MAKE) -C data utf8proc_data.c.new

utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c
$(cc) -c -o utf8proc.o utf8proc.c
@@ -86,7 +71,7 @@ libutf8proc.$(MAJOR).dylib: utf8proc.o
$(cc) -dynamiclib -o $@ $^ -install_name $(libdir)/$@ -Wl,-compatibility_version -Wl,$(MAJOR) -Wl,-current_version -Wl,$(MAJOR).$(MINOR).$(PATCH)

libutf8proc.dylib: libutf8proc.$(MAJOR).dylib
ln -s libutf8proc.$(MAJOR).dylib $@
ln -f -s libutf8proc.$(MAJOR).dylib $@

install: libutf8proc.a libutf8proc.$(SHLIB_EXT) libutf8proc.$(SHLIB_VERS_EXT)
mkdir -m 755 -p $(includedir)
@@ -99,10 +84,10 @@ install: libutf8proc.a libutf8proc.$(SHLIB_EXT) libutf8proc.$(SHLIB_VERS_EXT)
# Test programs

data/NormalizationTest.txt:
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
$(MAKE) -C data NormalizationTest.txt

data/GraphemeBreakTest.txt:
$(CURL) http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
$(MAKE) -C data GraphemeBreakTest.txt

test/normtest: test/normtest.c utf8proc.o utf8proc.h test/tests.h
$(cc) test/normtest.c utf8proc.o -o $@
@@ -113,6 +98,10 @@ test/graphemetest: test/graphemetest.c utf8proc.o utf8proc.h test/tests.h
test/printproperty: test/printproperty.c utf8proc.o utf8proc.h test/tests.h
$(cc) test/printproperty.c utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt
test/charwidth: test/charwidth.c utf8proc.o utf8proc.h test/tests.h
$(cc) test/charwidth.c utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/charwidth
test/normtest data/NormalizationTest.txt
test/graphemetest data/GraphemeBreakTest.txt
test/charwidth
62 changes: 62 additions & 0 deletions data/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Unicode data generation rules.  Except for the test data files, most
# users will not use these Makefile rules, which are primarily to re-generate
# unicode_data.c when we get a new Unicode version or charwidth data; they
# require ruby, fontforge, and julia to be installed.

# programs
CURL=curl
RUBY=ruby
PERL=perl
MAKE=make
JULIA=julia
CURLFLAGS = --retry 5 --location

# use JuliaLang caching (https://github.com/staticfloat/cache.julialang.org)
# so that Travis builds do not depend on anyone's flaky servers but our own
URLCACHE=https://cache.e.ip.saba.us/

.PHONY: clean

# delete partially-written targets when a recipe fails, so that an
# interrupted download is not mistaken for an up-to-date data file
.DELETE_ON_ERROR:

utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt
	$(RUBY) data_generator.rb < UnicodeData.txt > $@

# GNU Unifont version for font-metric calculations:
UNIFONT_VERSION=7.0.06

unifont-$(UNIFONT_VERSION).ttf:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://unifoundry.com/pub/unifont-$(UNIFONT_VERSION)/font-builds/unifont-$(UNIFONT_VERSION).ttf

unifont_upper-$(UNIFONT_VERSION).ttf:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://unifoundry.com/pub/unifont-$(UNIFONT_VERSION)/font-builds/unifont_upper-$(UNIFONT_VERSION).ttf

CharWidths.txt: charwidths.jl unifont-$(UNIFONT_VERSION).ttf unifont_upper-$(UNIFONT_VERSION).ttf EastAsianWidth.txt
	UNIFONT_VERSION=$(UNIFONT_VERSION) $(JULIA) charwidths.jl > $@

# NOTE: "-o $@" already names the output file, so the redundant "-O"
# (save under the remote filename) was dropped from the download rules
# below; passing both makes curl warn about more output options than URLs.
UnicodeData.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UNIDATA/UnicodeData.txt

EastAsianWidth.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

GraphemeBreakProperty.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt

DerivedCoreProperties.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt

CompositionExclusions.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt

CaseFolding.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

NormalizationTest.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt

# the "÷" (break permitted) and "×" (no break) markers are rewritten to
# ASCII "/" and "+" for consumption by the grapheme test parser
GraphemeBreakTest.txt:
	$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@

clean:
	rm -f UnicodeData.txt EastAsianWidth.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
161 changes: 161 additions & 0 deletions data/charwidths.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# Following work by @jiahao, we compute character widths using a combination of
# * advance widths from GNU Unifont (advance width 512 = 1 en)
# * UAX 11: East Asian Width
# * a few exceptions as needed
# Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
#
# Requires Julia (obviously) and FontForge.

#############################################################################
# Julia 0.3/0.4 compatibility (taken from Compat package)
if VERSION < v"0.4.0-dev+1419"
const UInt16 = Uint16
end

CharWidths = Dict{Int,Int}()

#############################################################################
# Widths from GNU Unifont

universion=get(ENV, "UNIFONT_VERSION", "7.0.06")
for fontfile in ["unifont-$universion", "unifont_upper-$universion"]
isfile("$fontfile.ttf") || download("http://unifoundry.com/pub/unifont-$universion/font-builds/$fontfile.ttf", "$fontfile.ttf")
isfile("$fontfile.sfd") || run(`fontforge -lang=ff -c "Open(\"$fontfile.ttf\");Save(\"$fontfile.sfd\");Quit(0);"`)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that -lang=ff seems to be needed for recent versions of fontforge, since they changed the -c option to accept Python scripts by default rather than FontForge's own scripting language.

end

# Parse a FontForge .sfd file and record each glyph's advance width
# (in en units; Unifont uses 512 font units per en) into CharWidths,
# keyed by codepoint.  Returns the (updated) dictionary.
function parsesfd(filename::String, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
    mode = :lookforchar
    nline = 0
    for ln in readlines(open(filename))
        nline += 1
        if mode == :lookforchar          # e.g. "StartChar: nonmarkingreturn"
            if contains(ln, "StartChar: ")
                cp = nothing
                adv = nothing
                mode = :collect
            end
        elseif mode == :collect          # e.g. "Encoding: 65538 -1 2", "Width: 1024"
            if contains(ln, "Encoding:")
                cp = int(split(ln)[3])
            end
            if contains(ln, "Width:")
                adv = int(split(ln)[2])
            end
            # the sfd file contains some codepoint -1 entries (unencoded
            # glyphs) near the beginning; skip anything negative
            if cp != nothing && adv != nothing && cp >= 0
                CharWidths[cp] = div(adv, 512) # 512 units to the en
                mode = :lookforchar
            end
        end
    end
    CharWidths
end
CharWidths=parsesfd("unifont-$universion.sfd", CharWidths)
CharWidths=parsesfd("unifont_upper-$universion.sfd", CharWidths)

#############################################################################
# Widths from UAX #11: East Asian Width
# .. these take precedence over the Unifont width for all codepoints
# listed explicitly as wide/full/narrow/half-width

isfile("EastAsianWidth.txt") || download("http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt", "EastAsianWidth.txt")
for line in readlines(open("EastAsianWidth.txt"))
#Strip comments
line[1] == '#' && continue
precomment = split(line, '#')[1]
#Parse code point range and width code
tokens = split(precomment, ';')
length(tokens) >= 2 || continue
charrange = tokens[1]
width = strip(tokens[2])
#Parse code point range into Julia UnitRange
rangetokens = split(charrange, "..")
charstart = uint32("0x"*rangetokens[1])
charend = uint32("0x"*rangetokens[length(rangetokens)>1 ? 2 : 1])

#Assign widths
for c in charstart:charend
if width=="W" || width=="F" # wide or full
CharWidths[c]=2
elseif width=="Na"|| width=="H" # narrow or half
CharWidths[c]=1
end
end
end

#############################################################################
# A few exceptions to the above cases, found by manual comparison
# to other wcwidth functions and similar checks.

# Use ../libutf8proc for category codes, rather than the one in Julia,
# to minimize bootstrapping complexity when a new version of Unicode comes out.
# Category code of codepoint c, read from the freshly built ../libutf8proc
# rather than Julia's own tables (see comment above).
function catcode(c)
    # out-of-range codepoints must not be passed to utf8proc_get_property
    # (see its docs); report them as category 0
    if uint(c) > 0x10FFFF
        return 0x0000
    end
    unsafe_load(ccall((:utf8proc_get_property,"../libutf8proc"), Ptr{UInt16}, (Int32,), c))
end

# use Base.UTF8proc module to get category codes constants, since
# we aren't going to change these in utf8proc.
import Base.UTF8proc

# Override the font-derived widths for several categories that must be
# zero-width regardless of what glyph metrics Unifont happens to have.
for c in keys(CharWidths)
    cat = catcode(c)

    # make sure format control characters (category Cf) have width 0,
    # except for the Arabic characters 0x0601/0x0602/0x0603/0x06dd, which
    # are visibly printed (see unicode std 6.2, sec. 8.2)
    # (restored the "∉" membership operator, lost in transcription)
    if cat==UTF8proc.UTF8PROC_CATEGORY_CF && c ∉ [0x0601,0x0602,0x0603,0x06dd]
        CharWidths[c]=0
    end

    # Unifont has nonzero width for a number of non-spacing combining
    # characters, e.g. (in 7.0.06): f84,17b4,17b5,180b,180d,2d7f, and
    # the variation selectors
    if cat==UTF8proc.UTF8PROC_CATEGORY_MN
        CharWidths[c]=0
    end

    # We also assign width of zero to unassigned and private-use
    # codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
    # but since these are nonstandard it seems questionable to recognize them).
    if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
        CharWidths[c]=0
    end

    # for some reason, Unifont has width-2 glyphs for ASCII control chars
    if cat==UTF8proc.UTF8PROC_CATEGORY_CC
        CharWidths[c]=0
    end
end

#By definition, should have zero width (on the same line)
#0x002028 category: Zl name: LINE SEPARATOR
#0x002029 category: Zp name: PARAGRAPH SEPARATOR
CharWidths[0x2028]=0
CharWidths[0x2029]=0

#By definition, should be narrow = width of 1 en space
#0x00202f ' ' category: Zs name: NARROW NO-BREAK SPACE/
CharWidths[0x202f]=1

#By definition, should be wide = width of 1 em space
#0x002001 ' ' category: Zs name: EM QUAD/
#0x002003 ' ' category: Zs name: EM SPACE/
CharWidths[0x2001]=2
CharWidths[0x2003]=2

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jiahao, in a monospaced font, like in a terminal, aren't em and en the same?

#############################################################################
# Output (to a file or pipe) for processing by data_generator.rb
# ... don't bother to output zero widths since that will be the default.

# Emit the nonzero widths to stdout as a run-length encoding consumed by
# data_generator.rb: "XXXX..YYYY; w" for a range, or "XXXX; w" for a
# single codepoint.  A run is flushed whenever the width value changes,
# or at the 0x110000 sentinel just past the last valid codepoint.
firstc = 0x000000
lastv = 0
uhex(c) = uppercase(hex(c,4))
for c in 0x0000:0x110000
v = get(CharWidths, c, 0)
if v != lastv || c == 0x110000
# widths are stored in a 2-bit field (unsigned charwidth:2 in utf8proc.h)
v < 4 || error("invalid charwidth $v for $c")
if firstc+1 < c
println(uhex(firstc), "..", uhex(c-1), "; ", lastv)
else
println(uhex(firstc), "; ", lastv)
end
firstc = c
lastv = v
end
end
18 changes: 14 additions & 4 deletions data/data_generator.rb
Original file line number Diff line number Diff line change
@@ -85,14 +85,23 @@
end
end

# Load the character widths generated by charwidths.jl.  Each line of
# CharWidths.txt is either "XXXX..YYYY; w" (a codepoint range) or
# "XXXX; w" (a single codepoint); any codepoint not listed defaults to 0.
$charwidth_list = File.read("CharWidths.txt")
$charwidth = Hash.new(0)
$charwidth_list.each_line do |entry|
  if (m = /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([0-9]+)/.match(entry))
    (m[1].hex..m[2].hex).each { |cp| $charwidth[cp] = m[3].to_i }
  elsif (m = /^([0-9A-F]+)\s*;\s*([0-9]+)/.match(entry))
    $charwidth[m[1].hex] = m[2].to_i
  end
end

$exclusions = File.read("CompositionExclusions.txt")[/# \(1\) Script Specifics.*?# Total code points:/m]
$exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex }

$excl_version = File.read("CompositionExclusions.txt")[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m]
$excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }

$case_folding_string = File.open("CaseFolding.txt").read

$case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read
$case_folding = {}
$case_folding_string.chomp.split("\n").each do |line|
next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
@@ -172,7 +181,8 @@ def c_entry(comb1_indicies, comb2_indicies)
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
"#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
"#{$grapheme_boundclass[code]}},\n"
"#{$grapheme_boundclass[code]}, " <<
"#{$charwidth[code]}},\n"
end
end

@@ -295,7 +305,7 @@ def c_entry(comb1_indicies, comb2_indicies)
$stdout << "};\n\n"

$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER, 0},\n"
properties.each { |line|
$stdout << line
}
61 changes: 61 additions & 0 deletions test/charwidth.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#include "tests.h"
#include <ctype.h>
#include <wchar.h>

/* Treat c as printable when its Unicode general category lies between
   Lu and Zs (letters/marks/numbers/punctuation/symbols/spaces), or when
   it is one of the four Arabic format characters that, exceptionally,
   occupy visible width. */
int my_isprint(int c) {
    if (c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd)
        return 1;
    int category = utf8proc_get_property(c)->category;
    return category >= UTF8PROC_CATEGORY_LU && category <= UTF8PROC_CATEGORY_ZS;
}

/* Sanity-check utf8proc_charwidth over the whole codepoint range
   (including the out-of-range sentinel 0x110000), then report
   divergences from the system wcwidth for information.
   Fixes vs. original: removed unused local `prevc`; the 16-bit
   wchar_t guard now runs BEFORE wcwidth() is called, since passing a
   codepoint that does not fit in wchar_t would silently truncate. */
int main(int argc, char **argv)
{
    int c, error = 0;

    (void) argc; /* unused */
    (void) argv; /* unused */

    /* some simple sanity tests of the character widths */
    for (c = 0; c <= 0x110000; ++c) {
        int cat = utf8proc_get_property(c)->category;
        int w = utf8proc_charwidth(c);
        /* nonspacing (Mn) and enclosing (Me) combining marks must be
           zero-width */
        if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) &&
            w > 0) {
            fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
            error = 1;
        }
        /* in the ASCII range we must agree exactly with the system's
           isprint/wcwidth */
        if (c <= 127 && ((!isprint(c) && w > 0) ||
                         (isprint(c) && wcwidth(c) != w))) {
            fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
                    wcwidth(c), w,
                    isprint(c) ? "printable" : "non-printable", c);
            error = 1;
        }
        if (!my_isprint(c) && w > 0) {
            fprintf(stderr, "non-printing %x had width %d\n", c, w);
            error = 1;
        }
    }
    check(!error, "utf8proc_charwidth FAILED tests.");

    /* print some other information by comparing with system wcwidth */
    printf("Mismatches with system wcwidth (not necessarily errors):\n");
    for (c = 0; c <= 0x110000; ++c) {
        int w, wc;
        /* skip codepoints that don't fit in a 16-bit wchar_t: calling
           wcwidth on them would operate on a truncated value */
        if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
        w = utf8proc_charwidth(c);
        wc = wcwidth(c);
#if 0
        /* lots of these errors for out-of-date system unicode tables */
        if (wc == -1 && my_isprint(c) && w > 0)
            printf("  wcwidth(%x) = -1 for printable char\n", c);
#endif
        if (wc == -1 && !my_isprint(c) && w > 0)
            printf("  wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
        if (wc >= 0 && wc != w)
            printf("  wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
    }

    printf("Character-width tests SUCCEEDED.\n");

    return 0;
}
10 changes: 6 additions & 4 deletions test/printproperty.c
Original file line number Diff line number Diff line change
@@ -11,7 +11,7 @@ int main(int argc, char **argv)
check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
const utf8proc_property_t *p = utf8proc_get_property(c);
printf("U+%s:\n"
" category = %d\n"
" category = %s\n"
" combining_class = %d\n"
" bidi_class = %d\n"
" decomp_type = %d\n"
@@ -24,9 +24,10 @@ int main(int argc, char **argv)
" comp_exclusion = %d\n"
" ignorable = %d\n"
" control_boundary = %d\n"
" boundclass = %d\n",
" boundclass = %d\n"
" charwidth = %d\n",
argv[i],
p->category,
utf8proc_category_string(c),
p->combining_class,
p->bidi_class,
p->decomp_type,
@@ -39,7 +40,8 @@ int main(int argc, char **argv)
p->comp_exclusion,
p->ignorable,
p->control_boundary,
p->boundclass);
p->boundclass,
utf8proc_charwidth(c));
}
return 0;
}
15 changes: 15 additions & 0 deletions utf8proc.c
Original file line number Diff line number Diff line change
@@ -223,6 +223,21 @@ DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2) {
utf8proc_get_property(c2)->boundclass);
}

/* Return a character width analogous to wcwidth (except portable and
   hopefully less buggy than most system wcwidth functions).
   Unlike wcwidth, non-printable codepoints yield 0 rather than -1; the
   width is read from the 2-bit charwidth field of the property table. */
DLLEXPORT int utf8proc_charwidth(int32_t c) {
return utf8proc_get_property(c)->charwidth;
}

/* Return the Unicode general category of codepoint c, as one of the
   UTF8PROC_CATEGORY_* constants (UTF8PROC_CATEGORY_CN == 0). */
DLLEXPORT int utf8proc_category(int32_t c) {
return utf8proc_get_property(c)->category;
}

/* Return the conventional two-letter abbreviation (e.g. "Lu" or "Co")
   for the Unicode category of codepoint c.
   NOTE(review): assumes utf8proc_category always yields an index in
   [0,29] matching this table's order -- the values emitted by
   data_generator.rb; confirm if the category encoding ever changes. */
DLLEXPORT const char *utf8proc_category_string(int32_t c) {
static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
return s[utf8proc_category(c)];
}

#define utf8proc_decompose_lump(replacement_uc) \
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
options & ~UTF8PROC_LUMP, last_boundclass)
16 changes: 16 additions & 0 deletions utf8proc.h
Original file line number Diff line number Diff line change
@@ -181,6 +181,7 @@ typedef struct utf8proc_property_struct {
unsigned ignorable:1;
unsigned control_boundary:1;
unsigned boundclass:4;
unsigned charwidth:2;
} utf8proc_property_t;

#define UTF8PROC_CATEGORY_CN 0
@@ -388,6 +389,21 @@ DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2);
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
*/

DLLEXPORT int utf8proc_charwidth(int32_t c);
/* Given a codepoint c, return a character width analogous to wcwidth(c),
except that a width of 0 is returned for non-printable characters
instead of -1 as in wcwidth.
If you want to check for particular types of non-printable characters,
(analogous to isprint or iscntrl), use utf8proc_category(c). */

DLLEXPORT int utf8proc_category(int32_t c);
/* Return the Unicode character category for c (one of the
UTF8PROC_CATEGORY_* constants.) */

DLLEXPORT const char *utf8proc_category_string(int32_t c);
/* Return the two-letter Unicode category string for c (e.g. "Lu" or "Co"). */

DLLEXPORT ssize_t utf8proc_map(
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
);
20,267 changes: 10,285 additions & 9,982 deletions utf8proc_data.c

Large diffs are not rendered by default.