Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RFC: export utf8proc Unicode transformation functionality in Julia #5576

Merged
merged 4 commits into from
Feb 3, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions base/char.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# Convert `x` to a `Char` via `convert`.
char(x) = convert(Char, x)
# Floating-point input is rounded to an integer with `iround` before conversion.
char(x::FloatingPoint) = char(iround(x))

# False when `c` lies in the range '\ud800'..'\udfff' or is greater than '\U10ffff'.
is_valid_char(c) = !('\ud800' <= c <= '\udfff' || '\U10ffff' < c)

# Numeric conversions of a `Char`, delegating to `int` / `uint`.
integer(x::Char) = int(x)
unsigned(x::Char) = uint(x)

Expand Down
2 changes: 2 additions & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,7 @@ export
hex2bytes,
ind2chr,
info,
is_assigned_char,
is_valid_ascii,
is_valid_char,
is_valid_utf8,
Expand Down Expand Up @@ -793,6 +794,7 @@ export
matchall,
ndigits,
nextind,
normalize_string,
oct,
parsefloat,
parseint,
Expand Down
2 changes: 2 additions & 0 deletions base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ include("utf8.jl")
include("utf16.jl")
include("iobuffer.jl")
include("string.jl")
include("utf8proc.jl")
importall .UTF8proc
include("regex.jl")
include("base64.jl")
importall .Base64
Expand Down
89 changes: 89 additions & 0 deletions base/utf8proc.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Various Unicode functionality from the utf8proc library
module UTF8proc

import Base: show, showcompact, ==, string, symbol, isless, hash

# also exported by Base:
export normalize_string, is_valid_char, is_assigned_char

# Ask utf8proc (utf8proc_codepoint_valid) whether `c` is a valid Unicode
# code point; the C result is converted to a Bool.
function is_valid_char(c)
    valid = ccall(:utf8proc_codepoint_valid, Cchar, (Int32,), c)
    return bool(valid)
end

# Option bits passed to utf8proc_map; every flag occupies its own bit.
const UTF8PROC_NULLTERM  = 1 << 0
const UTF8PROC_STABLE    = 1 << 1
const UTF8PROC_COMPAT    = 1 << 2
const UTF8PROC_COMPOSE   = 1 << 3
const UTF8PROC_DECOMPOSE = 1 << 4
const UTF8PROC_IGNORE    = 1 << 5
const UTF8PROC_REJECTNA  = 1 << 6
const UTF8PROC_NLF2LS    = 1 << 7
const UTF8PROC_NLF2PS    = 1 << 8
# newline -> LF is requested by setting both newline-conversion bits
const UTF8PROC_NLF2LF    = UTF8PROC_NLF2LS | UTF8PROC_NLF2PS
const UTF8PROC_STRIPCC   = 1 << 9
const UTF8PROC_CASEFOLD  = 1 << 10
const UTF8PROC_CHARBOUND = 1 << 11
const UTF8PROC_LUMP      = 1 << 12
const UTF8PROC_STRIPMARK = 1 << 13

# utf8proc_map(s, flags): apply the utf8proc transformation selected by the
# UTF8PROC_* option bits in `flags` to the bytes of `s` and return the result
# as a new ByteString.  Raises an error carrying the utf8proc error message
# when the library reports a negative result code.
let
    # one-element out-parameter through which utf8proc returns the pointer
    # to its result buffer
    const p = Array(Ptr{Uint8}, 1)
    global utf8proc_map
    function utf8proc_map(s::String, flags::Integer)
        b = bytestring(s)
        # Pass the explicit byte length instead of relying on
        # UTF8PROC_NULLTERM: with NUL-termination the input would be cut
        # off at the first embedded '\0', silently truncating the string.
        # The NULLTERM bit is cleared so the length is always honored.
        result = ccall(:utf8proc_map, Cssize_t,
                       (Ptr{Uint8}, Cssize_t, Ptr{Ptr{Uint8}}, Cint),
                       b, length(b.data), p, flags & ~UTF8PROC_NULLTERM)
        # a negative result is an error code; translate it to its message
        result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{Uint8},
                                             (Cssize_t,), result)))
        # wrap the `result` output bytes as a Julia byte vector (the final
        # `true` is the ownership flag of jl_ptr_to_array_1d) and turn
        # that vector into a string
        a = ccall(:jl_ptr_to_array_1d, Vector{Uint8},
                  (Any, Ptr{Uint8}, Csize_t, Cint),
                  Vector{Uint8}, p[1], result, true)
        ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
    end
end

# normalize_string(s; keywords...): translate the boolean keyword options
# into UTF8PROC_* option bits and apply them to `s` through utf8proc_map.
# Throws an ArgumentError when `compat` or `stripmark` is requested with
# neither `compose` nor `decompose`, or when more than one newline
# conversion is requested.
function normalize_string(s::String; stable::Bool=false, compat::Bool=false,
                          compose::Bool=true, decompose::Bool=false,
                          stripignore::Bool=false, rejectna::Bool=false,
                          newline2ls::Bool=false, newline2ps::Bool=false,
                          newline2lf::Bool=false, stripcc::Bool=false,
                          casefold::Bool=false, lump::Bool=false,
                          stripmark::Bool=false)
    # decomposition wins over composition when both are requested
    if decompose
        options = UTF8PROC_DECOMPOSE
    elseif compose
        options = UTF8PROC_COMPOSE
    elseif compat || stripmark
        throw(ArgumentError("compat=true or stripmark=true require compose=true or decompose=true"))
    else
        options = 0
    end
    if newline2ls + newline2ps + newline2lf > 1
        throw(ArgumentError("only one newline conversion may be specified"))
    end
    # OR in the bit of every remaining option that was switched on
    for (enabled, bit) in ((stable,      UTF8PROC_STABLE),
                           (compat,      UTF8PROC_COMPAT),
                           (stripignore, UTF8PROC_IGNORE),
                           (rejectna,    UTF8PROC_REJECTNA),
                           (newline2ls,  UTF8PROC_NLF2LS),
                           (newline2ps,  UTF8PROC_NLF2PS),
                           (newline2lf,  UTF8PROC_NLF2LF),
                           (stripcc,     UTF8PROC_STRIPCC),
                           (casefold,    UTF8PROC_CASEFOLD),
                           (lump,        UTF8PROC_LUMP),
                           (stripmark,   UTF8PROC_STRIPMARK))
        enabled && (options |= bit)
    end
    return utf8proc_map(s, options)
end

# normalize_string(s, nf): normalize `s` to the normal form named by `nf`
# (:NFC, :NFD, :NFKC or :NFKD).  Every form sets the STABLE bit; the K
# forms additionally set COMPAT.  Any other symbol raises an ArgumentError.
function normalize_string(s::String, nf::Symbol)
    if nf == :NFC
        options = UTF8PROC_STABLE | UTF8PROC_COMPOSE
    elseif nf == :NFD
        options = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
    elseif nf == :NFKC
        options = UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT
    elseif nf == :NFKD
        options = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT
    else
        throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))
    end
    return utf8proc_map(s, options)
end

# returns UTF8PROC_CATEGORY code in 0..30 giving Unicode category;
# returns 0 for any `c` outside the code point range 0..0x10FFFF
function category_code(c)
    # note: utf8proc returns 0, not UTF8PROC_CATEGORY_CN, for unassigned c
    # utf8proc_get_property must only be called with 0 <= c <= 0x10FFFF
    # (see utf8proc_get_property docs), so reject values on either side of
    # that range -- negative values included -- before calling into C
    (c < 0 || c > 0x10FFFF) && return 0x0000
    # the category code is read as the first 16-bit field of the returned
    # property record
    unsafe_load(ccall(:utf8proc_get_property, Ptr{Uint16}, (Int32,), c))
end

# A code point counts as assigned when category_code reports a nonzero
# category for it.
function is_assigned_char(c)
    return category_code(c) != 0
end

# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?

end # module
36 changes: 36 additions & 0 deletions doc/stdlib/base.rst
Original file line number Diff line number Diff line change
Expand Up @@ -945,6 +945,38 @@ Strings

Convert a string to a contiguous UTF-8 string (all characters must be valid UTF-8 characters).

.. function:: normalize_string(s, normalform::Symbol)

Normalize the string ``s`` according to one of the four "normal
forms" of the Unicode standard: ``normalform`` can be ``:NFC``,
``:NFD``, ``:NFKC``, or ``:NFKD``. Normal forms C (canonical
composition) and D (canonical decomposition) convert different
visually identical representations of the same abstract string into
a single canonical form, with form C being more compact. Normal
forms KC and KD additionally canonicalize "compatibility
equivalents": they convert characters that are abstractly similar
but visually distinct into a single canonical choice (e.g. they expand
ligatures into the individual characters), with form KC being more compact.

Alternatively, finer control and additional transformations may
be obtained by calling ``normalize_string(s; keywords...)``, where
any number of the following boolean keyword options (which all default
to ``false`` except for ``compose``) are specified:

* ``compose=false``: do not perform canonical composition
* ``decompose=true``: do canonical decomposition instead of canonical composition (``compose=true`` is ignored if present)
* ``compat=true``: compatibility equivalents are canonicalized
* ``casefold=true``: perform Unicode case folding, e.g. for case-insensitive string comparison
* ``lump=true``: non-standard canonicalization of various similar-looking characters into a single ASCII character, as defined by the utf8proc library (e.g. fraction and division slashes, space characters, dash characters, etc.)
* ``newline2lf=true``, ``newline2ls=true``, or ``newline2ps=true``: convert various newline sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or paragraph-separation (PS) character, respectively
* ``stripmark=true``: strip diacritical marks (e.g. accents)
* ``stripignore=true``: strip Unicode's "default ignorable" characters (e.g. the soft hyphen or the left-to-right mark)
* ``stripcc=true``: strip control characters; horizontal tabs and form feeds are converted to spaces; newlines are also converted to spaces unless a newline-conversion flag was specified
* ``rejectna=true``: throw an error if unassigned code points are found
* ``stable=true``: enforce Unicode Versioning Stability

For example, NFKC corresponds to the options ``compose=true, compat=true, stable=true``.

.. function:: is_valid_ascii(s) -> Bool

Returns true if the string or byte vector is valid ASCII, false otherwise.
Expand All @@ -957,6 +989,10 @@ Strings

Returns true if the given char or integer is a valid Unicode code point.

.. function:: is_assigned_char(c) -> Bool

Returns true if the given char or integer is an assigned Unicode code point.

.. function:: ismatch(r::Regex, s::String) -> Bool

Test whether a string contains a match of the given regular expression.
Expand Down
73 changes: 73 additions & 0 deletions test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -855,3 +855,76 @@ for T = (Uint8,Int8,Uint16,Int16,Uint32,Int32,Uint64,Int64,Uint128,Int128,BigInt
n = T != BigInt ? rand(T) : BigInt(rand(Int128))
@test parseint(T,base(b,n),b) == n
end

# normalize_string (Unicode normalization etc.):
# named normal forms
@test normalize_string("\u006e\u0303", :NFC) == "\u00f1"
@test "\u006e\u0303" == normalize_string("\u00f1", :NFD)
@test normalize_string("\ufb00", :NFC) != "ff"
@test normalize_string("\ufb00", :NFKC) == "ff"
@test normalize_string("\u006e\u0303\ufb00", :NFKC) == "\u00f1"*"ff"
@test normalize_string("\u00f1\ufb00", :NFKD) == "\u006e\u0303"*"ff"
# keyword-option interface
@test normalize_string("\u006e\u0303", compose=true) == "\u00f1"
@test "\u006e\u0303" == normalize_string("\u00f1", decompose=true)
@test normalize_string("\u006e\u0303\u00b5",compat=true) == "\u00f1\u03bc"
@test normalize_string("Σσς",casefold=true) == "σσσ"
@test normalize_string("∕⁄", lump=true) == "//"
# newline conversion: the input holds four newline sequences (LF, LF, CR, CRLF)
@test normalize_string("\ua\n\r\r\ua", newline2lf=true) == "\ua\ua\ua\ua"
@test normalize_string("\ua\n\r\r\ua", newline2ls=true) == "\u2028\u2028\u2028\u2028"
@test normalize_string("\ua\n\r\r\ua", newline2ps=true) == "\u2029\u2029\u2029\u2029"
@test normalize_string("\u00f1", stripmark=true) == "n"
@test isempty(normalize_string("\u00ad", stripignore=true))
# stripcc turns the tab into a space and, with no newline-conversion flag,
# the CR into a space as well, so two spaces are expected here
@test normalize_string("\t\r", stripcc=true) == "  "
# with newline2ls the CR becomes U+2028 instead of a space
@test normalize_string("\t\r", stripcc=true, newline2ls=true) == " \u2028"

#Tests from Unicode Standard Annex #15 (UAX #15), "Unicode Normalization Forms"
#http://www.unicode.org/reports/tr15/

#1. Canonical equivalence
# Defines `==` on Char arrays as "equal after NFC normalization": each array's
# characters are joined into a string with `string(a...)`, both strings are
# normalized to :NFC, and the results are compared.
# NOTE(review): this method is defined at top level rather than scoped to this
# section -- confirm it does not affect other `==` comparisons of Char arrays.
==(a::Array{Char},b::Array{Char}) =
normalize_string(string(a...), :NFC)==normalize_string(string(b...), :NFC)
@test ['C', '̧'] == ['Ç']
@test ['q', '̇', '̣'] == ['q', '̣', '̇']
@test ['가'] == ['ᄀ', 'ᅡ']
@test ['Ω'] == ['Ω']

#2. Compatibility Equivalence
# Defines `==` on Char arrays as "equal after NFKC normalization": each array's
# characters are joined into a string with `string(a...)`, both strings are
# normalized to :NFKC, and the results are compared.
# NOTE(review): this has the same signature as the :NFC-based `==` defined for
# the canonical-equivalence tests -- presumably it replaces that method; confirm.
==(a::Array{Char},b::Array{Char}) =
normalize_string(string(a...), :NFKC)==normalize_string(string(b...), :NFKC)
@test ['ℌ'] == ['ℍ'] == ['H']
@test ['ﻨ'] == ['ﻧ'] == ['ﻦ'] == ['ﻥ']
@test ['①'] == ['1']
@test ['カ'] == ['カ']
@test ['︷'] == ['{']
@test ['⁹'] == ['₉']
@test ['㌀'] == ['ア', 'パ', 'ー', 'ト']
@test ['¼'] == ['1', '⁄', '4']
@test ['dž'] == ['d', 'ž']

#3. Singletons
# U+212B: NFD yields "A" + U+030A, NFC yields U+00C5; U+2126 maps to U+03A9
# under both forms.
@test normalize_string("\U212b", :NFD) == "A\U030a"
@test normalize_string("\U212b", :NFC) == "\U00c5"
@test normalize_string("\U2126", :NFC) == normalize_string("\U2126", :NFD) == "\U03a9"

#4. Canonical Composites
# Precomposed characters are unchanged by NFC and split into base + mark by NFD.
@test normalize_string("\U00c5", :NFC) == "\U00c5"
@test normalize_string("\U00c5", :NFD) == "A\U030a"
@test normalize_string("\U00f4", :NFC) == "\U00f4"
@test normalize_string("\U00f4", :NFD) == "o\U0302"

#5. Multiple Combining Marks
# The expected outputs always place U+0323 before U+0307, whatever the input order.
@test normalize_string("\U1e69", :NFD) == "s\U0323\U0307"
@test normalize_string("\U1e69", :NFC) == "\U1e69"
@test normalize_string("\U1e0b\U0323", :NFD) == "d\U0323\U0307"
@test normalize_string("\U1e0b\U0323", :NFC) == "\U1e0d\U0307"
@test normalize_string("q\U0307\U0323", :NFC) == "q\U0323\U0307"
@test normalize_string("q\U0307\U0323", :NFD) == "q\U0323\U0307"

#6. Compatibility Composites
# NFC/NFD leave compatibility characters (U+FB01, U+2075) untouched; only the
# K forms replace them ("fi", "5").
@test normalize_string("\Ufb01", :NFD) == normalize_string("\Ufb01", :NFC) == "\Ufb01"
@test normalize_string("\Ufb01", :NFKD) == normalize_string("\Ufb01", :NFKC) == "fi"
@test normalize_string("2\U2075", :NFD) == normalize_string("2\U2075", :NFC) == "2\U2075"
@test normalize_string("2\U2075", :NFKD) == normalize_string("2\U2075", :NFKC) == "25"
@test normalize_string("\U1e9b\U0323", :NFD) == "\U017f\U0323\U0307"
@test normalize_string("\U1e9b\U0323", :NFC) == "\U1e9b\U0323"
@test normalize_string("\U1e9b\U0323", :NFKD) == "s\U0323\U0307"
@test normalize_string("\U1e9b\U0323", :NFKC) == "\U1e69"