JuliaLang · ScottPJones · Dec 13, 2015 · Dec 13, 2015 · Dec 14, 2015 · Dec 14, 2015
diff --git a/base/exports.jl b/base/exports.jl
@@ -22,6 +22,7 @@ export
     Serializer,
     Docs,
     Markdown,
+    Unicode,
 
 # Types
     AbstractChannel,
@@ -116,6 +117,7 @@ export
     SymTridiagonal,
     Timer,
     Tridiagonal,
+    UnicodeProperty,
     UnitRange,
     UpperTriangular,
     UTF16String,
@@ -818,6 +820,7 @@ export
     bits,
     bytes2hex,
     bytestring,
+    charprop,
     charwidth,
     chomp,
     chop,

diff --git a/base/io.jl b/base/io.jl
@@ -176,15 +176,15 @@ function read(s::IO, ::Type{Char})
     end
 
     # mimic utf8.next function
-    trailing = Base.utf8_trailing[ch+1]
+    trailing = Unicode.utf8_trailing[ch+1]
     c::UInt32 = 0
     for j = 1:trailing
         c += ch
         c <<= 6
         ch = read(s, UInt8)
     end
     c += ch
-    c -= Base.utf8_offset[trailing+1]
+    c -= Unicode.utf8_offset[trailing+1]
     Char(c)
 end
 

diff --git a/base/unicode.jl b/base/unicode.jl
@@ -1,10 +1,23 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
+module Unicode  # UTF-8/16/32 string types, validation and character properties
+import Base: string, convert, write, length, endof, next, reverseind, lastidx, reverse, isvalid,
+             sizeof, unsafe_convert, map, getindex, search, rsearch, pointer, containsnul,
+             lowercase, uppercase, eltype, isless, promote_rule, ==
+
+export UnicodeError, UTF16String, UTF32String, unsafe_checkstring, checkstring,
+       utf8, utf16, utf32, containsnul, WString, wstring, charprop, Category,
+       is_assigned_char, islower, isupper, isdigit, isalpha, isnumber, isalnum, iscntrl,
+       ispunct, isspace, isprint, isgraph,
+       isgraphemebreak, GraphemeIterator, normalize_string, graphemes, charwidth
+
 include("unicode/UnicodeError.jl")
 include("unicode/types.jl")
 include("unicode/checkstring.jl")
 include("unicode/utf8.jl")
 include("unicode/utf16.jl")
 include("unicode/utf32.jl")
+include("unicode/properties.jl")
 include("unicode/utf8proc.jl")
-importall .UTF8proc
+end # module Unicode
+importall .Unicode
diff --git a/base/unicode/UnicodeError.jl b/base/unicode/UnicodeError.jl
@@ -2,30 +2,31 @@
 
 ##    Error messages for Unicode / UTF support
 
-const UTF_ERR_SHORT             = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)"
-const UTF_ERR_CONT              = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
-const UTF_ERR_LONG              = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
-const UTF_ERR_NOT_LEAD          = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
-const UTF_ERR_NOT_TRAIL         = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
-const UTF_ERR_NOT_SURROGATE     = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>"
-const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
-const UTF_ERR_INVALID           = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
-const UTF_ERR_SURROGATE         = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
-const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
-const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
-const UTF_ERR_ODD_BYTES_16      = "UTF16String can't have odd number of bytes <<1>>"
-const UTF_ERR_ODD_BYTES_32      = "UTF32String must have multiple of 4 bytes <<1>>"
-const UTF_ERR_INVALID_CHAR      = "invalid Unicode character (0x<<2>> > 0x10ffff)"
-const UTF_ERR_INVALID_8         = "invalid UTF-8 data"
-const UTF_ERR_INVALID_16        = "invalid UTF-16 data"
-const UTF_ERR_INVALID_INDEX     = "invalid character index"
-const UTF_ERR_MAP_CHAR          = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
+const ERR_SHORT             = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)"
+const ERR_CONT              = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
+const ERR_LONG              = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
+const ERR_NOT_LEAD          = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const ERR_NOT_TRAIL         = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const ERR_NOT_SURROGATE     = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
+const ERR_INVALID           = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
+const ERR_SURROGATE         = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
+const ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
+const ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
+const ERR_ODD_BYTES_16      = "UTF16String can't have odd number of bytes <<1>>"
+const ERR_ODD_BYTES_32      = "UTF32String must have multiple of 4 bytes <<1>>"
+const ERR_INVALID_CHAR      = "invalid Unicode character (0x<<2>> > 0x10ffff)"
+const ERR_INVALID_8         = "invalid UTF-8 data"
+const ERR_INVALID_16        = "invalid UTF-16 data"
+const ERR_INVALID_INDEX     = "invalid character index"
+const ERR_MAP_CHAR          = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
 
 type UnicodeError <: Exception
-    errmsg::AbstractString      ##< A UTF_ERR_ message
+    errmsg::AbstractString      ##< A Unicode.ERR_ message template; show() fills in <<1>> and <<2>>
     errpos::Int32               ##< Position of invalid character
     errchr::UInt32              ##< Invalid character
 end
 
-show(io::IO, exc::UnicodeError) = print(io, replace(replace(string("UnicodeError: ",exc.errmsg),
-    "<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr)))
+Base.show(io::IO, exc::UnicodeError) =  # substitute errpos for <<1>> and hex(errchr) for <<2>>
+    print(io, replace(replace(string("UnicodeError: ",exc.errmsg),
+                              "<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr)))
diff --git a/base/unicode/checkstring.jl b/base/unicode/checkstring.jl
@@ -20,7 +20,7 @@ const UTF_SURROGATE = 32        ##< surrogate pairs present
 ## Get a UTF-8 continuation byte, give error if invalid, return updated character value
 @inline function get_continuation(ch::UInt32, byt::UInt8, pos)
     if !is_valid_continuation(byt)
-        throw(UnicodeError(UTF_ERR_CONT, pos, byt))
+        throw(UnicodeError(ERR_CONT, pos, byt))  # pos/byt are reported as errpos/errchr
     end
     (ch << 6) | (byt & 0x3f)
 end
@@ -73,7 +73,7 @@ function unsafe_checkstring(dat::Vector{UInt8},
             # Check UTF-8 encoding
             if ch < 0xe0
                 # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
-                (pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                (pos > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch))
                 byt, pos = next(dat, pos)
                 ch = get_continuation(ch & 0x3f, byt, pos)
                 if ch > 0x7f
@@ -84,28 +84,28 @@ function unsafe_checkstring(dat::Vector{UInt8},
                 elseif (ch == 0) && accept_long_null
                     flags |= UTF_LONG
                 else
-                    throw(UnicodeError(UTF_ERR_LONG, pos, ch))
+                    throw(UnicodeError(ERR_LONG, pos, ch))
                 end
              elseif ch < 0xf0
                 # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
-                (pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                (pos + 1 > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch))
                 byt, pos = next(dat, pos)
                 ch = get_continuation(ch & 0x0f, byt, pos)
                 byt, pos = next(dat, pos)
                 ch = get_continuation(ch, byt, pos)
                 # check for surrogate pairs, make sure correct
                 if is_surrogate_codeunit(ch)
-                    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
+                    !is_surrogate_lead(ch) && throw(UnicodeError(ERR_NOT_LEAD, pos-2, ch))
                     # next character *must* be a trailing surrogate character
-                    (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
+                    (pos + 2 > endpos) && throw(UnicodeError(ERR_MISSING_SURROGATE, pos-2, ch))
                     byt, pos = next(dat, pos)
-                    (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
+                    (byt != 0xed) && throw(UnicodeError(ERR_NOT_TRAIL, pos, byt))
                     byt, pos = next(dat, pos)
                     surr = get_continuation(0x0000d, byt, pos)
                     byt, pos = next(dat, pos)
                     surr = get_continuation(surr, byt, pos)
-                    !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
-                    !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
+                    !is_surrogate_trail(surr) && throw(UnicodeError(ERR_NOT_TRAIL, pos-2, surr))
+                    !accept_surrogates && throw(UnicodeError(ERR_SURROGATE, pos-2, surr))
                     flags |= UTF_SURROGATE
                     num4byte += 1
                 elseif ch > 0x07ff
@@ -114,23 +114,23 @@ function unsafe_checkstring(dat::Vector{UInt8},
                     flags |= UTF_LONG
                     num2byte += 1
                 else
-                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
+                    throw(UnicodeError(ERR_LONG, pos-2, ch))
                 end
             elseif ch < 0xf5
                 # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
-                (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                (pos + 2 > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch))
                 byt, pos = next(dat, pos)
                 ch = get_continuation(ch & 0x07, byt, pos)
                 byt, pos = next(dat, pos)
                 ch = get_continuation(ch, byt, pos)
                 byt, pos = next(dat, pos)
                 ch = get_continuation(ch, byt, pos)
                 if ch > 0x10ffff
-                    throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
+                    throw(UnicodeError(ERR_INVALID, pos-3, ch))
                 elseif ch > 0xffff
                     num4byte += 1
                 elseif is_surrogate_codeunit(ch)
-                    throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
+                    throw(UnicodeError(ERR_SURROGATE, pos-3, ch))
                 elseif accept_long_char
                     # This is an overly long encoded character
                     flags |= UTF_LONG
@@ -140,10 +140,10 @@ function unsafe_checkstring(dat::Vector{UInt8},
                         num2byte += 1
                     end
                 else
-                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
+                    throw(UnicodeError(ERR_LONG, pos-2, ch))
                 end
             else
-                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
+                throw(UnicodeError(ERR_INVALID, pos, ch))
             end
         end
     end
@@ -174,22 +174,22 @@ function unsafe_checkstring{T <: Union{Vector{UInt16}, Vector{UInt32}, AbstractS
                 num2byte += 1
                 flags |= UTF_UNICODE2
             elseif ch > 0x0ffff
-                (ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
+                (ch > 0x10ffff) && throw(UnicodeError(ERR_INVALID, pos, ch))
                 num4byte += 1
             elseif !is_surrogate_codeunit(ch)
                 num3byte += 1
             elseif is_surrogate_lead(ch)
-                pos > endpos && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, ch))
+                pos > endpos && throw(UnicodeError(ERR_MISSING_SURROGATE, pos, ch))
                 # next character *must* be a trailing surrogate character
                 ch, pos = next(dat, pos)
-                !is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch))
+                !is_surrogate_trail(ch) && throw(UnicodeError(ERR_NOT_TRAIL, pos, ch))
                 num4byte += 1
                 if T != Vector{UInt16}
-                    !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch))
+                    !accept_surrogates && throw(UnicodeError(ERR_SURROGATE, pos, ch))
                     flags |= UTF_SURROGATE
                 end
             else
-                throw(UnicodeError(UTF_ERR_NOT_LEAD, pos, ch))
+                throw(UnicodeError(ERR_NOT_LEAD, pos, ch))
             end
         end
     end