From 8365c8467010328f4d6838759c22deaee6574ab5 Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Sat, 12 Dec 2015 19:59:05 -0500
Subject: [PATCH 1/7] Unicode character properties

---
 base/exports.jl              |   5 +
 base/io.jl                   |   4 +-
 base/unicode.jl              |  17 ++-
 base/unicode/UnicodeError.jl |  43 ++++----
 base/unicode/checkstring.jl  |  40 +++----
 base/unicode/properties.jl   | 157 ++++++++++++++++++++++++++
 base/unicode/types.jl        |  21 ++--
 base/unicode/utf16.jl        |  16 +--
 base/unicode/utf32.jl        |   4 +-
 base/unicode/utf8.jl         |  12 +-
 base/unicode/utf8proc.jl     | 208 ++++++++++-------------------------
 test/unicode.jl              |   3 +-
 test/unicode/UnicodeError.jl |   2 +-
 test/unicode/properties.jl   | 164 +++++++++++++++++++++++++++
 test/unicode/utf8proc.jl     | 163 ---------------------------
 15 files changed, 474 insertions(+), 385 deletions(-)
 create mode 100644 base/unicode/properties.jl
 create mode 100644 test/unicode/properties.jl

diff --git a/base/exports.jl b/base/exports.jl
index e05c856bd9a3f..00d714ceb74be 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -22,6 +22,7 @@ export
     Serializer,
     Docs,
     Markdown,
+    Unicode,
 
 # Types
     AbstractChannel,
@@ -40,6 +41,8 @@ export
     CartesianIndex,
     CartesianRange,
     Channel,
+    CharCategory,
+    CharCategoryCode,
     Cmd,
     Colon,
     Complex,
@@ -116,6 +119,7 @@ export
     SymTridiagonal,
     Timer,
     Tridiagonal,
+    UnicodeProperty,
     UnitRange,
     UpperTriangular,
     UTF16String,
@@ -818,6 +822,7 @@ export
     bits,
     bytes2hex,
     bytestring,
+    charprop,
     charwidth,
     chomp,
     chop,
diff --git a/base/io.jl b/base/io.jl
index d2f17595b5f33..eaf25d7bae5d1 100644
--- a/base/io.jl
+++ b/base/io.jl
@@ -176,7 +176,7 @@ function read(s::IO, ::Type{Char})
     end
 
     # mimic utf8.next function
-    trailing = Base.utf8_trailing[ch+1]
+    trailing = Unicode.utf8_trailing[ch+1]
     c::UInt32 = 0
     for j = 1:trailing
         c += ch
@@ -184,7 +184,7 @@ function read(s::IO, ::Type{Char})
         ch = read(s, UInt8)
     end
     c += ch
-    c -= Base.utf8_offset[trailing+1]
+    c -= Unicode.utf8_offset[trailing+1]
     Char(c)
 end
 
diff --git a/base/unicode.jl b/base/unicode.jl
index e0ed8b5d1b0a8..a0f6096e75289 100644
--- a/base/unicode.jl
+++ b/base/unicode.jl
@@ -1,10 +1,25 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
+module Unicode
+import Base: string, convert, write, length, endof, next, reverseind, lastidx, reverse, isvalid,
+             sizeof, unsafe_convert, map, getindex, search, rsearch, pointer, containsnul,
+             lowercase, uppercase, eltype, isless, promote_rule, ==
+
+export UnicodeError, UTF16String, UTF32String, unsafe_checkstring, checkstring,
+       utf8, utf16, utf32, containsnul, WString, wstring,
+       charprop, CharCategoryCode, UnicodeProperty, CharCategory, CatLetter, CatMark, CatNumber,
+       CatPunctuation, CatSymbol, CatSeparator, CatOther, CatUpper,
+       is_assigned_char, islower, isupper, isdigit, isalpha, isnumber, isalnum, iscntrl,
+       ispunct, isspace, isprint, isgraph,
+       isgraphemebreak, GraphemeIterator, normalize_string, graphemes, charwidth
+
 include("unicode/UnicodeError.jl")
 include("unicode/types.jl")
 include("unicode/checkstring.jl")
 include("unicode/utf8.jl")
 include("unicode/utf16.jl")
 include("unicode/utf32.jl")
+include("unicode/properties.jl")
 include("unicode/utf8proc.jl")
-importall .UTF8proc
+end
+importall .Unicode
diff --git a/base/unicode/UnicodeError.jl b/base/unicode/UnicodeError.jl
index 5b9002729ccf3..c2626f5167af5 100644
--- a/base/unicode/UnicodeError.jl
+++ b/base/unicode/UnicodeError.jl
@@ -2,30 +2,31 @@
 
 ##    Error messages for Unicode / UTF support
 
-const UTF_ERR_SHORT             = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)"
-const UTF_ERR_CONT              = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
-const UTF_ERR_LONG              = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
-const UTF_ERR_NOT_LEAD          = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
-const UTF_ERR_NOT_TRAIL         = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
-const UTF_ERR_NOT_SURROGATE     = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>"
-const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
-const UTF_ERR_INVALID           = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
-const UTF_ERR_SURROGATE         = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
-const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
-const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
-const UTF_ERR_ODD_BYTES_16      = "UTF16String can't have odd number of bytes <<1>>"
-const UTF_ERR_ODD_BYTES_32      = "UTF32String must have multiple of 4 bytes <<1>>"
-const UTF_ERR_INVALID_CHAR      = "invalid Unicode character (0x<<2>> > 0x10ffff)"
-const UTF_ERR_INVALID_8         = "invalid UTF-8 data"
-const UTF_ERR_INVALID_16        = "invalid UTF-16 data"
-const UTF_ERR_INVALID_INDEX     = "invalid character index"
-const UTF_ERR_MAP_CHAR          = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
+const ERR_SHORT             = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)"
+const ERR_CONT              = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
+const ERR_LONG              = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
+const ERR_NOT_LEAD          = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const ERR_NOT_TRAIL         = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const ERR_NOT_SURROGATE     = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
+const ERR_INVALID           = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
+const ERR_SURROGATE         = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
+const ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
+const ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
+const ERR_ODD_BYTES_16      = "UTF16String can't have odd number of bytes <<1>>"
+const ERR_ODD_BYTES_32      = "UTF32String must have multiple of 4 bytes <<1>>"
+const ERR_INVALID_CHAR      = "invalid Unicode character (0x<<2>> > 0x10ffff)"
+const ERR_INVALID_8         = "invalid UTF-8 data"
+const ERR_INVALID_16        = "invalid UTF-16 data"
+const ERR_INVALID_INDEX     = "invalid character index"
+const ERR_MAP_CHAR          = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
 
 type UnicodeError <: Exception
-    errmsg::AbstractString      ##< A UTF_ERR_ message
+    errmsg::AbstractString      ##< A Unicode.ERR_ message
     errpos::Int32               ##< Position of invalid character
     errchr::UInt32              ##< Invalid character
 end
 
-show(io::IO, exc::UnicodeError) = print(io, replace(replace(string("UnicodeError: ",exc.errmsg),
-    "<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr)))
+Base.show(io::IO, exc::UnicodeError) =  # fill <<1>> with errpos and <<2>> with hex(errchr)
+    print(io, replace(replace(string("UnicodeError: ",exc.errmsg),
+                              "<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr)))
diff --git a/base/unicode/checkstring.jl b/base/unicode/checkstring.jl
index 8b9f344831f95..df185ede675f3 100644
--- a/base/unicode/checkstring.jl
+++ b/base/unicode/checkstring.jl
@@ -20,7 +20,7 @@ const UTF_SURROGATE = 32        ##< surrogate pairs present
 ## Get a UTF-8 continuation byte, give error if invalid, return updated character value
 @inline function get_continuation(ch::UInt32, byt::UInt8, pos)
     if !is_valid_continuation(byt)
-        throw(UnicodeError(UTF_ERR_CONT, pos, byt))
+        throw(UnicodeError(ERR_CONT, pos, byt))
     end
     (ch << 6) | (byt & 0x3f)
 end
@@ -73,7 +73,7 @@ function unsafe_checkstring(dat::Vector{UInt8},
             # Check UTF-8 encoding
             if ch < 0xe0
                 # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
-                (pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                (pos > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch))
                 byt, pos = next(dat, pos)
                 ch = get_continuation(ch & 0x3f, byt, pos)
                 if ch > 0x7f
@@ -84,28 +84,28 @@ function unsafe_checkstring(dat::Vector{UInt8},
                 elseif (ch == 0) && accept_long_null
                     flags |= UTF_LONG
                 else
-                    throw(UnicodeError(UTF_ERR_LONG, pos, ch))
+                    throw(UnicodeError(ERR_LONG, pos, ch))
                 end
              elseif ch < 0xf0
                 # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
-                (pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                (pos + 1 > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch))
                 byt, pos = next(dat, pos)
                 ch = get_continuation(ch & 0x0f, byt, pos)
                 byt, pos = next(dat, pos)
                 ch = get_continuation(ch, byt, pos)
                 # check for surrogate pairs, make sure correct
                 if is_surrogate_codeunit(ch)
-                    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
+                    !is_surrogate_lead(ch) && throw(UnicodeError(ERR_NOT_LEAD, pos-2, ch))
                     # next character *must* be a trailing surrogate character
-                    (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
+                    (pos + 2 > endpos) && throw(UnicodeError(ERR_MISSING_SURROGATE, pos-2, ch))
                     byt, pos = next(dat, pos)
-                    (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
+                    (byt != 0xed) && throw(UnicodeError(ERR_NOT_TRAIL, pos, byt))
                     byt, pos = next(dat, pos)
                     surr = get_continuation(0x0000d, byt, pos)
                     byt, pos = next(dat, pos)
                     surr = get_continuation(surr, byt, pos)
-                    !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
-                    !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
+                    !is_surrogate_trail(surr) && throw(UnicodeError(ERR_NOT_TRAIL, pos-2, surr))
+                    !accept_surrogates && throw(UnicodeError(ERR_SURROGATE, pos-2, surr))
                     flags |= UTF_SURROGATE
                     num4byte += 1
                 elseif ch > 0x07ff
@@ -114,11 +114,11 @@ function unsafe_checkstring(dat::Vector{UInt8},
                     flags |= UTF_LONG
                     num2byte += 1
                 else
-                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
+                    throw(UnicodeError(ERR_LONG, pos-2, ch))
                 end
             elseif ch < 0xf5
                 # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
-                (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                (pos + 2 > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch))
                 byt, pos = next(dat, pos)
                 ch = get_continuation(ch & 0x07, byt, pos)
                 byt, pos = next(dat, pos)
@@ -126,11 +126,11 @@ function unsafe_checkstring(dat::Vector{UInt8},
                 byt, pos = next(dat, pos)
                 ch = get_continuation(ch, byt, pos)
                 if ch > 0x10ffff
-                    throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
+                    throw(UnicodeError(ERR_INVALID, pos-3, ch))
                 elseif ch > 0xffff
                     num4byte += 1
                 elseif is_surrogate_codeunit(ch)
-                    throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
+                    throw(UnicodeError(ERR_SURROGATE, pos-3, ch))
                 elseif accept_long_char
                     # This is an overly long encoded character
                     flags |= UTF_LONG
@@ -140,10 +140,10 @@ function unsafe_checkstring(dat::Vector{UInt8},
                         num2byte += 1
                     end
                 else
-                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
+                    throw(UnicodeError(ERR_LONG, pos-2, ch))
                 end
             else
-                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
+                throw(UnicodeError(ERR_INVALID, pos, ch))
             end
         end
     end
@@ -174,22 +174,22 @@ function unsafe_checkstring{T <: Union{Vector{UInt16}, Vector{UInt32}, AbstractS
                 num2byte += 1
                 flags |= UTF_UNICODE2
             elseif ch > 0x0ffff
-                (ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
+                (ch > 0x10ffff) && throw(UnicodeError(ERR_INVALID, pos, ch))
                 num4byte += 1
             elseif !is_surrogate_codeunit(ch)
                 num3byte += 1
             elseif is_surrogate_lead(ch)
-                pos > endpos && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, ch))
+                pos > endpos && throw(UnicodeError(ERR_MISSING_SURROGATE, pos, ch))
                 # next character *must* be a trailing surrogate character
                 ch, pos = next(dat, pos)
-                !is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch))
+                !is_surrogate_trail(ch) && throw(UnicodeError(ERR_NOT_TRAIL, pos, ch))
                 num4byte += 1
                 if T != Vector{UInt16}
-                    !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch))
+                    !accept_surrogates && throw(UnicodeError(ERR_SURROGATE, pos, ch))
                     flags |= UTF_SURROGATE
                 end
             else
-                throw(UnicodeError(UTF_ERR_NOT_LEAD, pos, ch))
+                throw(UnicodeError(ERR_NOT_LEAD, pos, ch))
             end
         end
     end
diff --git a/base/unicode/properties.jl b/base/unicode/properties.jl
new file mode 100644
index 0000000000000..e2cf4e99b6a00
--- /dev/null
+++ b/base/unicode/properties.jl
@@ -0,0 +1,157 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+# Unicode properties, such as General Category
+# Unix/C is* convenience functions (for now)
+
+# valid Unicode scalar values are 0-0xd7ff and 0xe000-0x10ffff (surrogates are excluded)
+isvalid(::Type{Char}, ch::Unsigned) = (ch - 0xd800 >= 0x800) & (ch <= 0x10ffff)
+isvalid(::Type{Char}, ch::Integer)  = isvalid(Char, Unsigned(ch))
+isvalid(::Type{Char}, ch::Char)     = isvalid(Char, UInt32(ch))
+# one-argument form: validate the code point held by the Char itself
+isvalid(ch::Char) = isvalid(Char, ch)
+
+# Unicode General Category types and codes
+
+"""Abstract supertype of all Unicode character properties"""
+abstract UnicodeProperty
+"""Abstract supertype of the Unicode character category types"""
+abstract CharCategory   <: UnicodeProperty
+
+"""Unicode Letter character category"""
+abstract CatLetter      <: CharCategory
+"""Unicode Mark character category"""
+abstract CatMark        <: CharCategory
+"""Unicode Numeric character category"""
+abstract CatNumber      <: CharCategory
+"""Unicode Punctuation character category"""
+abstract CatPunctuation <: CharCategory
+"""Unicode Symbol character category"""
+abstract CatSymbol      <: CharCategory
+"""Unicode Separator character category"""
+abstract CatSeparator   <: CharCategory
+"""Unicode Other character category"""
+abstract CatOther       <: CharCategory
+
+"""Unicode uppercase & titlecase letters (subtype of CatLetter)"""
+abstract CatUpper       <: CatLetter
+
+"""Unicode Character Category Code (0-29), stored in 8 bits"""
+bitstype 8 CharCategoryCode
+# Integer -> code keeps only the low 8 bits (x%UInt8); ordering compares the codes as UInt32
+convert(::Type{CharCategoryCode}, x::Integer) = reinterpret(CharCategoryCode, x%UInt8)
+convert{T<:Integer}(::Type{T}, x::CharCategoryCode) = convert(T, reinterpret(UInt8, x))
+promote_rule{T<:Integer}(::Type{T}, ::Type{CharCategoryCode}) = T
+isless(x::CharCategoryCode, y::CharCategoryCode) = isless(UInt32(x), UInt32(y))
+isless(x::CharCategoryCode, y::Integer) = isless(UInt32(x), y)
+isless(x::Integer, y::CharCategoryCode) = isless(x, UInt32(y))
+
+for (nam, val, cat, typ, des) in   # (code name, code value, type name, supertype, description)
+    ((:Cn, 0,  :NotAssignedChar,    CatOther,       "Other, Not assigned"),
+     (:Lu, 1,  :UpperCase,          CatUpper,       "Letter, uppercase"),
+     (:Ll, 2,  :LowerCase,          CatLetter,      "Letter, lowercase"),
+     (:Lt, 3,  :TitleCase,          CatUpper,       "Letter, titlecase"),
+     (:Lm, 4,  :ModifierLetter,     CatLetter,      "Letter, modifier"),
+     (:Lo, 5,  :OtherLetter,        CatLetter,      "Letter, other"),
+     (:Mn, 6,  :NonSpacingMark,     CatMark,        "Mark, nonspacing"),
+     (:Mc, 7,  :CombiningMark,      CatMark,        "Mark, spacing combining"),
+     (:Me, 8,  :EnclosingMark,      CatMark,        "Mark, enclosing"),
+     (:Nd, 9,  :DecimalDigit,       CatNumber,      "Number, decimal digit"),
+     (:Nl, 10, :NumericLetter,      CatNumber,      "Number, letter"),
+     (:No, 11, :OtherNumber,        CatNumber,      "Number, other"),
+     (:Pc, 12, :ConnectorPunct,     CatPunctuation, "Punctuation, connector"),
+     (:Pd, 13, :DashPunct,          CatPunctuation, "Punctuation, dash"),
+     (:Ps, 14, :OpenPunct,          CatPunctuation, "Punctuation, open"),
+     (:Pe, 15, :ClosePunct,         CatPunctuation, "Punctuation, close"),
+     (:Pi, 16, :BegQuotePunct,      CatPunctuation, "Punctuation, initial quote"),
+     (:Pf, 17, :EndQuotePunct,      CatPunctuation, "Punctuation, final quote"),
+     (:Po, 18, :OtherPunct,         CatPunctuation, "Punctuation, other"),
+     (:Sm, 19, :MathSymbol,         CatSymbol,      "Symbol, math"),
+     (:Sc, 20, :CurrencySymbol,     CatSymbol,      "Symbol, currency"),
+     (:Sk, 21, :ModifierSymbol,     CatSymbol,      "Symbol, modifier"),
+     (:So, 22, :OtherSymbol,        CatSymbol,      "Symbol, other"),
+     (:Zs, 23, :SpaceSeparator,     CatSeparator,   "Separator, space"),
+     (:Zl, 24, :LineSeparator,      CatSeparator,   "Separator, line"),
+     (:Zp, 25, :ParagraphSeparator, CatSeparator,   "Separator, paragraph"),
+     (:Cc, 26, :ControlChar,        CatOther,       "Other, control"),
+     (:Cf, 27, :FormatChar,         CatOther,       "Other, format"),
+     (:Cs, 28, :SurrogateChar,      CatOther,       "Other, surrogate"),
+     (:Co, 29, :PrivateUseChar,     CatOther,       "Other, private use"))
+    @eval const global $nam = CharCategoryCode($val)   # two-letter code constant (not exported here)
+    @eval export $cat                                   # the descriptive type name is exported
+    @eval abstract $cat <: $typ                         # category type under its general group
+    @eval @doc $(string("Unicode Category Code: ",des)) $nam
+    @eval @doc $(string("Unicode Category Type: ",des)) $cat
+end
+
+const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, OtherLetter,
+             NonSpacingMark, CombiningMark, EnclosingMark,   # types in code order: c2t[code+1]
+             DecimalDigit, NumericLetter, OtherNumber,
+             ConnectorPunct, DashPunct, OpenPunct, ClosePunct,
+             BegQuotePunct, EndQuotePunct, OtherPunct,
+             MathSymbol, CurrencySymbol, ModifierSymbol, OtherSymbol,
+             SpaceSeparator, LineSeparator, ParagraphSeparator,
+             ControlChar, FormatChar, SurrogateChar, PrivateUseChar]
+
+############################################################################
+
+# Generic function; its methods dispatch on the requested property type (first argument)
+"""
+Return the Unicode property selected by the first argument (a property type) for a character
+"""
+function charprop end
+# category type: look up the zero-based category code in the one-based table c2t
+charprop(::Type{CharCategory}, c) = c2t[Int(charprop(CharCategoryCode, c))+1]
+# a character counts as assigned unless its category code is Cn (not assigned)
+is_assigned_char(c) = charprop(CharCategoryCode, c) != Cn
+
+## libc character class predicates ##
+# the range tests below rely on the code numbering: Lu..Lo are the letters, Nd..No the numbers
+islower(c::Char) = charprop(CharCategoryCode, c) == Ll
+
+# true for Unicode uppercase (Lu) and titlecase (Lt) letters
+isupper(c::Char) = (ccode = charprop(CharCategoryCode, c)) == Lu || ccode == Lt
+# isdigit is ASCII-only ('0'-'9'); isnumber accepts the categories Nd, Nl and No
+isdigit(c::Char)  = ('0' <= c <= '9')
+isalpha(c::Char)  = (Lu <= charprop(CharCategoryCode, c) <= Lo)
+isnumber(c::Char) = (Nd <= charprop(CharCategoryCode, c) <= No)
+isalnum(c::Char)  = (Lu <= (ccode = charprop(CharCategoryCode, c)) <= Lo) || (Nd <= ccode <= No)
+
+# The forms below are about 3 times slower, because the subtype test (`<:`)
+# is much slower than checking if an integer is within a range (or two ranges).
+# If that is sped up, then these, which are more readable, could replace the forms above.
+#=
+isalpha(c::Char)  = charprop(CharCategory, c) <: CatLetter
+isnumber(c::Char) = charprop(CharCategory, c) <: CatNumber
+isupper(c::Char)  = charprop(CharCategory, c) <: CatUpper
+isalnum(c::Char)  = charprop(CharCategory, c) <: Union{CatLetter, CatNumber}
+ispunct(c::Char)  = charprop(CharCategory, c) <: CatPunctuation
+=#
+
+# following C++, only control characters from the Latin-1 subset (0-0x1f, 0x7f-0x9f) return true
+iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f))
+# punctuation: the category codes Pc through Po form one contiguous range
+ispunct(c::Char) = (Pc <= charprop(CharCategoryCode, c) <= Po)
+
+# \u85 is the Unicode Next Line (NEL) character
+# the check for \ufffd allows for branch removal on ASCIIStrings
+@inline isspace(c::Char) =
+    (c == ' ' || '\t' <= c <='\r' || c == '\u85' ||
+     ('\ua0' <= c && c != '\ufffd' && charprop(CharCategoryCode, c) == Zs))
+# printable: any category from Lu through Zs, i.e. everything isgraph accepts plus Zs
+isprint(c::Char) = (Lu <= charprop(CharCategoryCode, c) <= Zs)
+
+# true in principle if a printer would use ink
+isgraph(c::Char) = (Lu <= charprop(CharCategoryCode, c) <= So)
+
+# Extend each character predicate listed here with a method for whole strings.
+for name in ("alnum", "alpha", "cntrl", "digit", "number", "graph",
+             "lower", "print", "punct", "space", "upper")
+    pred = symbol("is",name)
+    # true when every character of `s` satisfies `pred` (hence also for an empty string)
+    @eval function $pred(s::AbstractString)
+        for ch in s
+            $pred(ch) || return false
+        end
+        return true
+    end
+end
diff --git a/base/unicode/types.jl b/base/unicode/types.jl
index 52765a853303b..9c6dc990921e4 100644
--- a/base/unicode/types.jl
+++ b/base/unicode/types.jl
@@ -1,29 +1,31 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
-##\brief      Base UTF16String type, has 16-bit NULL termination word after data, native byte order
-#
-# \throws     UnicodeError
+"""
+Base UTF16String type, has 16-bit NULL termination word after data, native byte order
 
+Throws: UnicodeError if `data` is empty or its last element is not 0
+"""
 immutable UTF16String <: AbstractString
     data::Vector{UInt16} # includes 16-bit NULL termination after string chars
     function UTF16String(data::Vector{UInt16})
         if length(data) < 1 || data[end] != 0
-            throw(UnicodeError(UTF_ERR_NULL_16_TERMINATE, 0, 0))
+            throw(UnicodeError(ERR_NULL_16_TERMINATE, 0, 0))
         end
         new(data)
     end
 end
 
-##\brief      Base UTF32String type, has 32-bit NULL termination word after data, native byte order
-#
-# \throws     UnicodeError
+"""
+Base UTF32String type, has 32-bit NULL termination word after data, native byte order
 
+Throws: UnicodeError if `data` is empty or its last element is not 0
+"""
 immutable UTF32String <: DirectIndexString
     data::Vector{UInt32} # includes 32-bit NULL termination after string chars
 
     function UTF32String(data::Vector{UInt32})
         if length(data) < 1 || data[end] != 0
-            throw(UnicodeError(UTF_ERR_NULL_32_TERMINATE, 0, 0))
+            throw(UnicodeError(ERR_NULL_32_TERMINATE, 0, 0))
         end
         new(data)
     end
@@ -31,4 +33,5 @@ end
 UTF32String(data::Vector{Char}) = UTF32String(reinterpret(UInt32, data))
 
 isvalid{T<:Union{ASCIIString,UTF8String,UTF16String,UTF32String}}(str::T) = isvalid(T, str.data)
-isvalid{T<:Union{ASCIIString,UTF8String,UTF16String,UTF32String}}(::Type{T}, str::T) = isvalid(T, str.data)
+isvalid{T<:Union{ASCIIString,UTF8String,UTF16String,UTF32String}}(::Type{T}, str::T) =
+    isvalid(T, str.data)
diff --git a/base/unicode/utf16.jl b/base/unicode/utf16.jl
index 712adbb75a896..551b8f97b3f25 100644
--- a/base/unicode/utf16.jl
+++ b/base/unicode/utf16.jl
@@ -54,10 +54,10 @@ function next(s::UTF16String, i::Int)
     ch = s.data[i]
     !is_surrogate_codeunit(ch) && return (Char(ch), i+1)
     # check length, account for terminating \0
-    i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)))
-    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch))
+    i >= (length(s.data)-1) && throw(UnicodeError(ERR_MISSING_SURROGATE, i, UInt32(ch)))
+    !is_surrogate_lead(ch) && throw(UnicodeError(ERR_NOT_LEAD, i, ch))
     ct = s.data[i+1]
-    !is_surrogate_trail(ct) && throw((UTF_ERR_NOT_TRAIL, i, ch))
+    !is_surrogate_trail(ct) && throw(UnicodeError(ERR_NOT_TRAIL, i+1, ct))
     Char(get_supplementary(ch, ct)), i+2
 end
 
@@ -222,7 +222,7 @@ end
 
 function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
     isempty(bytes) && return UTF16String(UInt16[0])
-    isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
+    isodd(length(bytes)) && throw(UnicodeError(ERR_ODD_BYTES_16, length(bytes), 0))
     data = reinterpret(UInt16, bytes)
     # check for byte-order mark (BOM):
     if data[1] == 0xfeff        # native byte order
@@ -238,7 +238,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
         copy!(d,1, data,1, length(data)) # assume native byte order
     end
     d[end] = 0 # NULL terminate
-    !isvalid(UTF16String, d) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
+    !isvalid(UTF16String, d) && throw(UnicodeError(ERR_INVALID_16,0,0))
     UTF16String(d)
 end
 
@@ -257,19 +257,19 @@ function map(fun, str::UTF16String)
     for ch in str
         c2 = fun(ch)
         if !isa(c2, Char)
-            throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
+            throw(UnicodeError(ERR_MAP_CHAR, 0, 0))
         end
         uc = UInt32(c2)
         if uc < 0x10000
             if is_surrogate_codeunit(UInt16(uc))
-                throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc))
+                throw(UnicodeError(ERR_INVALID_CHAR, 0, uc))
             end
             push!(buf, UInt16(uc))
         elseif uc <= 0x10ffff
             push!(buf, UInt16(0xd7c0 + (uc >> 10)))
             push!(buf, UInt16(0xdc00 + (uc & 0x3ff)))
         else
-            throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc))
+            throw(UnicodeError(ERR_INVALID_CHAR, 0, uc))
         end
     end
     push!(buf, 0)
diff --git a/base/unicode/utf32.jl b/base/unicode/utf32.jl
index 4b9ebeee4f278..b53fff9450e34 100644
--- a/base/unicode/utf32.jl
+++ b/base/unicode/utf32.jl
@@ -149,7 +149,7 @@ unsafe_convert{T<:Union{UInt32,Int32,Char}}(::Type{Ptr{T}}, s::UTF32String) =
 
 function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
     isempty(bytes) && return empty_utf32
-    length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0))
+    length(bytes) & 3 != 0 && throw(UnicodeError(ERR_ODD_BYTES_32,0,0))
     data = reinterpret(UInt32, bytes)
     # check for byte-order mark (BOM):
     if data[1] == 0x0000feff # native byte order
@@ -194,7 +194,7 @@ function map(f, s::UTF32String)
     @inbounds for i = 1:(length(d)-1)
         c2 = f(Char(d[i]))
         if !isa(c2, Char)
-            throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
+            throw(UnicodeError(ERR_MAP_CHAR, 0, 0))
         end
         out[i] = (c2::Char)
     end
diff --git a/base/unicode/utf8.jl b/base/unicode/utf8.jl
index 5f278c0e18b4b..4f8d89b7e1d20 100644
--- a/base/unicode/utf8.jl
+++ b/base/unicode/utf8.jl
@@ -62,7 +62,7 @@ function next(s::UTF8String, i::Int)
     d = s.data
     b = d[i]
     if is_valid_continuation(b)
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
+        throw(UnicodeError(ERR_INVALID_INDEX, i, d[i]))
     end
     trailing = utf8_trailing[b+1]
     if length(d) < i + trailing
@@ -114,7 +114,7 @@ function getindex(s::UTF8String, r::UnitRange{Int})
         throw(BoundsError(s, i))
     end
     if is_valid_continuation(d[i])
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
+        throw(UnicodeError(ERR_INVALID_INDEX, i, d[i]))
     end
     if j > length(d)
         throw(BoundsError())
@@ -130,7 +130,7 @@ function search(s::UTF8String, c::Char, i::Integer)
     end
     d = s.data
     if is_valid_continuation(d[i])
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
+        throw(UnicodeError(ERR_INVALID_INDEX, i, d[i]))
     end
     c < Char(0x80) && return search(d, c%UInt8, i)
     while true
@@ -203,16 +203,16 @@ function reverse(s::UTF8String)
         ch = dat[pos]
         if ch > 0xdf
             if ch < 0xf0
-                (out -= 3) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                (out -= 3) < 0 && throw(UnicodeError(ERR_SHORT, pos, ch))
                 buf[out + 1], buf[out + 2], buf[out + 3] = ch, dat[pos + 1], dat[pos + 2]
                 pos += 3
             else
-                (out -= 4) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                (out -= 4) < 0 && throw(UnicodeError(ERR_SHORT, pos, ch))
                 buf[out+1], buf[out+2], buf[out+3], buf[out+4] = ch, dat[pos+1], dat[pos+2], dat[pos+3]
                 pos += 4
             end
         elseif ch > 0x7f
-            (out -= 2) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+            (out -= 2) < 0 && throw(UnicodeError(ERR_SHORT, pos, ch))
             buf[out + 1], buf[out + 2] = ch, dat[pos + 1]
             pos += 2
         else
diff --git a/base/unicode/utf8proc.jl b/base/unicode/utf8proc.jl
index 4b8ee196cb9fb..7968d27a8c276 100644
--- a/base/unicode/utf8proc.jl
+++ b/base/unicode/utf8proc.jl
@@ -1,70 +1,22 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
 # Various Unicode functionality from the utf8proc library
-module UTF8proc
-
-import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert, isvalid, lowercase, uppercase
-
-export isgraphemebreak
-
-# also exported by Base:
-export normalize_string, graphemes, is_assigned_char, charwidth, isvalid,
-   islower, isupper, isalpha, isdigit, isnumber, isalnum,
-   iscntrl, ispunct, isspace, isprint, isgraph, isblank
-
-# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
-isvalid(::Type{Char}, ch::Unsigned) = !((ch - 0xd800 < 0x800) | (ch > 0x10ffff))
-isvalid(::Type{Char}, ch::Integer) = isvalid(Char, Unsigned(ch))
-isvalid(::Type{Char}, ch::Char) = isvalid(Char, UInt32(ch))
-
-isvalid(ch::Char) = isvalid(Char, ch)
-
-# utf8 category constants
-const UTF8PROC_CATEGORY_CN = 0
-const UTF8PROC_CATEGORY_LU = 1
-const UTF8PROC_CATEGORY_LL = 2
-const UTF8PROC_CATEGORY_LT = 3
-const UTF8PROC_CATEGORY_LM = 4
-const UTF8PROC_CATEGORY_LO = 5
-const UTF8PROC_CATEGORY_MN = 6
-const UTF8PROC_CATEGORY_MC = 7
-const UTF8PROC_CATEGORY_ME = 8
-const UTF8PROC_CATEGORY_ND = 9
-const UTF8PROC_CATEGORY_NL = 10
-const UTF8PROC_CATEGORY_NO = 11
-const UTF8PROC_CATEGORY_PC = 12
-const UTF8PROC_CATEGORY_PD = 13
-const UTF8PROC_CATEGORY_PS = 14
-const UTF8PROC_CATEGORY_PE = 15
-const UTF8PROC_CATEGORY_PI = 16
-const UTF8PROC_CATEGORY_PF = 17
-const UTF8PROC_CATEGORY_PO = 18
-const UTF8PROC_CATEGORY_SM = 19
-const UTF8PROC_CATEGORY_SC = 20
-const UTF8PROC_CATEGORY_SK = 21
-const UTF8PROC_CATEGORY_SO = 22
-const UTF8PROC_CATEGORY_ZS = 23
-const UTF8PROC_CATEGORY_ZL = 24
-const UTF8PROC_CATEGORY_ZP = 25
-const UTF8PROC_CATEGORY_CC = 26
-const UTF8PROC_CATEGORY_CF = 27
-const UTF8PROC_CATEGORY_CS = 28
-const UTF8PROC_CATEGORY_CO = 29
-
-const UTF8PROC_STABLE    = (1<<1)
-const UTF8PROC_COMPAT    = (1<<2)
-const UTF8PROC_COMPOSE   = (1<<3)
-const UTF8PROC_DECOMPOSE = (1<<4)
-const UTF8PROC_IGNORE    = (1<<5)
-const UTF8PROC_REJECTNA  = (1<<6)
-const UTF8PROC_NLF2LS    = (1<<7)
-const UTF8PROC_NLF2PS    = (1<<8)
-const UTF8PROC_NLF2LF    = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
-const UTF8PROC_STRIPCC   = (1<<9)
-const UTF8PROC_CASEFOLD  = (1<<10)
-const UTF8PROC_CHARBOUND = (1<<11)
-const UTF8PROC_LUMP      = (1<<12)
-const UTF8PROC_STRIPMARK = (1<<13)
+
+const STABLE    = (1<<1)
+const COMPAT    = (1<<2)
+const COMPOSE   = (1<<3)
+const DECOMPOSE = (1<<4)
+const IGNORE    = (1<<5)
+const REJECTNA  = (1<<6)
+const NLF2LS    = (1<<7)
+const NLF2PS    = (1<<8)
+const STRIPCC   = (1<<9)
+const CASEFOLD  = (1<<10)
+const CHARBOUND = (1<<11)
+const LUMP      = (1<<12)
+const STRIPMARK = (1<<13)
+
+const NLF2LF = (NLF2LS | NLF2PS)
 
 ############################################################################
 
@@ -80,106 +32,61 @@ end
 
 utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(bytestring(s), flags)
 
-function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false)
+function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=false,
+                          compose::Bool=true, decompose::Bool=false, stripignore::Bool=false,
+                          rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false,
+                          newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false,
+                          lump::Bool=false, stripmark::Bool=false)
     flags = 0
-    stable && (flags = flags | UTF8PROC_STABLE)
-    compat && (flags = flags | UTF8PROC_COMPAT)
+    stable && (flags = flags | STABLE)
+    compat && (flags = flags | COMPAT)
     if decompose
-        flags = flags | UTF8PROC_DECOMPOSE
+        flags = flags | DECOMPOSE
     elseif compose
-        flags = flags | UTF8PROC_COMPOSE
+        flags = flags | COMPOSE
     elseif compat || stripmark
         throw(ArgumentError("compat=true or stripmark=true require compose=true or decompose=true"))
     end
-    stripignore && (flags = flags | UTF8PROC_IGNORE)
-    rejectna && (flags = flags | UTF8PROC_REJECTNA)
-    newline2ls + newline2ps + newline2lf > 1 && throw(ArgumentError("only one newline conversion may be specified"))
-    newline2ls && (flags = flags | UTF8PROC_NLF2LS)
-    newline2ps && (flags = flags | UTF8PROC_NLF2PS)
-    newline2lf && (flags = flags | UTF8PROC_NLF2LF)
-    stripcc && (flags = flags | UTF8PROC_STRIPCC)
-    casefold && (flags = flags | UTF8PROC_CASEFOLD)
-    lump && (flags = flags | UTF8PROC_LUMP)
-    stripmark && (flags = flags | UTF8PROC_STRIPMARK)
+    stripignore && (flags = flags | IGNORE)
+    rejectna    && (flags = flags | REJECTNA)
+    newline2ls + newline2ps + newline2lf > 1 &&
+        throw(ArgumentError("only one newline conversion may be specified"))
+    newline2ls  && (flags = flags | NLF2LS)
+    newline2ps  && (flags = flags | NLF2PS)
+    newline2lf  && (flags = flags | NLF2LF)
+    stripcc     && (flags = flags | STRIPCC)
+    casefold    && (flags = flags | CASEFOLD)
+    lump        && (flags = flags | LUMP)
+    stripmark   && (flags = flags | STRIPMARK)
     utf8proc_map(s, flags)
 end
 
-function normalize_string(s::AbstractString, nf::Symbol)
-    utf8proc_map(s, nf == :NFC ? (UTF8PROC_STABLE | UTF8PROC_COMPOSE) :
-                    nf == :NFD ? (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE) :
-                    nf == :NFKC ? (UTF8PROC_STABLE | UTF8PROC_COMPOSE
-                                   | UTF8PROC_COMPAT) :
-                    nf == :NFKD ? (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
-                                   | UTF8PROC_COMPAT) :
+normalize_string(s::AbstractString, nf::Symbol) =
+    utf8proc_map(s, nf == :NFC ? (STABLE | COMPOSE) :
+                    nf == :NFD ? (STABLE | DECOMPOSE) :
+                    nf == :NFKC ? (STABLE | COMPOSE | COMPAT) :
+                    nf == :NFKD ? (STABLE | DECOMPOSE | COMPAT) :
                     throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD")))
-end
 
 ############################################################################
 
 charwidth(c::Char) = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
 
-lowercase(c::Char) = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) : Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))
-uppercase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))
+lowercase(c::Char) = (isascii(c)
+                      ? ('A' <= c <= 'Z' ? c + 0x20 : c)
+                      : Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c)))
+uppercase(c::Char) = (isascii(c)
+                      ? ('a' <= c <= 'z' ? c - 0x20 : c)
+                      : Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c)))
 
 ############################################################################
 
-# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
-function category_code(c)
-    return ccall(:utf8proc_category, Cint, (UInt32,), c)
-end
-
-is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN
-
-## libc character class predicates ##
-
-islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL)
-
-# true for Unicode upper and mixed case
-function isupper(c::Char)
-    ccode = category_code(c)
-    return ccode == UTF8PROC_CATEGORY_LU || ccode == UTF8PROC_CATEGORY_LT
-end
-
-isdigit(c::Char)  = ('0' <= c <= '9')
-isalpha(c::Char)  = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO)
-isnumber(c::Char) = (UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO)
-
-function isalnum(c::Char)
-    ccode = category_code(c)
-    return (UTF8PROC_CATEGORY_LU <= ccode <= UTF8PROC_CATEGORY_LO) ||
-           (UTF8PROC_CATEGORY_ND <= ccode <= UTF8PROC_CATEGORY_NO)
-end
-
-# following C++ only control characters from the Latin-1 subset return true
-iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f))
-
-ispunct(c::Char) = (UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO)
-
-# \u85 is the Unicode Next Line (NEL) character
-# the check for \ufffd allows for branch removal on ASCIIStrings
-@inline isspace(c::Char) = c == ' ' || '\t' <= c <='\r' || c == '\u85' || '\ua0' <= c && c != '\ufffd' && category_code(c)==UTF8PROC_CATEGORY_ZS
-
-isprint(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS)
-
-# true in principal if a printer would use ink
-isgraph(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO)
-
-for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph",
-            "lower", "print", "punct", "space", "upper")
-    f = symbol("is",name)
-    @eval begin
-        function $f(s::AbstractString)
-            for c in s
-                if !$f(c)
-                    return false
-                end
-            end
-            return true
-        end
-    end
-end
+# returns CharCategoryCode (enum values 0:29) giving Unicode category
+charprop(::Type{CharCategoryCode}, c) =
+    CharCategoryCode(ccall(:utf8proc_category, Cint, (UInt32,), c))
 
 ############################################################################
+
 # iterators for grapheme segmentation
 
 isgraphemebreak(c1::Char, c2::Char) =
@@ -190,7 +97,7 @@ immutable GraphemeIterator{S<:AbstractString}
 end
 graphemes(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
 
-eltype{S}(::Type{GraphemeIterator{S}}) = SubString{S}
+Base.eltype{S}(::Type{GraphemeIterator{S}}) = SubString{S}
 
 function length(g::GraphemeIterator)
     c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
@@ -202,8 +109,8 @@ function length(g::GraphemeIterator)
     return n
 end
 
-start(g::GraphemeIterator) = start(g.s)
-done(g::GraphemeIterator, i) = done(g.s, i)
+Base.start(g::GraphemeIterator) = start(g.s)
+Base.done(g::GraphemeIterator, i) = done(g.s, i)
 
 function next(g::GraphemeIterator, i)
     s = g.s
@@ -220,13 +127,12 @@ function next(g::GraphemeIterator, i)
 end
 
 ==(g1::GraphemeIterator, g2::GraphemeIterator) = g1.s == g2.s
-hash(g::GraphemeIterator, h::UInt) = hash(g.s, h)
+Base.hash(g::GraphemeIterator, h::UInt) = hash(g.s, h)
 isless(g1::GraphemeIterator, g2::GraphemeIterator) = isless(g1.s, g2.s)
 
 convert{S<:AbstractString}(::Type{S}, g::GraphemeIterator) = convert(S, g.s)
 
-show{S}(io::IO, g::GraphemeIterator{S}) = print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
+Base.show{S}(io::IO, g::GraphemeIterator{S}) =
+    print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
 
 ############################################################################
-
-end # module
diff --git a/test/unicode.jl b/test/unicode.jl
index 21f3dd7d48fb4..3f0004c7ac737 100644
--- a/test/unicode.jl
+++ b/test/unicode.jl
@@ -1,9 +1,10 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
-
 include("unicode/UnicodeError.jl")
 include("unicode/types.jl")
 include("unicode/checkstring.jl")
 include("unicode/utf8.jl")
 include("unicode/utf16.jl")
 include("unicode/utf32.jl")
+include("unicode/properties.jl")
 include("unicode/utf8proc.jl")
+
diff --git a/test/unicode/UnicodeError.jl b/test/unicode/UnicodeError.jl
index 0f78ab85bb94d..272e3bffd6878 100644
--- a/test/unicode/UnicodeError.jl
+++ b/test/unicode/UnicodeError.jl
@@ -1,7 +1,7 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
 let io = IOBuffer()
-    show(io, UnicodeError(Base.UTF_ERR_SHORT, 1, 10))
+    show(io, UnicodeError(Unicode.ERR_SHORT, 1, 10))
     check = "UnicodeError: invalid UTF-8 sequence starting at index 1 (0xa) missing one or more continuation bytes)"
     @test takebuf_string(io) == check
 end
diff --git a/test/unicode/properties.jl b/test/unicode/properties.jl
new file mode 100644
index 0000000000000..735eed3d04fbe
--- /dev/null
+++ b/test/unicode/properties.jl
@@ -0,0 +1,164 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+#issue #5939  utf8proc/libmojibake character predicates
+let
+    alower=['a', 'd', 'j', 'y', 'z']
+    ulower=['α', 'β', 'γ', 'δ', 'ф', 'я']
+    for c in vcat(alower,ulower)
+        @test islower(c) == true
+        @test isupper(c) == false
+        @test isdigit(c) == false
+        @test isnumber(c) == false
+    end
+
+    aupper=['A', 'D', 'J', 'Y', 'Z']
+    uupper= ['Δ', 'Γ', 'Π', 'Ψ', 'ǅ', 'Ж', 'Д']
+
+    for c in vcat(aupper,uupper)
+        @test islower(c) == false
+        @test isupper(c) == true
+        @test isdigit(c) == false
+        @test isnumber(c) == false
+    end
+
+    nocase=['א','ﺵ']
+    alphas=vcat(alower,ulower,aupper,uupper,nocase)
+
+    for c in alphas
+         @test isalpha(c) == true
+         @test isnumber(c) == false
+    end
+
+
+    anumber=['0', '1', '5', '9']
+    unumber=['٣', '٥', '٨', '¹', 'ⅳ' ]
+
+    for c in anumber
+         @test isdigit(c) == true
+         @test isnumber(c) == true
+    end
+    for c in unumber
+         @test isdigit(c) == false
+         @test isnumber(c) == true
+    end
+
+    alnums=vcat(alphas,anumber,unumber)
+    for c in alnums
+         @test isalnum(c) == true
+         @test ispunct(c) == false
+    end
+
+    asymbol = ['(',')', '~', '$' ]
+    usymbol = ['∪', '∩', '⊂', '⊃', '√', '€', '¥', '↰', '△', '§']
+
+    apunct =['.',',',';',':','&']
+    upunct =['‡', '؟', '჻' ]
+
+    for c in vcat(apunct,upunct)
+         @test ispunct(c) == true
+         @test isalnum(c) == false
+    end
+
+    for c in vcat(alnums,asymbol,usymbol,apunct,upunct)
+        @test isprint(c) == true
+        @test isgraph(c) == true
+        @test isspace(c) == false
+        @test iscntrl(c) == false
+    end
+
+    NBSP = Char(0x0000A0)
+    ENSPACE = Char(0x002002)
+    EMSPACE = Char(0x002003)
+    THINSPACE = Char(0x002009)
+    ZWSPACE = Char(0x002060)
+
+    uspace = [ENSPACE, EMSPACE, THINSPACE]
+    aspace = [' ']
+    acntrl_space = ['\t', '\n', '\v', '\f', '\r']
+    for c in vcat(aspace,uspace)
+        @test isspace(c) == true
+        @test isprint(c) == true
+        @test isgraph(c) == false
+    end
+
+    for c in vcat(acntrl_space)
+        @test isspace(c) == true
+        @test isprint(c) == false
+        @test isgraph(c) == false
+    end
+
+    @test isspace(ZWSPACE) == false # zero-width space
+
+    acontrol = [ Char(0x001c), Char(0x001d), Char(0x001e), Char(0x001f)]
+    latincontrol = [ Char(0x0080), Char(0x0085) ]
+    ucontrol = [ Char(0x200E), Char(0x202E) ]
+
+    for c in vcat(acontrol, acntrl_space, latincontrol)
+        @test iscntrl(c) == true
+        @test isalnum(c) == false
+        @test isprint(c) == false
+        @test isgraph(c) == false
+    end
+
+    for c in ucontrol  #non-latin1 controls
+        if c!=Char(0x0085)
+            @test iscntrl(c) == false
+            @test isspace(c) == false
+            @test isalnum(c) == false
+            @test isprint(c) == false
+            @test isgraph(c) == false
+        end
+    end
+
+end
+
+@test isspace("  \t   \n   \r  ")==true
+@test isgraph("  \t   \n   \r  ")==false
+@test isprint("  \t   \n   \r  ")==false
+@test isalpha("  \t   \n   \r  ")==false
+@test isnumber("  \t   \n   \r  ")==false
+@test ispunct("  \t   \n   \r  ")==false
+
+@test isspace("ΣβΣβ")==false
+@test isalpha("ΣβΣβ")==true
+@test isgraph("ΣβΣβ")==true
+@test isprint("ΣβΣβ")==true
+@test isupper("ΣβΣβ")==false
+@test islower("ΣβΣβ")==false
+@test isnumber("ΣβΣβ")==false
+@test iscntrl("ΣβΣβ")==false
+@test ispunct("ΣβΣβ")==false
+
+@test isnumber("23435")==true
+@test isdigit("23435")==true
+@test isalnum("23435")==true
+@test isalpha("23435")==false
+@test iscntrl( string(Char(0x0080))) == true
+@test ispunct( "‡؟჻") ==true
+
+@test isxdigit('0') == true
+@test isxdigit("0") == true
+@test isxdigit("a") == true
+@test isxdigit("g") == false
+
+# check handling of CN category constants
+let c_ll = 'β', c_cn = '\u038B'
+    @test charprop(CharCategoryCode, c_ll) == Unicode.Ll
+    # check codepoint with category code CN
+    @test charprop(CharCategoryCode, c_cn) == Unicode.Cn
+end
+
+# Make sure fastplus is called for coverage
+@test lowercase('A') == 'a'
+@test uppercase('a') == 'A'
+
+@test is_assigned_char('A')
+
+# Get full coverage of isspace function
+@test isspace(' ')
+@test isspace('\t')
+@test isspace('\r')
+@test isspace('\u85')
+@test isspace('\ua0')
+@test !isspace('\ufffd')
+@test !isspace('\U10ffff')
diff --git a/test/unicode/utf8proc.jl b/test/unicode/utf8proc.jl
index 4f979c347b721..c54e1ac65b41d 100644
--- a/test/unicode/utf8proc.jl
+++ b/test/unicode/utf8proc.jl
@@ -75,154 +75,6 @@ end
 @test normalize_string("\U1e9b\U0323", :NFKD) == "s\U0323\U0307"
 @test normalize_string("\U1e9b\U0323", :NFKC) == "\U1e69"
 
-#issue #5939  uft8proc/libmojibake character predicates
-let
-    alower=['a', 'd', 'j', 'y', 'z']
-    ulower=['α', 'β', 'γ', 'δ', 'ф', 'я']
-    for c in vcat(alower,ulower)
-        @test islower(c) == true
-        @test isupper(c) == false
-        @test isdigit(c) == false
-        @test isnumber(c) == false
-    end
-
-    aupper=['A', 'D', 'J', 'Y', 'Z']
-    uupper= ['Δ', 'Γ', 'Π', 'Ψ', 'ǅ', 'Ж', 'Д']
-
-    for c in vcat(aupper,uupper)
-        @test islower(c) == false
-        @test isupper(c) == true
-        @test isdigit(c) == false
-        @test isnumber(c) == false
-    end
-
-    nocase=['א','ﺵ']
-    alphas=vcat(alower,ulower,aupper,uupper,nocase)
-
-    for c in alphas
-         @test isalpha(c) == true
-         @test isnumber(c) == false
-    end
-
-
-    anumber=['0', '1', '5', '9']
-    unumber=['٣', '٥', '٨', '¹', 'ⅳ' ]
-
-    for c in anumber
-         @test isdigit(c) == true
-         @test isnumber(c) == true
-    end
-    for c in unumber
-         @test isdigit(c) == false
-         @test isnumber(c) == true
-    end
-
-    alnums=vcat(alphas,anumber,unumber)
-    for c in alnums
-         @test isalnum(c) == true
-         @test ispunct(c) == false
-    end
-
-    asymbol = ['(',')', '~', '$' ]
-    usymbol = ['∪', '∩', '⊂', '⊃', '√', '€', '¥', '↰', '△', '§']
-
-    apunct =['.',',',';',':','&']
-    upunct =['‡', '؟', '჻' ]
-
-    for c in vcat(apunct,upunct)
-         @test ispunct(c) == true
-         @test isalnum(c) == false
-    end
-
-    for c in vcat(alnums,asymbol,usymbol,apunct,upunct)
-        @test isprint(c) == true
-        @test isgraph(c) == true
-        @test isspace(c) == false
-        @test iscntrl(c) == false
-    end
-
-    NBSP = Char(0x0000A0)
-    ENSPACE = Char(0x002002)
-    EMSPACE = Char(0x002003)
-    THINSPACE = Char(0x002009)
-    ZWSPACE = Char(0x002060)
-
-    uspace = [ENSPACE, EMSPACE, THINSPACE]
-    aspace = [' ']
-    acntrl_space = ['\t', '\n', '\v', '\f', '\r']
-    for c in vcat(aspace,uspace)
-        @test isspace(c) == true
-        @test isprint(c) == true
-        @test isgraph(c) == false
-    end
-
-    for c in vcat(acntrl_space)
-        @test isspace(c) == true
-        @test isprint(c) == false
-        @test isgraph(c) == false
-    end
-
-    @test isspace(ZWSPACE) == false # zero-width space
-
-    acontrol = [ Char(0x001c), Char(0x001d), Char(0x001e), Char(0x001f)]
-    latincontrol = [ Char(0x0080), Char(0x0085) ]
-    ucontrol = [ Char(0x200E), Char(0x202E) ]
-
-    for c in vcat(acontrol, acntrl_space, latincontrol)
-        @test iscntrl(c) == true
-        @test isalnum(c) == false
-        @test isprint(c) == false
-        @test isgraph(c) == false
-    end
-
-    for c in ucontrol  #non-latin1 controls
-        if c!=Char(0x0085)
-            @test iscntrl(c) == false
-            @test isspace(c) == false
-            @test isalnum(c) == false
-            @test isprint(c) == false
-            @test isgraph(c) == false
-        end
-    end
-
-end
-
-@test isspace("  \t   \n   \r  ")==true
-@test isgraph("  \t   \n   \r  ")==false
-@test isprint("  \t   \n   \r  ")==false
-@test isalpha("  \t   \n   \r  ")==false
-@test isnumber("  \t   \n   \r  ")==false
-@test ispunct("  \t   \n   \r  ")==false
-
-@test isspace("ΣβΣβ")==false
-@test isalpha("ΣβΣβ")==true
-@test isgraph("ΣβΣβ")==true
-@test isprint("ΣβΣβ")==true
-@test isupper("ΣβΣβ")==false
-@test islower("ΣβΣβ")==false
-@test isnumber("ΣβΣβ")==false
-@test iscntrl("ΣβΣβ")==false
-@test ispunct("ΣβΣβ")==false
-
-@test isnumber("23435")==true
-@test isdigit("23435")==true
-@test isalnum("23435")==true
-@test isalpha("23435")==false
-@test iscntrl( string(Char(0x0080))) == true
-@test ispunct( "‡؟჻") ==true
-
-@test isxdigit('0') == true
-@test isxdigit("0") == true
-@test isxdigit("a") == true
-@test isxdigit("g") == false
-
-# check utf8proc handling of CN category constants
-let c_ll = 'β', c_cn = '\u038B'
-    @test Base.UTF8proc.category_code(c_ll) == Base.UTF8proc.UTF8PROC_CATEGORY_LL
-    # check codepoint with category code CN
-    @test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN
-end
-
 # graphemes
 let grphtest = (("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h",
                                               "β","l","a","h",
@@ -269,21 +121,6 @@ end
 @test_throws ArgumentError normalize_string("\u006e\u0303", compose=false, compat=true)
 @test_throws ArgumentError normalize_string("\u006e\u0303", compose=false, stripmark=true)
 
-# Make sure fastplus is called for coverage
-@test lowercase('A') == 'a'
-@test uppercase('a') == 'A'
-
-@test is_assigned_char('A')
-
-# Get full coverage of isspace function
-@test isspace(' ')
-@test isspace('\t')
-@test isspace('\r')
-@test isspace('\u85')
-@test isspace('\ua0')
-@test !isspace('\ufffd')
-@test !isspace('\U10ffff')
-
 # Get full coverage of grapheme iterator functions
 let str = ascii("This is a test")
     g = graphemes(str)

From 2c53e526d8d0582ec77cd70c92ff72aa17c982a5 Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Sun, 13 Dec 2015 17:19:53 -0500
Subject: [PATCH 2/7] Update to use submodule

---
 base/exports.jl            |   5 +-
 base/unicode.jl            |   4 +-
 base/unicode/properties.jl | 156 +++++++++++++++++++------------------
 base/unicode/utf8proc.jl   |   5 +-
 test/unicode/properties.jl |   4 +-
 5 files changed, 89 insertions(+), 85 deletions(-)

diff --git a/base/exports.jl b/base/exports.jl
index 00d714ceb74be..803a41125ff33 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -22,6 +22,7 @@ export
     Serializer,
     Docs,
     Markdown,
+    Cat,
     Unicode,
 
 # Types
@@ -41,8 +42,8 @@ export
     CartesianIndex,
     CartesianRange,
     Channel,
-    CharCategory,
-    CharCategoryCode,
+    CharCode,
+    CharType,
     Cmd,
     Colon,
     Complex,
diff --git a/base/unicode.jl b/base/unicode.jl
index a0f6096e75289..fc94993410638 100644
--- a/base/unicode.jl
+++ b/base/unicode.jl
@@ -6,9 +6,7 @@ import Base: string, convert, write, length, endof, next, reverseind, lastidx, r
 	     lowercase, uppercase, eltype, isless, promote_rule, ==
 
 export UnicodeError, UTF16String, UTF32String, unsafe_checkstring, checkstring,
-       utf8, utf16, utf32, containsnul, WString, wstring,
-       charprop, CharCategoryCode, UnicodeProperty, CharCategory, CatLetter, CatMark, CatNumber,
-       CatPunctuation, CatSymbol, CatSeparator, CatOther, CatUpper,
+       utf8, utf16, utf32, containsnul, WString, wstring, charprop, Cat, CharType, CharCode,
        is_assigned_char, islower, isupper, isdigit, isalpha, isnumber, isalnum, iscntrl,
        ispunct, isspace, isprint, isgraph,
        isgraphemebreak, GraphemeIterator, normalize_string, graphemes, charwidth
diff --git a/base/unicode/properties.jl b/base/unicode/properties.jl
index e2cf4e99b6a00..82db6cafc6861 100644
--- a/base/unicode/properties.jl
+++ b/base/unicode/properties.jl
@@ -12,71 +12,78 @@ isvalid(ch::Char) = isvalid(Char, ch)
 
 # Unicode General Category constants
 
+module Cat
+export Property, CharType, CharCode
+
 """Unicode character properties"""
-abstract UnicodeProperty
-"""Unicode character categories"""
-abstract CharCategory   <: UnicodeProperty
-
-"""Unicode letter character category"""
-abstract CatLetter      <: CharCategory
-"""Unicode Mark character category"""
-abstract CatMark        <: CharCategory
-"""Unicode Numeric character category"""
-abstract CatNumber      <: CharCategory
-"""Unicode Punctuation character category"""
-abstract CatPunctuation <: CharCategory
-"""Unicode Symbol character category"""
-abstract CatSymbol      <: CharCategory
-"""Unicode Separator character category"""
-abstract CatSeparator   <: CharCategory
-"""Unicode Other character category"""
-abstract CatOther       <: CharCategory
+abstract Property
+
+"""Unicode character category type"""
+abstract CharType    <: Property
+
+"""Unicode 'Letter' character category"""
+abstract Letter      <: CharType
+"""Unicode 'Mark' character category"""
+abstract Mark        <: CharType
+"""Unicode 'Number' character category"""
+abstract Number      <: CharType
+"""Unicode 'Punctuation' character category"""
+abstract Punctuation <: CharType
+"""Unicode 'Symbol' character category"""
+abstract Symbol      <: CharType
+"""Unicode 'Separator' character category"""
+abstract Separator   <: CharType
+"""Unicode 'Other' character category"""
+abstract Other       <: CharType
 
 """Unicode uppercase & titlecase letters"""
-abstract CatUpper       <: CatLetter
+abstract Upper       <: Letter
+
+"""Unicode character category code (0-29)"""
+bitstype 8 CharCode
 
-"""Unicode Character Category Code (0-29)"""
-bitstype 8 CharCategoryCode
+end # module Cat
+import .Cat: Property, CharType, CharCode
 
-convert(::Type{CharCategoryCode}, x::Integer) = reinterpret(CharCategoryCode, x%UInt8)
-convert{T<:Integer}(::Type{T}, x::CharCategoryCode) = convert(T, reinterpret(UInt8, x))
-promote_rule{T<:Integer}(::Type{T}, ::Type{CharCategoryCode}) = T
-isless(x::CharCategoryCode, y::CharCategoryCode) = isless(UInt32(x), UInt32(y))
-isless(x::CharCategoryCode, y::Integer) = isless(UInt32(x), y)
-isless(x::Integer, y::CharCategoryCode) = isless(x, UInt32(y))
+convert(::Type{CharCode}, x::Integer) = reinterpret(CharCode, x%UInt8)
+convert{T<:Integer}(::Type{T}, x::CharCode) = convert(T, reinterpret(UInt8, x))
+promote_rule{T<:Integer}(::Type{T}, ::Type{CharCode}) = T
+isless(x::CharCode, y::CharCode) = isless(UInt8(x), UInt8(y))
+isless(x::CharCode, y::Integer)  = isless(UInt8(x), y)
+isless(x::Integer, y::CharCode)  = isless(x, UInt8(y))
 
 for (nam, val, cat, typ, des) in
-    ((:Cn, 0,  :NotAssignedChar,    CatOther,       "Other, Not assigned"),
-     (:Lu, 1,  :UpperCase,          CatUpper,       "Letter, uppercase"),
-     (:Ll, 2,  :LowerCase,          CatLetter,      "Letter, lowercase"),
-     (:Lt, 3,  :TitleCase,          CatUpper,       "Letter, titlecase"),
-     (:Lm, 4,  :ModifierLetter,     CatLetter,      "Letter, modifier"),
-     (:Lo, 5,  :OtherLetter,        CatLetter,      "Letter, other"),
-     (:Mn, 6,  :NonSpacingMark,     CatMark,        "Mark, nonspacing"),
-     (:Mc, 7,  :CombiningMark,      CatMark,        "Mark, spacing combining"),
-     (:Me, 8,  :EnclosingMark,      CatMark,        "Mark, enclosing"),
-     (:Nd, 9,  :DecimalDigit,       CatNumber,      "Number, decimal digit"),
-     (:Nl, 10, :NumericLetter,      CatNumber,      "Number, letter"),
-     (:No, 11, :OtherNumber,        CatNumber,      "Number, other"),
-     (:Pc, 12, :ConnectorPunct,     CatPunctuation, "Punctuation, connector"),
-     (:Pd, 13, :DashPunct,          CatPunctuation, "Punctuation, dash"),
-     (:Ps, 14, :OpenPunct,          CatPunctuation, "Punctuation, open"),
-     (:Pe, 15, :ClosePunct,         CatPunctuation, "Punctuation, close"),
-     (:Pi, 16, :BegQuotePunct,      CatPunctuation, "Punctuation, initial quote"),
-     (:Pf, 17, :EndQuotePunct,      CatPunctuation, "Punctuation, final quote"),
-     (:Po, 18, :OtherPunct,         CatPunctuation, "Punctuation, other"),
-     (:Sm, 19, :MathSymbol,         CatSymbol,      "Symbol, math"),
-     (:Sc, 20, :CurrencySymbol,     CatSymbol,      "Symbol, currency"),
-     (:Sk, 21, :ModifierSymbol,     CatSymbol,      "Symbol, modifier"),
-     (:So, 22, :OtherSymbol,        CatSymbol,      "Symbol, other"),
-     (:Zs, 23, :SpaceSeparator,     CatSeparator,   "Separator, space"),
-     (:Zl, 24, :LineSeparator,      CatSeparator,   "Separator, line"),
-     (:Zp, 25, :ParagraphSeparator, CatSeparator,   "Separator, paragraph"),
-     (:Cc, 26, :ControlChar,        CatOther,       "Other, control"),
-     (:Cf, 27, :FormatChar,         CatOther,       "Other, format"),
-     (:Cs, 28, :SurrogateChar,      CatOther,       "Other, surrogate"),
-     (:Co, 29, :PrivateUseChar,     CatOther,       "Other, private use"))
-    @eval const global $nam = CharCategoryCode($val)
+    ((:Cn, 0,  :NotAssignedChar,    Cat.Other,       "Other, Not assigned"),
+     (:Lu, 1,  :UpperCase,          Cat.Upper,       "Letter, uppercase"),
+     (:Ll, 2,  :LowerCase,          Cat.Letter,      "Letter, lowercase"),
+     (:Lt, 3,  :TitleCase,          Cat.Upper,       "Letter, titlecase"),
+     (:Lm, 4,  :ModifierLetter,     Cat.Letter,      "Letter, modifier"),
+     (:Lo, 5,  :OtherLetter,        Cat.Letter,      "Letter, other"),
+     (:Mn, 6,  :NonSpacingMark,     Cat.Mark,        "Mark, nonspacing"),
+     (:Mc, 7,  :CombiningMark,      Cat.Mark,        "Mark, spacing combining"),
+     (:Me, 8,  :EnclosingMark,      Cat.Mark,        "Mark, enclosing"),
+     (:Nd, 9,  :DecimalDigit,       Cat.Number,      "Number, decimal digit"),
+     (:Nl, 10, :NumericLetter,      Cat.Number,      "Number, letter"),
+     (:No, 11, :OtherNumber,        Cat.Number,      "Number, other"),
+     (:Pc, 12, :ConnectorPunct,     Cat.Punctuation, "Punctuation, connector"),
+     (:Pd, 13, :DashPunct,          Cat.Punctuation, "Punctuation, dash"),
+     (:Ps, 14, :OpenPunct,          Cat.Punctuation, "Punctuation, open"),
+     (:Pe, 15, :ClosePunct,         Cat.Punctuation, "Punctuation, close"),
+     (:Pi, 16, :BegQuotePunct,      Cat.Punctuation, "Punctuation, initial quote"),
+     (:Pf, 17, :EndQuotePunct,      Cat.Punctuation, "Punctuation, final quote"),
+     (:Po, 18, :OtherPunct,         Cat.Punctuation, "Punctuation, other"),
+     (:Sm, 19, :MathSymbol,         Cat.Symbol,      "Symbol, math"),
+     (:Sc, 20, :CurrencySymbol,     Cat.Symbol,      "Symbol, currency"),
+     (:Sk, 21, :ModifierSymbol,     Cat.Symbol,      "Symbol, modifier"),
+     (:So, 22, :OtherSymbol,        Cat.Symbol,      "Symbol, other"),
+     (:Zs, 23, :SpaceSeparator,     Cat.Separator,   "Separator, space"),
+     (:Zl, 24, :LineSeparator,      Cat.Separator,   "Separator, line"),
+     (:Zp, 25, :ParagraphSeparator, Cat.Separator,   "Separator, paragraph"),
+     (:Cc, 26, :ControlChar,        Cat.Other,       "Other, control"),
+     (:Cf, 27, :FormatChar,         Cat.Other,       "Other, format"),
+     (:Cs, 28, :SurrogateChar,      Cat.Other,       "Other, surrogate"),
+     (:Co, 29, :PrivateUseChar,     Cat.Other,       "Other, private use"))
+    @eval const global $nam = CharCode($val)
     @eval export $cat
     @eval abstract $cat <: $typ
     @eval @doc $(string("Unicode Category Code: ",des)) $nam
@@ -94,54 +101,53 @@ const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, O
 
 ############################################################################
 
-
 """
 Return various Unicode properties for character
 """
 function charprop end
 
-charprop(::Type{CharCategory}, c) = c2t[Int(charprop(CharCategoryCode, c))+1]
+charprop(::Type{CharType}, c) = c2t[Int(charprop(CharCode, c))+1]
 
-is_assigned_char(c) = charprop(CharCategoryCode, c) != Cn
+is_assigned_char(c) = charprop(CharCode, c) != Cn
 
 ## libc character class predicates ##
 
-islower(c::Char) = charprop(CharCategoryCode, c) == Ll
+islower(c::Char) = charprop(CharCode, c) == Ll
 
 # true for Unicode upper and mixed case
-isupper(c::Char) = (ccode = charprop(CharCategoryCode, c)) == Lu || ccode == Lt
+isupper(c::Char) = (ccode = charprop(CharCode, c)) == Lu || ccode == Lt
 
 isdigit(c::Char)  = ('0' <= c <= '9')
-isalpha(c::Char)  = (Lu <= charprop(CharCategoryCode, c) <= Lo)
-isnumber(c::Char) = (Nd <= charprop(CharCategoryCode, c) <= No)
-isalnum(c::Char)  = (Lu <= (ccode = charprop(CharCategoryCode, c)) <= Lo) || (Nd <= ccode <= No)
+isalpha(c::Char)  = (Lu <= charprop(CharCode, c) <= Lo)
+isnumber(c::Char) = (Nd <= charprop(CharCode, c) <= No)
+isalnum(c::Char)  = (Lu <= (ccode = charprop(CharCode, c)) <= Lo) || (Nd <= ccode <= No)
 
 # These are about 3 times slower, because the isa method
 # is much slower than checking if an integer is within range (or two ranges)
 # If that is sped up, then these, which are more readable, could replace the other forms.
 #=
-isalpha(c::Char)  = charprop(CharCategory, c) <: CatLetter
-isnumber(c::Char) = charprop(CharCategory, c) <: CatNumber
-isupper(c::Char)  = charprop(CharCategory, c) <: CatUpper
-isalnum(c::Char)  = charprop(CharCategory, c) <: Union{CatLetter, CatNumber}
-ispunct(c::Char)  = charprop(CharCategory, c) <: CatPunctuation
+isalpha(c::Char)  = charprop(CharType, c) <: CatLetter
+isnumber(c::Char) = charprop(CharType, c) <: CatNumber
+isupper(c::Char)  = charprop(CharType, c) <: CatUpper
+isalnum(c::Char)  = charprop(CharType, c) <: Union{CatLetter, CatNumber}
+ispunct(c::Char)  = charprop(CharType, c) <: CatPunctuation
 =#
 
 # following C++ only control characters from the Latin-1 subset return true
 iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f))
 
-ispunct(c::Char) = (Pc <= charprop(CharCategoryCode, c) <= Po)
+ispunct(c::Char) = (Pc <= charprop(CharCode, c) <= Po)
 
 # \u85 is the Unicode Next Line (NEL) character
 # the check for \ufffd allows for branch removal on ASCIIStrings
 @inline isspace(c::Char) =
     (c == ' ' || '\t' <= c <='\r' || c == '\u85' ||
-     ('\ua0' <= c && c != '\ufffd' && charprop(CharCategoryCode, c) == Zs))
+     ('\ua0' <= c && c != '\ufffd' && charprop(CharCode, c) == Zs))
 
-isprint(c::Char) = (Lu <= charprop(CharCategoryCode, c) <= Zs)
+isprint(c::Char) = (Lu <= charprop(CharCode, c) <= Zs)
 
 # true in principle if a printer would use ink
-isgraph(c::Char) = (Lu <= charprop(CharCategoryCode, c) <= So)
+isgraph(c::Char) = (Lu <= charprop(CharCode, c) <= So)
 
 for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph",
             "lower", "print", "punct", "space", "upper")
diff --git a/base/unicode/utf8proc.jl b/base/unicode/utf8proc.jl
index 7968d27a8c276..b8dc1c1a888ed 100644
--- a/base/unicode/utf8proc.jl
+++ b/base/unicode/utf8proc.jl
@@ -81,9 +81,8 @@ uppercase(c::Char) = (isascii(c)
 
 ############################################################################
 
-# returns CharCategoryCode (enum values 0:29) giving Unicode category
-charprop(::Type{CharCategoryCode}, c) =
-    CharCategoryCode(ccall(:utf8proc_category, Cint, (UInt32,), c))
+# returns CharCode (values 0:29) giving Unicode category
+charprop(::Type{CharCode}, c) = CharCode(ccall(:utf8proc_category, Cint, (UInt32,), c))
 
 ############################################################################
 
diff --git a/test/unicode/properties.jl b/test/unicode/properties.jl
index 735eed3d04fbe..79a4ad3c5c809 100644
--- a/test/unicode/properties.jl
+++ b/test/unicode/properties.jl
@@ -143,9 +143,9 @@ end
 
 # check handling of CN category constants
 let c_ll = 'β', c_cn = '\u038B'
-    @test charprop(CharCategoryCode, c_ll) == Unicode.Ll
+    @test charprop(CharCode, c_ll) == Unicode.Ll
     # check codepoint with category code CN
-    @test charprop(CharCategoryCode, c_cn) == Unicode.Cn
+    @test charprop(CharCode, c_cn) == Unicode.Cn
 end
 
 # Make sure fastplus is called for coverage

From 8b23e470bcaa02b71f6bb1f7f104bdef407bf079 Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Mon, 14 Dec 2015 07:56:55 -0500
Subject: [PATCH 3/7] Clean up naming, don't want any Cat fights!

---
 base/exports.jl            |   6 +-
 base/unicode.jl            |   3 +-
 base/unicode/properties.jl | 161 +++++++++++++++++++------------------
 base/unicode/utf8proc.jl   |   4 +-
 test/unicode/properties.jl |   4 +-
 5 files changed, 91 insertions(+), 87 deletions(-)

diff --git a/base/exports.jl b/base/exports.jl
index 803a41125ff33..db08c05f81d54 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -22,7 +22,7 @@ export
     Serializer,
     Docs,
     Markdown,
-    Cat,
+    Category,
     Unicode,
 
 # Types
@@ -42,8 +42,8 @@ export
     CartesianIndex,
     CartesianRange,
     Channel,
-    CharCode,
-    CharType,
+    CategoryCode,
+    CategoryType,
     Cmd,
     Colon,
     Complex,
diff --git a/base/unicode.jl b/base/unicode.jl
index fc94993410638..2b119b8565e13 100644
--- a/base/unicode.jl
+++ b/base/unicode.jl
@@ -6,7 +6,8 @@ import Base: string, convert, write, length, endof, next, reverseind, lastidx, r
 	     lowercase, uppercase, eltype, isless, promote_rule, ==
 
 export UnicodeError, UTF16String, UTF32String, unsafe_checkstring, checkstring,
-       utf8, utf16, utf32, containsnul, WString, wstring, charprop, Cat, CharType, CharCode,
+       utf8, utf16, utf32, containsnul, WString, wstring, charprop, Category,
+       CategoryType, CategoryCode,
        is_assigned_char, islower, isupper, isdigit, isalpha, isnumber, isalnum, iscntrl,
        ispunct, isspace, isprint, isgraph,
        isgraphemebreak, GraphemeIterator, normalize_string, graphemes, charwidth
diff --git a/base/unicode/properties.jl b/base/unicode/properties.jl
index 82db6cafc6861..883055f192012 100644
--- a/base/unicode/properties.jl
+++ b/base/unicode/properties.jl
@@ -10,81 +10,85 @@ isvalid(::Type{Char}, ch::Char)     = isvalid(Char, UInt32(ch))
 
 isvalid(ch::Char) = isvalid(Char, ch)
 
-# Unicode General Category constants
-
-module Cat
-export Property, CharType, CharCode
-
 """Unicode character properties"""
 abstract Property
 
+"""
+Return various Unicode properties for character
+"""
+function charprop end
+
+# Unicode General Category constants
+
+module Category
+export CategoryType, CategoryCode
+
 """Unicode character category type"""
-abstract CharType    <: Property
+abstract CategoryType <: Unicode.Property
 
 """Unicode 'Letter' character category"""
-abstract Letter      <: CharType
+abstract Letter      <: CategoryType
 """Unicode 'Mark' character category"""
-abstract Mark        <: CharType
+abstract Mark        <: CategoryType
 """Unicode 'Number' character category"""
-abstract Number      <: CharType
+abstract Number      <: CategoryType
 """Unicode 'Punctuation' character category"""
-abstract Punctuation <: CharType
+abstract Punctuation <: CategoryType
 """Unicode 'Symbol' character category"""
-abstract Symbol      <: CharType
+abstract Symbol      <: CategoryType
 """Unicode 'Separator' character category"""
-abstract Separator   <: CharType
+abstract Separator   <: CategoryType
 """Unicode 'Other' character category"""
-abstract Other       <: CharType
+abstract Other       <: CategoryType
 
 """Unicode uppercase & titlecase letters"""
 abstract Upper       <: Letter
 
-"""Unicode character category code (0-29)"""
-bitstype 8 CharCode
+"""Unicode alphabetic and numeric"""
+typealias AlphaNumeric Union{Letter, Number}
 
-end # module Cat
-import .Cat: Property, CharType, CharCode
+"""Unicode character category code (0-29)"""
+bitstype 8 CategoryCode
 
-convert(::Type{CharCode}, x::Integer) = reinterpret(CharCode, x%UInt8)
-convert{T<:Integer}(::Type{T}, x::CharCode) = convert(T, reinterpret(UInt8, x))
-promote_rule{T<:Integer}(::Type{T}, ::Type{CharCode}) = T
-isless(x::CharCode, y::CharCode) = isless(UInt8(x), UInt8(y))
-isless(x::CharCode, y::Integer)  = isless(UInt8(x), y)
-isless(x::Integer, y::CharCode)  = isless(x, UInt8(y))
+Base.convert(::Type{CategoryCode}, x::Integer) = reinterpret(CategoryCode, x%UInt8)
+Base.convert{T<:Integer}(::Type{T}, x::CategoryCode) = convert(T, reinterpret(UInt8, x))
+Base.promote_rule{T<:Integer}(::Type{T}, ::Type{CategoryCode}) = T
+Base.isless(x::CategoryCode, y::CategoryCode) = isless(UInt8(x), UInt8(y))
+Base.isless(x::CategoryCode, y::Integer)  = isless(UInt8(x), y)
+Base.isless(x::Integer, y::CategoryCode)  = isless(x, UInt8(y))
 
 for (nam, val, cat, typ, des) in
-    ((:Cn, 0,  :NotAssignedChar,    Cat.Other,       "Other, Not assigned"),
-     (:Lu, 1,  :UpperCase,          Cat.Upper,       "Letter, uppercase"),
-     (:Ll, 2,  :LowerCase,          Cat.Letter,      "Letter, lowercase"),
-     (:Lt, 3,  :TitleCase,          Cat.Upper,       "Letter, titlecase"),
-     (:Lm, 4,  :ModifierLetter,     Cat.Letter,      "Letter, modifier"),
-     (:Lo, 5,  :OtherLetter,        Cat.Letter,      "Letter, other"),
-     (:Mn, 6,  :NonSpacingMark,     Cat.Mark,        "Mark, nonspacing"),
-     (:Mc, 7,  :CombiningMark,      Cat.Mark,        "Mark, spacing combining"),
-     (:Me, 8,  :EnclosingMark,      Cat.Mark,        "Mark, enclosing"),
-     (:Nd, 9,  :DecimalDigit,       Cat.Number,      "Number, decimal digit"),
-     (:Nl, 10, :NumericLetter,      Cat.Number,      "Number, letter"),
-     (:No, 11, :OtherNumber,        Cat.Number,      "Number, other"),
-     (:Pc, 12, :ConnectorPunct,     Cat.Punctuation, "Punctuation, connector"),
-     (:Pd, 13, :DashPunct,          Cat.Punctuation, "Punctuation, dash"),
-     (:Ps, 14, :OpenPunct,          Cat.Punctuation, "Punctuation, open"),
-     (:Pe, 15, :ClosePunct,         Cat.Punctuation, "Punctuation, close"),
-     (:Pi, 16, :BegQuotePunct,      Cat.Punctuation, "Punctuation, initial quote"),
-     (:Pf, 17, :EndQuotePunct,      Cat.Punctuation, "Punctuation, final quote"),
-     (:Po, 18, :OtherPunct,         Cat.Punctuation, "Punctuation, other"),
-     (:Sm, 19, :MathSymbol,         Cat.Symbol,      "Symbol, math"),
-     (:Sc, 20, :CurrencySymbol,     Cat.Symbol,      "Symbol, currency"),
-     (:Sk, 21, :ModifierSymbol,     Cat.Symbol,      "Symbol, modifier"),
-     (:So, 22, :OtherSymbol,        Cat.Symbol,      "Symbol, other"),
-     (:Zs, 23, :SpaceSeparator,     Cat.Separator,   "Separator, space"),
-     (:Zl, 24, :LineSeparator,      Cat.Separator,   "Separator, line"),
-     (:Zp, 25, :ParagraphSeparator, Cat.Separator,   "Separator, paragraph"),
-     (:Cc, 26, :ControlChar,        Cat.Other,       "Other, control"),
-     (:Cf, 27, :FormatChar,         Cat.Other,       "Other, format"),
-     (:Cs, 28, :SurrogateChar,      Cat.Other,       "Other, surrogate"),
-     (:Co, 29, :PrivateUseChar,     Cat.Other,       "Other, private use"))
-    @eval const global $nam = CharCode($val)
-    @eval export $cat
+    ((:Cn, 0,  :NotAssignedChar,         :Other,       "Other, Not assigned"),
+     (:Lu, 1,  :UpperCase,               :Upper,       "Letter, uppercase"),
+     (:Ll, 2,  :LowerCase,               :Letter,      "Letter, lowercase"),
+     (:Lt, 3,  :TitleCase,               :Upper,       "Letter, titlecase"),
+     (:Lm, 4,  :ModifierLetter,          :Letter,      "Letter, modifier"),
+     (:Lo, 5,  :OtherLetter,             :Letter,      "Letter, other"),
+     (:Mn, 6,  :NonSpacingMark,          :Mark,        "Mark, nonspacing"),
+     (:Mc, 7,  :CombiningMark,           :Mark,        "Mark, spacing combining"),
+     (:Me, 8,  :EnclosingMark,           :Mark,        "Mark, enclosing"),
+     (:Nd, 9,  :DecimalDigit,            :Number,      "Number, decimal digit"),
+     (:Nl, 10, :NumericLetter,           :Number,      "Number, letter"),
+     (:No, 11, :OtherNumber,             :Number,      "Number, other"),
+     (:Pc, 12, :ConnectorPunctuation,    :Punctuation, "Punctuation, connector"),
+     (:Pd, 13, :DashPunctuation,         :Punctuation, "Punctuation, dash"),
+     (:Ps, 14, :OpenPunctuation,         :Punctuation, "Punctuation, open"),
+     (:Pe, 15, :ClosePunctuation,        :Punctuation, "Punctuation, close"),
+     (:Pi, 16, :InitialQuotePunctuation, :Punctuation, "Punctuation, initial quote"),
+     (:Pf, 17, :FinalQuotePunctuation,   :Punctuation, "Punctuation, final quote"),
+     (:Po, 18, :OtherPunctuation,        :Punctuation, "Punctuation, other"),
+     (:Sm, 19, :MathSymbol,              :Symbol,      "Symbol, math"),
+     (:Sc, 20, :CurrencySymbol,          :Symbol,      "Symbol, currency"),
+     (:Sk, 21, :ModifierSymbol,          :Symbol,      "Symbol, modifier"),
+     (:So, 22, :OtherSymbol,             :Symbol,      "Symbol, other"),
+     (:Zs, 23, :SpaceSeparator,          :Separator,   "Separator, space"),
+     (:Zl, 24, :LineSeparator,           :Separator,   "Separator, line"),
+     (:Zp, 25, :ParagraphSeparator,      :Separator,   "Separator, paragraph"),
+     (:Cc, 26, :ControlChar,             :Other,       "Other, control"),
+     (:Cf, 27, :FormatChar,              :Other,       "Other, format"),
+     (:Cs, 28, :SurrogateChar,           :Other,       "Other, surrogate"),
+     (:Co, 29, :PrivateUseChar,          :Other,       "Other, private use"))
+    @eval const global $nam = CategoryCode($val)
     @eval abstract $cat <: $typ
     @eval @doc $(string("Unicode Category Code: ",des)) $nam
     @eval @doc $(string("Unicode Category Type: ",des)) $cat
@@ -93,61 +97,60 @@ end
 const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, OtherLetter,
              NonSpacingMark, CombiningMark, EnclosingMark,
              DecimalDigit, NumericLetter, OtherNumber,
-             ConnectorPunct, DashPunct, OpenPunct, ClosePunct,
-             BegQuotePunct, EndQuotePunct, OtherPunct,
+             ConnectorPunctuation, DashPunctuation, OpenPunctuation, ClosePunctuation,
+             InitialQuotePunctuation, FinalQuotePunctuation, OtherPunctuation,
              MathSymbol, CurrencySymbol, ModifierSymbol, OtherSymbol,
              SpaceSeparator, LineSeparator, ParagraphSeparator,
-	     ControlChar, FormatChar, SurrogateChar, PrivateUseChar]
+             ControlChar, FormatChar, SurrogateChar, PrivateUseChar]
 
-############################################################################
+charprop(::Type{CategoryType}, c) = c2t[Int(charprop(CategoryCode, c))+1]
 
-"""
-Return various Unicode properties for character
-"""
-function charprop end
+end # module Cat
+importall .Category
 
-charprop(::Type{CharType}, c) = c2t[Int(charprop(CharCode, c))+1]
+############################################################################
 
-is_assigned_char(c) = charprop(CharCode, c) != Cn
+is_assigned_char(c) = charprop(CategoryCode, c) != Category.Cn
 
 ## libc character class predicates ##
 
-islower(c::Char) = charprop(CharCode, c) == Ll
+islower(c::Char) = charprop(CategoryCode, c) == Category.Ll
 
 # true for Unicode upper and mixed case
-isupper(c::Char) = (ccode = charprop(CharCode, c)) == Lu || ccode == Lt
+isupper(c::Char) = (ccode = charprop(CategoryCode, c)) == Category.Lu || ccode == Category.Lt
 
 isdigit(c::Char)  = ('0' <= c <= '9')
-isalpha(c::Char)  = (Lu <= charprop(CharCode, c) <= Lo)
-isnumber(c::Char) = (Nd <= charprop(CharCode, c) <= No)
-isalnum(c::Char)  = (Lu <= (ccode = charprop(CharCode, c)) <= Lo) || (Nd <= ccode <= No)
+isalpha(c::Char)  = (Category.Lu <= charprop(CategoryCode, c) <= Category.Lo)
+isnumber(c::Char) = (Category.Nd <= charprop(CategoryCode, c) <= Category.No)
+isalnum(c::Char)  = ((Category.Lu <= (ccode = charprop(CategoryCode, c)) <= Category.Lo) ||
+                     (Category.Nd <= ccode <= Category.No))
 
 # These are about 3 times slower, because the isa method
 # is much slower than checking if an integer is within range (or two ranges)
 # If that is sped up, then these, which are more readable, could replace the other forms.
 #=
-isalpha(c::Char)  = charprop(CharType, c) <: CatLetter
-isnumber(c::Char) = charprop(CharType, c) <: CatNumber
-isupper(c::Char)  = charprop(CharType, c) <: CatUpper
-isalnum(c::Char)  = charprop(CharType, c) <: Union{CatLetter, CatNumber}
-ispunct(c::Char)  = charprop(CharType, c) <: CatPunctuation
+isalpha(c::Char)  = charprop(CategoryType, c) <: Category.Letter
+isnumber(c::Char) = charprop(CategoryType, c) <: Category.Number
+isupper(c::Char)  = charprop(CategoryType, c) <: Category.Upper
+isalnum(c::Char)  = charprop(CategoryType, c) <: Category.AlphaNumeric
+ispunct(c::Char)  = charprop(CategoryType, c) <: Category.Punctuation
 =#
 
 # following C++ only control characters from the Latin-1 subset return true
 iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f))
 
-ispunct(c::Char) = (Pc <= charprop(CharCode, c) <= Po)
+ispunct(c::Char) = (Category.Pc <= charprop(CategoryCode, c) <= Category.Po)
 
 # \u85 is the Unicode Next Line (NEL) character
 # the check for \ufffd allows for branch removal on ASCIIStrings
 @inline isspace(c::Char) =
     (c == ' ' || '\t' <= c <='\r' || c == '\u85' ||
-     ('\ua0' <= c && c != '\ufffd' && charprop(CharCode, c) == Zs))
+     ('\ua0' <= c && c != '\ufffd' && charprop(CategoryCode, c) == Category.Zs))
 
-isprint(c::Char) = (Lu <= charprop(CharCode, c) <= Zs)
+isprint(c::Char) = (Category.Lu <= charprop(CategoryCode, c) <= Category.Zs)
 
 # true in principle if a printer would use ink
-isgraph(c::Char) = (Lu <= charprop(CharCode, c) <= So)
+isgraph(c::Char) = (Category.Lu <= charprop(CategoryCode, c) <= Category.So)
 
 for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph",
             "lower", "print", "punct", "space", "upper")
diff --git a/base/unicode/utf8proc.jl b/base/unicode/utf8proc.jl
index b8dc1c1a888ed..107e61135ccfc 100644
--- a/base/unicode/utf8proc.jl
+++ b/base/unicode/utf8proc.jl
@@ -81,8 +81,8 @@ uppercase(c::Char) = (isascii(c)
 
 ############################################################################
 
-# returns CharCode (values 0:29) giving Unicode category
-charprop(::Type{CharCode}, c) = CharCode(ccall(:utf8proc_category, Cint, (UInt32,), c))
+# returns CategoryCode (values 0:29) giving Unicode category
+charprop(::Type{CategoryCode}, c) = CategoryCode(ccall(:utf8proc_category, Cint, (UInt32,), c))
 
 ############################################################################
 
diff --git a/test/unicode/properties.jl b/test/unicode/properties.jl
index 79a4ad3c5c809..3929cceb030e5 100644
--- a/test/unicode/properties.jl
+++ b/test/unicode/properties.jl
@@ -143,9 +143,9 @@ end
 
 # check handling of CN category constants
 let c_ll = 'β', c_cn = '\u038B'
-    @test charprop(CharCode, c_ll) == Unicode.Ll
+    @test charprop(CategoryCode, c_ll) == Category.Ll
     # check codepoint with category code CN
-    @test charprop(CharCode, c_cn) == Unicode.Cn
+    @test charprop(CategoryCode, c_cn) == Category.Cn
 end
 
 # Make sure fastplus is called for coverage

From c05d46389d9290294f7ea8d05402fdc39a1f527d Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Mon, 14 Dec 2015 11:12:17 -0500
Subject: [PATCH 4/7] Add category masks

---
 base/exports.jl            |  2 -
 base/unicode.jl            |  1 -
 base/unicode/properties.jl | 92 +++++++++++++++++++-------------------
 base/unicode/utf8proc.jl   |  4 +-
 test/unicode/properties.jl |  4 +-
 5 files changed, 51 insertions(+), 52 deletions(-)

diff --git a/base/exports.jl b/base/exports.jl
index db08c05f81d54..fb0d3b39beeca 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -42,8 +42,6 @@ export
     CartesianIndex,
     CartesianRange,
     Channel,
-    CategoryCode,
-    CategoryType,
     Cmd,
     Colon,
     Complex,
diff --git a/base/unicode.jl b/base/unicode.jl
index 2b119b8565e13..770219f0c8beb 100644
--- a/base/unicode.jl
+++ b/base/unicode.jl
@@ -7,7 +7,6 @@ import Base: string, convert, write, length, endof, next, reverseind, lastidx, r
 
 export UnicodeError, UTF16String, UTF32String, unsafe_checkstring, checkstring,
        utf8, utf16, utf32, containsnul, WString, wstring, charprop, Category,
-       CategoryType, CategoryCode,
        is_assigned_char, islower, isupper, isdigit, isalpha, isnumber, isalnum, iscntrl,
        ispunct, isspace, isprint, isgraph,
        isgraphemebreak, GraphemeIterator, normalize_string, graphemes, charwidth
diff --git a/base/unicode/properties.jl b/base/unicode/properties.jl
index 883055f192012..2b5da80452c54 100644
--- a/base/unicode/properties.jl
+++ b/base/unicode/properties.jl
@@ -21,25 +21,24 @@ function charprop end
 # Unicode General Category constants
 
 module Category
-export CategoryType, CategoryCode
 
 """Unicode character category type"""
-abstract CategoryType <: Unicode.Property
+abstract General     <: Unicode.Property
 
 """Unicode 'Letter' character category"""
-abstract Letter      <: CategoryType
+abstract Letter      <: General
 """Unicode 'Mark' character category"""
-abstract Mark        <: CategoryType
+abstract Mark        <: General
 """Unicode 'Number' character category"""
-abstract Number      <: CategoryType
+abstract Number      <: General
 """Unicode 'Punctuation' character category"""
-abstract Punctuation <: CategoryType
+abstract Punctuation <: General
 """Unicode 'Symbol' character category"""
-abstract Symbol      <: CategoryType
+abstract Symbol      <: General
 """Unicode 'Separator' character category"""
-abstract Separator   <: CategoryType
+abstract Separator   <: General
 """Unicode 'Other' character category"""
-abstract Other       <: CategoryType
+abstract Other       <: General
 
 """Unicode uppercase & titlecase letters"""
 abstract Upper       <: Letter
@@ -48,14 +47,17 @@ abstract Upper       <: Letter
 typealias AlphaNumeric Union{Letter, Number}
 
 """Unicode character category code (0-29)"""
-bitstype 8 CategoryCode
+bitstype 8 Code
 
-Base.convert(::Type{CategoryCode}, x::Integer) = reinterpret(CategoryCode, x%UInt8)
-Base.convert{T<:Integer}(::Type{T}, x::CategoryCode) = convert(T, reinterpret(UInt8, x))
-Base.promote_rule{T<:Integer}(::Type{T}, ::Type{CategoryCode}) = T
-Base.isless(x::CategoryCode, y::CategoryCode) = isless(UInt8(x), UInt8(y))
-Base.isless(x::CategoryCode, y::Integer)  = isless(UInt8(x), y)
-Base.isless(x::Integer, y::CategoryCode)  = isless(x, UInt8(y))
+"""Unicode character category mask"""
+typealias Mask UInt32
+
+Base.convert(::Type{Code}, x::Integer) = reinterpret(Code, x%UInt8)
+Base.convert{T<:Integer}(::Type{T}, x::Code) = convert(T, reinterpret(UInt8, x))
+Base.promote_rule{T<:Integer}(::Type{T}, ::Type{Code}) = T
+Base.isless(x::Code, y::Code) = isless(UInt8(x), UInt8(y))
+Base.isless(x::Code, y::Integer)  = isless(UInt8(x), y)
+Base.isless(x::Integer, y::Code)  = isless(x, UInt8(y))
 
 for (nam, val, cat, typ, des) in
     ((:Cn, 0,  :NotAssignedChar,         :Other,       "Other, Not assigned"),
@@ -88,8 +90,9 @@ for (nam, val, cat, typ, des) in
      (:Cf, 27, :FormatChar,              :Other,       "Other, format"),
      (:Cs, 28, :SurrogateChar,           :Other,       "Other, surrogate"),
      (:Co, 29, :PrivateUseChar,          :Other,       "Other, private use"))
-    @eval const global $nam = CategoryCode($val)
+    @eval const global $nam = $(Code(val))
     @eval abstract $cat <: $typ
+    @eval Base.convert(::Type{Code}, ct::$cat) = $(Code(val))
     @eval @doc $(string("Unicode Category Code: ",des)) $nam
     @eval @doc $(string("Unicode Category Type: ",des)) $cat
 end
@@ -101,56 +104,55 @@ const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, O
              InitialQuotePunctuation, FinalQuotePunctuation, OtherPunctuation,
              MathSymbol, CurrencySymbol, ModifierSymbol, OtherSymbol,
              SpaceSeparator, LineSeparator, ParagraphSeparator,
-             ControlChar, FormatChar, SurrogateChar, PrivateUseChar]
+	     ControlChar, FormatChar, SurrogateChar, PrivateUseChar]
+
+Base.convert(::Type{General}, cat::Code) = c2t[Int(cat)+1]
+
+Unicode.charprop(Mask, c) = Mask(1<<Int(charprop(Code, c)))
 
-charprop(::Type{CategoryType}, c) = c2t[Int(charprop(CategoryCode, c))+1]
+const global UpperMask  = Mask(1<<Int(Lu) | 1<<Int(Lt))
+const global AlphaMask  = Mask(1<<Int(Lu) | 1<<Int(Ll) | 1<<Int(Lt) | 1<<Int(Lm) | 1<<Int(Lo))
+const global NumberMask = Mask((1<<Int(Nd) | 1<<Int(Nl) | 1<<Int(No)))
+const global AlphaNumericMask = AlphaMask | NumberMask
+
+let mask = 0 ; for i = Int(Pc):Int(Po) ; mask |= (1<<i) ; end
+    @eval const global PunctuationMask = $(Mask(mask))
+    mask = 0 ; for i = Int(Lu):Int(So) ; mask |= (1<<i) ; end
+    @eval const global GraphMask = $(Mask(mask))
+    @eval const global PrintMask = $(Mask(mask | (1<<Int(Zs))))
+end
 
 end # module Cat
 importall .Category
 
 ############################################################################
 
-is_assigned_char(c) = charprop(CategoryCode, c) != Category.Cn
+is_assigned_char(c) = charprop(Category.Code, c) != Category.Cn
 
-## libc character class predicates ##
-
-islower(c::Char) = charprop(CategoryCode, c) == Category.Ll
+islower(c::Char)  = charprop(Category.Code, c) == Category.Ll
 
 # true for Unicode upper and mixed case
-isupper(c::Char) = (ccode = charprop(CategoryCode, c)) == Category.Lu || ccode == Category.Lt
+isupper(c::Char)  = (charprop(Category.Mask, c) & Category.UpperMask) != 0
+isalpha(c::Char)  = (charprop(Category.Mask, c) & Category.AlphaMask) != 0
+isnumber(c::Char) = (charprop(Category.Mask, c) & Category.NumberMask) != 0
+isalnum(c::Char)  = (charprop(Category.Mask, c) & Category.AlphaNumericMask) != 0
+ispunct(c::Char)  = (charprop(Category.Mask, c) & Category.PunctuationMask) != 0
+isprint(c::Char)  = (charprop(Category.Mask, c) & Category.PrintMask) != 0
+# true in principle if a printer would use ink
+isgraph(c::Char)  = (charprop(Category.Mask, c) & Category.GraphMask) != 0
 
 isdigit(c::Char)  = ('0' <= c <= '9')
-isalpha(c::Char)  = (Category.Lu <= charprop(CategoryCode, c) <= Category.Lo)
-isnumber(c::Char) = (Category.Nd <= charprop(CategoryCode, c) <= Category.No)
-isalnum(c::Char)  = ((Category.Lu <= (ccode = charprop(CategoryCode, c)) <= Category.Lo) ||
-                     (Category.Nd <= ccode <= Category.No))
-
-# These are about 3 times slower, because the isa method
-# is much slower than checking if an integer is within range (or two ranges)
-# If that is sped up, then these, which are more readable, could replace the other forms.
-#=
-isalpha(c::Char)  = charprop(CategoryType, c) <: Category.Letter
-isnumber(c::Char) = charprop(CategoryType, c) <: Category.Number
-isupper(c::Char)  = charprop(CategoryType, c) <: Category.Upper
-isalnum(c::Char)  = charprop(CategoryType, c) <: Category.AlphaNumeric
-ispunct(c::Char)  = charprop(CategoryType, c) <: Category.Punctuation
-=#
 
 # following C++ only control characters from the Latin-1 subset return true
 iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f))
 
-ispunct(c::Char) = (Category.Pc <= charprop(CategoryCode, c) <= Category.Po)
 
 # \u85 is the Unicode Next Line (NEL) character
 # the check for \ufffd allows for branch removal on ASCIIStrings
 @inline isspace(c::Char) =
     (c == ' ' || '\t' <= c <='\r' || c == '\u85' ||
-     ('\ua0' <= c && c != '\ufffd' && charprop(CategoryCode, c) == Category.Zs))
+     ('\ua0' <= c && c != '\ufffd' && charprop(Category.Code, c) == Category.Zs))
 
-isprint(c::Char) = (Category.Lu <= charprop(CategoryCode, c) <= Category.Zs)
-
-# true in principle if a printer would use ink
-isgraph(c::Char) = (Category.Lu <= charprop(CategoryCode, c) <= Category.So)
 
 for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph",
             "lower", "print", "punct", "space", "upper")
diff --git a/base/unicode/utf8proc.jl b/base/unicode/utf8proc.jl
index 107e61135ccfc..d1b274eba070c 100644
--- a/base/unicode/utf8proc.jl
+++ b/base/unicode/utf8proc.jl
@@ -81,8 +81,8 @@ uppercase(c::Char) = (isascii(c)
 
 ############################################################################
 
-# returns CategoryCode (values 0:29) giving Unicode category
-charprop(::Type{CategoryCode}, c) = CategoryCode(ccall(:utf8proc_category, Cint, (UInt32,), c))
+# returns Category.Code (values 0:29) giving Unicode category
+charprop(::Type{Category.Code}, c) = Category.Code(ccall(:utf8proc_category, Cint, (UInt32,), c))
 
 ############################################################################
 
diff --git a/test/unicode/properties.jl b/test/unicode/properties.jl
index 3929cceb030e5..58170c1328d43 100644
--- a/test/unicode/properties.jl
+++ b/test/unicode/properties.jl
@@ -143,9 +143,9 @@ end
 
 # check handling of CN category constants
 let c_ll = 'β', c_cn = '\u038B'
-    @test charprop(CategoryCode, c_ll) == Category.Ll
+    @test charprop(Category.Code, c_ll) == Category.Ll
     # check codepoint with category code CN
-    @test charprop(CategoryCode, c_cn) == Category.Cn
+    @test charprop(Category.Code, c_cn) == Category.Cn
 end
 
 # Make sure fastplus is called for coverage

From f4ae3ec87bba54e3d4945fe0e294265388425ff6 Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Mon, 14 Dec 2015 11:39:41 -0500
Subject: [PATCH 5/7] Update per comments on use of ?:

Add newline

Fix indentation (Emacs and tabs)
---
 base/unicode/properties.jl | 114 ++++++++++++++++++++-----------------
 base/unicode/utf8proc.jl   |  21 +++++--
 2 files changed, 78 insertions(+), 57 deletions(-)

diff --git a/base/unicode/properties.jl b/base/unicode/properties.jl
index 2b5da80452c54..da80417bdc3bc 100644
--- a/base/unicode/properties.jl
+++ b/base/unicode/properties.jl
@@ -59,44 +59,49 @@ Base.isless(x::Code, y::Code) = isless(UInt8(x), UInt8(y))
 Base.isless(x::Code, y::Integer)  = isless(UInt8(x), y)
 Base.isless(x::Integer, y::Code)  = isless(x, UInt8(y))
 
-for (nam, val, cat, typ, des) in
-    ((:Cn, 0,  :NotAssignedChar,         :Other,       "Other, Not assigned"),
-     (:Lu, 1,  :UpperCase,               :Upper,       "Letter, uppercase"),
-     (:Ll, 2,  :LowerCase,               :Letter,      "Letter, lowercase"),
-     (:Lt, 3,  :TitleCase,               :Upper,       "Letter, titlecase"),
-     (:Lm, 4,  :ModifierLetter,          :Letter,      "Letter, modifier"),
-     (:Lo, 5,  :OtherLetter,             :Letter,      "Letter, other"),
-     (:Mn, 6,  :NonSpacingMark,          :Mark,        "Mark, nonspacing"),
-     (:Mc, 7,  :CombiningMark,           :Mark,        "Mark, spacing combining"),
-     (:Me, 8,  :EnclosingMark,           :Mark,        "Mark, enclosing"),
-     (:Nd, 9,  :DecimalDigit,            :Number,      "Number, decimal digit"),
-     (:Nl, 10, :NumericLetter,           :Number,      "Number, letter"),
-     (:No, 11, :OtherNumber,             :Number,      "Number, other"),
-     (:Pc, 12, :ConnectorPunctuation,    :Punctuation, "Punctuation, connector"),
-     (:Pd, 13, :DashPunctuation,         :Punctuation, "Punctuation, dash"),
-     (:Ps, 14, :OpenPunctuation,         :Punctuation, "Punctuation, open"),
-     (:Pe, 15, :ClosePunctuation,        :Punctuation, "Punctuation, close"),
-     (:Pi, 16, :InitialQuotePunctuation, :Punctuation, "Punctuation, initial quote"),
-     (:Pf, 17, :FinalQuotePunctuation,   :Punctuation, "Punctuation, final quote"),
-     (:Po, 18, :OtherPunctuation,        :Punctuation, "Punctuation, other"),
-     (:Sm, 19, :MathSymbol,              :Symbol,      "Symbol, math"),
-     (:Sc, 20, :CurrencySymbol,          :Symbol,      "Symbol, currency"),
-     (:Sk, 21, :ModifierSymbol,          :Symbol,      "Symbol, modifier"),
-     (:So, 22, :OtherSymbol,             :Symbol,      "Symbol, other"),
-     (:Zs, 23, :SpaceSeparator,          :Separator,   "Separator, space"),
-     (:Zl, 24, :LineSeparator,           :Separator,   "Separator, line"),
-     (:Zp, 25, :ParagraphSeparator,      :Separator,   "Separator, paragraph"),
-     (:Cc, 26, :ControlChar,             :Other,       "Other, control"),
-     (:Cf, 27, :FormatChar,              :Other,       "Other, format"),
-     (:Cs, 28, :SurrogateChar,           :Other,       "Other, surrogate"),
-     (:Co, 29, :PrivateUseChar,          :Other,       "Other, private use"))
-    @eval const global $nam = $(Code(val))
-    @eval abstract $cat <: $typ
-    @eval Base.convert(::Type{Code}, ct::$cat) = $(Code(val))
-    @eval @doc $(string("Unicode Category Code: ",des)) $nam
-    @eval @doc $(string("Unicode Category Type: ",des)) $cat
+let c2t = DataType[]
+    for (nam, val, cat, typ, des) in
+        ((:Cn, 0,  :NotAssignedChar,         :Other,       "Other, Not assigned"),
+         (:Lu, 1,  :UpperCase,               :Upper,       "Letter, uppercase"),
+         (:Ll, 2,  :LowerCase,               :Letter,      "Letter, lowercase"),
+         (:Lt, 3,  :TitleCase,               :Upper,       "Letter, titlecase"),
+         (:Lm, 4,  :ModifierLetter,          :Letter,      "Letter, modifier"),
+         (:Lo, 5,  :OtherLetter,             :Letter,      "Letter, other"),
+         (:Mn, 6,  :NonSpacingMark,          :Mark,        "Mark, nonspacing"),
+         (:Mc, 7,  :CombiningMark,           :Mark,        "Mark, spacing combining"),
+         (:Me, 8,  :EnclosingMark,           :Mark,        "Mark, enclosing"),
+         (:Nd, 9,  :DecimalDigit,            :Number,      "Number, decimal digit"),
+         (:Nl, 10, :NumericLetter,           :Number,      "Number, letter"),
+         (:No, 11, :OtherNumber,             :Number,      "Number, other"),
+         (:Pc, 12, :ConnectorPunctuation,    :Punctuation, "Punctuation, connector"),
+         (:Pd, 13, :DashPunctuation,         :Punctuation, "Punctuation, dash"),
+         (:Ps, 14, :OpenPunctuation,         :Punctuation, "Punctuation, open"),
+         (:Pe, 15, :ClosePunctuation,        :Punctuation, "Punctuation, close"),
+         (:Pi, 16, :InitialQuotePunctuation, :Punctuation, "Punctuation, initial quote"),
+         (:Pf, 17, :FinalQuotePunctuation,   :Punctuation, "Punctuation, final quote"),
+         (:Po, 18, :OtherPunctuation,        :Punctuation, "Punctuation, other"),
+         (:Sm, 19, :MathSymbol,              :Symbol,      "Symbol, math"),
+         (:Sc, 20, :CurrencySymbol,          :Symbol,      "Symbol, currency"),
+         (:Sk, 21, :ModifierSymbol,          :Symbol,      "Symbol, modifier"),
+         (:So, 22, :OtherSymbol,             :Symbol,      "Symbol, other"),
+         (:Zs, 23, :SpaceSeparator,          :Separator,   "Separator, space"),
+         (:Zl, 24, :LineSeparator,           :Separator,   "Separator, line"),
+         (:Zp, 25, :ParagraphSeparator,      :Separator,   "Separator, paragraph"),
+         (:Cc, 26, :ControlChar,             :Other,       "Other, control"),
+         (:Cf, 27, :FormatChar,              :Other,       "Other, format"),
+         (:Cs, 28, :SurrogateChar,           :Other,       "Other, surrogate"),
+         (:Co, 29, :PrivateUseChar,          :Other,       "Other, private use"))
+        @eval const global $nam = $(Code(val))
+        @eval abstract $cat <: $typ
+        @eval push!($c2t, $cat)
+        @eval Base.convert(::Type{Code}, ::Type{$cat}) = $(Code(val)) # type -> code
+        @eval @doc $(string("Unicode Category Code: ",des)) $nam
+        @eval @doc $(string("Unicode Category Type: ",des)) $cat
+    end
+    @eval const global code2general = $c2t
 end
 
+#=
 const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, OtherLetter,
              NonSpacingMark, CombiningMark, EnclosingMark,
              DecimalDigit, NumericLetter, OtherNumber,
@@ -104,22 +109,29 @@ const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, O
              InitialQuotePunctuation, FinalQuotePunctuation, OtherPunctuation,
              MathSymbol, CurrencySymbol, ModifierSymbol, OtherSymbol,
              SpaceSeparator, LineSeparator, ParagraphSeparator,
-	     ControlChar, FormatChar, SurrogateChar, PrivateUseChar]
+             ControlChar, FormatChar, SurrogateChar, PrivateUseChar]
+=#
 
-Base.convert(::Type{General}, cat::Code) = c2t[Int(cat)+1]
+Base.convert(::Type{General}, cat::Code) = code2general[Int(cat)+1]
 
 Unicode.charprop(Mask, c) = Mask(1<<Int(charprop(Code, c)))
 
-const global UpperMask  = Mask(1<<Int(Lu) | 1<<Int(Lt))
-const global AlphaMask  = Mask(1<<Int(Lu) | 1<<Int(Ll) | 1<<Int(Lt) | 1<<Int(Lm) | 1<<Int(Lo))
-const global NumberMask = Mask((1<<Int(Nd) | 1<<Int(Nl) | 1<<Int(No)))
-const global AlphaNumericMask = AlphaMask | NumberMask
+Base.&(c::Code, m::Mask) = ((1<<Int(c)) & m) != 0
+
+Base.|(x::Code, y::Code) = Mask((1<<Int(x)) | (1<<Int(y)))
+Base.|(c::Code, m::Mask) = Mask((1<<Int(c)) | m)
+Base.|(m::Mask, c::Code) = (c | m)
+
+@eval const global UpperMask  = $(Lu | Lt)
+@eval const global AlphaMask  = $(Lu | Ll | Lt | Lm | Lo)
+@eval const global NumberMask = $(Nd | Nl | No)
+@eval const global AlphaNumericMask = AlphaMask | NumberMask
 
 let mask = 0 ; for i = Int(Pc):Int(Po) ; mask |= (1<<i) ; end
     @eval const global PunctuationMask = $(Mask(mask))
     mask = 0 ; for i = Int(Lu):Int(So) ; mask |= (1<<i) ; end
     @eval const global GraphMask = $(Mask(mask))
-    @eval const global PrintMask = $(Mask(mask | (1<<Int(Zs))))
+    @eval const global PrintMask = $(Mask(mask) | Zs)
 end
 
 end # module Cat
@@ -129,17 +141,17 @@ importall .Category
 
 is_assigned_char(c) = charprop(Category.Code, c) != Category.Cn
 
-islower(c::Char)  = charprop(Category.Code, c) == Category.Ll
+islower(c::Char)    = charprop(Category.Code, c) == Category.Ll
 
 # true for Unicode upper and mixed case
-isupper(c::Char)  = (charprop(Category.Mask, c) & Category.UpperMask) != 0
-isalpha(c::Char)  = (charprop(Category.Mask, c) & Category.AlphaMask) != 0
-isnumber(c::Char) = (charprop(Category.Mask, c) & Category.NumberMask) != 0
-isalnum(c::Char)  = (charprop(Category.Mask, c) & Category.AlphaNumericMask) != 0
-ispunct(c::Char)  = (charprop(Category.Mask, c) & Category.PunctuationMask) != 0
-isprint(c::Char)  = (charprop(Category.Mask, c) & Category.PrintMask) != 0
+isupper(c::Char)  = charprop(Category.Code, c) & Category.UpperMask
+isalpha(c::Char)  = charprop(Category.Code, c) & Category.AlphaMask
+isnumber(c::Char) = charprop(Category.Code, c) & Category.NumberMask
+isalnum(c::Char)  = charprop(Category.Code, c) & Category.AlphaNumericMask
+ispunct(c::Char)  = charprop(Category.Code, c) & Category.PunctuationMask
+isprint(c::Char)  = charprop(Category.Code, c) & Category.PrintMask
 # true in principle if a printer would use ink
-isgraph(c::Char)  = (charprop(Category.Mask, c) & Category.GraphMask) != 0
+isgraph(c::Char)  = charprop(Category.Code, c) & Category.GraphMask
 
 isdigit(c::Char)  = ('0' <= c <= '9')
 
diff --git a/base/unicode/utf8proc.jl b/base/unicode/utf8proc.jl
index d1b274eba070c..e449c77a8fceb 100644
--- a/base/unicode/utf8proc.jl
+++ b/base/unicode/utf8proc.jl
@@ -72,12 +72,21 @@ normalize_string(s::AbstractString, nf::Symbol) =
 
 charwidth(c::Char) = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
 
-lowercase(c::Char) = (isascii(c)
-                      ? ('A' <= c <= 'Z' ? c + 0x20 : c)
-                      : Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c)))
-uppercase(c::Char) = (isascii(c)
-                      ? ('a' <= c <= 'z' ? c - 0x20 : c)
-                      : Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c)))
+function lowercase(c::Char)
+    if isascii(c)  # ASCII is handled inline, without calling into utf8proc
+        'A' <= c <= 'Z' ? c + 0x20 : c  # 'A'..'Z' are shifted by 0x20; other ASCII is returned unchanged
+    else  # non-ASCII: delegate to utf8proc's lowercase mapping
+        Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))
+    end
+end
+
+function uppercase(c::Char)
+    if isascii(c)  # ASCII is handled inline, without calling into utf8proc
+        'a' <= c <= 'z' ? c - 0x20 : c  # 'a'..'z' are shifted by 0x20; other ASCII is returned unchanged
+    else  # non-ASCII: delegate to utf8proc's uppercase mapping
+        Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))
+    end
+end
 
 ############################################################################
 

From 0166695080ffdcc34c172ae03bedc2bd588f050d Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Tue, 15 Dec 2015 14:01:08 -0500
Subject: [PATCH 6/7] Remove types for general categories

---
 base/unicode/properties.jl | 192 ++++++++++++++++++-------------------
 1 file changed, 93 insertions(+), 99 deletions(-)

diff --git a/base/unicode/properties.jl b/base/unicode/properties.jl
index da80417bdc3bc..4297a40d86386 100644
--- a/base/unicode/properties.jl
+++ b/base/unicode/properties.jl
@@ -22,36 +22,9 @@ function charprop end
 
 module Category
 
-"""Unicode character category type"""
-abstract General     <: Unicode.Property
-
-"""Unicode 'Letter' character category"""
-abstract Letter      <: General
-"""Unicode 'Mark' character category"""
-abstract Mark        <: General
-"""Unicode 'Number' character category"""
-abstract Number      <: General
-"""Unicode 'Punctuation' character category"""
-abstract Punctuation <: General
-"""Unicode 'Symbol' character category"""
-abstract Symbol      <: General
-"""Unicode 'Separator' character category"""
-abstract Separator   <: General
-"""Unicode 'Other' character category"""
-abstract Other       <: General
-
-"""Unicode uppercase & titlecase letters"""
-abstract Upper       <: Letter
-
-"""Unicode alphabetic and numeric"""
-typealias AlphaNumeric Union{Letter, Number}
-
 """Unicode character category code (0-29)"""
 bitstype 8 Code
 
-"""Unicode character category mask"""
-typealias Mask UInt32
-
 Base.convert(::Type{Code}, x::Integer) = reinterpret(Code, x%UInt8)
 Base.convert{T<:Integer}(::Type{T}, x::Code) = convert(T, reinterpret(UInt8, x))
 Base.promote_rule{T<:Integer}(::Type{T}, ::Type{Code}) = T
@@ -59,79 +32,100 @@ Base.isless(x::Code, y::Code) = isless(UInt8(x), UInt8(y))
 Base.isless(x::Code, y::Integer)  = isless(UInt8(x), y)
 Base.isless(x::Integer, y::Code)  = isless(x, UInt8(y))
 
-let c2t = DataType[]
-    for (nam, val, cat, typ, des) in
-        ((:Cn, 0,  :NotAssignedChar,         :Other,       "Other, Not assigned"),
-         (:Lu, 1,  :UpperCase,               :Upper,       "Letter, uppercase"),
-         (:Ll, 2,  :LowerCase,               :Letter,      "Letter, lowercase"),
-         (:Lt, 3,  :TitleCase,               :Upper,       "Letter, titlecase"),
-         (:Lm, 4,  :ModifierLetter,          :Letter,      "Letter, modifier"),
-         (:Lo, 5,  :OtherLetter,             :Letter,      "Letter, other"),
-         (:Mn, 6,  :NonSpacingMark,          :Mark,        "Mark, nonspacing"),
-         (:Mc, 7,  :CombiningMark,           :Mark,        "Mark, spacing combining"),
-         (:Me, 8,  :EnclosingMark,           :Mark,        "Mark, enclosing"),
-         (:Nd, 9,  :DecimalDigit,            :Number,      "Number, decimal digit"),
-         (:Nl, 10, :NumericLetter,           :Number,      "Number, letter"),
-         (:No, 11, :OtherNumber,             :Number,      "Number, other"),
-         (:Pc, 12, :ConnectorPunctuation,    :Punctuation, "Punctuation, connector"),
-         (:Pd, 13, :DashPunctuation,         :Punctuation, "Punctuation, dash"),
-         (:Ps, 14, :OpenPunctuation,         :Punctuation, "Punctuation, open"),
-         (:Pe, 15, :ClosePunctuation,        :Punctuation, "Punctuation, close"),
-         (:Pi, 16, :InitialQuotePunctuation, :Punctuation, "Punctuation, initial quote"),
-         (:Pf, 17, :FinalQuotePunctuation,   :Punctuation, "Punctuation, final quote"),
-         (:Po, 18, :OtherPunctuation,        :Punctuation, "Punctuation, other"),
-         (:Sm, 19, :MathSymbol,              :Symbol,      "Symbol, math"),
-         (:Sc, 20, :CurrencySymbol,          :Symbol,      "Symbol, currency"),
-         (:Sk, 21, :ModifierSymbol,          :Symbol,      "Symbol, modifier"),
-         (:So, 22, :OtherSymbol,             :Symbol,      "Symbol, other"),
-         (:Zs, 23, :SpaceSeparator,          :Separator,   "Separator, space"),
-         (:Zl, 24, :LineSeparator,           :Separator,   "Separator, line"),
-         (:Zp, 25, :ParagraphSeparator,      :Separator,   "Separator, paragraph"),
-         (:Cc, 26, :ControlChar,             :Other,       "Other, control"),
-         (:Cf, 27, :FormatChar,              :Other,       "Other, format"),
-         (:Cs, 28, :SurrogateChar,           :Other,       "Other, surrogate"),
-         (:Co, 29, :PrivateUseChar,          :Other,       "Other, private use"))
-        @eval const global $nam = $(Code(val))
-        @eval abstract $cat <: $typ
-        @eval push!($c2t, $cat)
-        @eval Base.convert(::Type{Code}, ct::$cat) = $(Code(val))
-        @eval @doc $(string("Unicode Category Code: ",des)) $nam
-        @eval @doc $(string("Unicode Category Type: ",des)) $cat
-    end
-    @eval const global code2general = $c2t
+"""Unicode character category mask"""
+bitstype 32 Mask
+
+Base.convert(::Type{Mask}, x::Integer) = reinterpret(Mask, x%UInt32)  # keep the low 32 bits of x as the mask bits
+Base.convert{T<:Integer}(::Type{T}, x::Mask) = convert(T, reinterpret(UInt32, x))  # mask bits as an integer of type T
+Base.promote_rule{T<:Integer}(::Type{T}, ::Type{Mask}) = T  # promoting a Mask with an integer type yields that integer type
+
+Base.convert(::Type{Mask}, c::Code) = Mask(1<<Int(c))  # a single category code becomes a one-bit mask (bit number = code value)
+
+for (nam, val, cat, typ, des) in
+    ((:Cn, 0,  :NotAssignedChar,         :Other,       "Other, not assigned"),
+     (:Lu, 1,  :UpperCase,               :Upper,       "Letter, uppercase"),
+     (:Ll, 2,  :LowerCase,               :Letter,      "Letter, lowercase"),
+     (:Lt, 3,  :TitleCase,               :Upper,       "Letter, titlecase"),
+     (:Lm, 4,  :ModifierLetter,          :Letter,      "Letter, modifier"),
+     (:Lo, 5,  :OtherLetter,             :Letter,      "Letter, other"),
+     (:Mn, 6,  :NonSpacingMark,          :Mark,        "Mark, nonspacing"),
+     (:Mc, 7,  :CombiningMark,           :Mark,        "Mark, spacing combining"),
+     (:Me, 8,  :EnclosingMark,           :Mark,        "Mark, enclosing"),
+     (:Nd, 9,  :DecimalDigit,            :Number,      "Number, decimal digit"),
+     (:Nl, 10, :NumericLetter,           :Number,      "Number, letter"),
+     (:No, 11, :OtherNumber,             :Number,      "Number, other"),
+     (:Pc, 12, :ConnectorPunctuation,    :Punctuation, "Punctuation, connector"),
+     (:Pd, 13, :DashPunctuation,         :Punctuation, "Punctuation, dash"),
+     (:Ps, 14, :OpenPunctuation,         :Punctuation, "Punctuation, open"),
+     (:Pe, 15, :ClosePunctuation,        :Punctuation, "Punctuation, close"),
+     (:Pi, 16, :InitialQuotePunctuation, :Punctuation, "Punctuation, initial quote"),
+     (:Pf, 17, :FinalQuotePunctuation,   :Punctuation, "Punctuation, final quote"),
+     (:Po, 18, :OtherPunctuation,        :Punctuation, "Punctuation, other"),
+     (:Sm, 19, :MathSymbol,              :Symbol,      "Symbol, math"),
+     (:Sc, 20, :CurrencySymbol,          :Symbol,      "Symbol, currency"),
+     (:Sk, 21, :ModifierSymbol,          :Symbol,      "Symbol, modifier"),
+     (:So, 22, :OtherSymbol,             :Symbol,      "Symbol, other"),
+     (:Zs, 23, :SpaceSeparator,          :Separator,   "Separator, space"),
+     (:Zl, 24, :LineSeparator,           :Separator,   "Separator, line"),
+     (:Zp, 25, :ParagraphSeparator,      :Separator,   "Separator, paragraph"),
+     (:Cc, 26, :ControlChar,             :Other,       "Other, control"),
+     (:Cf, 27, :FormatChar,              :Other,       "Other, format"),
+     (:Cs, 28, :SurrogateChar,           :Other,       "Other, surrogate"),
+     (:Co, 29, :PrivateUseChar,          :Other,       "Other, private use"))
+    @eval const global $nam = $(Code(val))
+    @eval const global $cat = $(Code(val))
+    @eval @doc $(string("Unicode Category Code: ",des)) $nam
+    @eval @doc $(string("Unicode Category Code: ",des)) $cat
 end
 
-#=
-const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, OtherLetter,
-             NonSpacingMark, CombiningMark, EnclosingMark,
-             DecimalDigit, NumericLetter, OtherNumber,
-             ConnectorPunctuation, DashPunctuation, OpenPunctuation, ClosePunctuation,
-             InitialQuotePunctuation, FinalQuotePunctuation, OtherPunctuation,
-             MathSymbol, CurrencySymbol, ModifierSymbol, OtherSymbol,
-             SpaceSeparator, LineSeparator, ParagraphSeparator,
-             ControlChar, FormatChar, SurrogateChar, PrivateUseChar]
-=#
-
-Base.convert(::Type{General}, cat::Code) = code2general[Int(cat)+1]
-
-Unicode.charprop(Mask, c) = Mask(1<<Int(charprop(Code, c)))
-
-Base.&(c::Code, m::Mask) = ((1<<Int(c)) & m) != 0
+Base.in(c::Code, m::Mask) = ((1<<Int(c)) & m) != 0  # true when the bit for category code c is set in mask m
+Base.in(x::Code, y::Code) = x == y  # a single code "contains" only itself
 
 Base.|(x::Code, y::Code) = Mask((1<<Int(x)) | (1<<Int(y)))
 Base.|(c::Code, m::Mask) = Mask((1<<Int(c)) | m)
 Base.|(m::Mask, c::Code) = (c | m)
+Base.|(x::Mask, y::Mask) = Mask(UInt32(x) | UInt32(y))  # union of two masks; the result is again a Mask
+
+Base.|(x::Integer, y::Mask) = x | Int(y)  # mixing a Mask with a plain integer converts the mask via Int(...),
+Base.|(x::Mask, y::Integer) = Int(x) | y  # so these four methods return an ordinary integer, not a Mask
+Base.&(x::Integer, y::Mask) = x & Int(y)
+Base.&(x::Mask, y::Integer) = Int(x) & y
+
+@eval const global Letter      = $(Mask(Lu | Ll | Lt | Lm | Lo))
+@doc """Unicode Major Category: Letter (Lu, Ll, Lt, Lm, Lo)""" Letter
+
+@eval const global Mark        = $(Mask(Mn | Mc | Me))
+@doc """Unicode Major Category: Mark (Mn, Mc, Me)""" Mark
+
+@eval const global Number      = $(Mask(Nd | Nl | No))
+@doc """Unicode Major Category: Number (Nd, Nl, No)""" Number
+
+@eval const global Symbol      = $(Mask(Sm | Sc | Sk | So))
+@doc """Unicode Major Category: Symbol (Sm, Sc, Sk, So)""" Symbol
+
+@eval const global Other       = $(Mask(Cn | Cc | Cf | Cs | Co))
+@doc """Unicode Major Category: Other (Cn, Cc, Cf, Cs, Co)""" Other
+
+@eval const global Punctuation = $(Mask(Pc | Pd | Ps | Pe | Pi | Pf | Po))
+@doc """Unicode Major Category: Punctuation (Pc, Pd, Ps, Pe, Pi, Pf, Po)""" Punctuation
+
+@eval const global Separator   = $(Mask(Zs | Zl | Zp))
+@doc """Unicode Major Category: Separator (Zs, Zl, Zp)""" Separator
+
+@eval const global Lower  = $(Mask(Ll))
+@doc """Unicode Category: Lower = LowerCase""" Lower
+
+@eval const global Upper  = $(Mask(Lu | Lt))
+@doc """Unicode Combined Categories: Upper = UpperCase | TitleCase""" Upper
 
-@eval const global UpperMask  = $(Lu | Lt)
-@eval const global AlphaMask  = $(Lu | Ll | Lt | Lm | Lo)
-@eval const global NumberMask = $(Nd | Nl | No)
-@eval const global AlphaNumericMask = AlphaMask | NumberMask
+@eval const global AlphaNumeric = Letter | Number
+@doc """Unicode Combined Categories: AlphaNumeric = Letter | Number""" AlphaNumeric
 
-let mask = 0 ; for i = Int(Pc):Int(Po) ; mask |= (1<<i) ; end
-    @eval const global PunctuationMask = $(Mask(mask))
-    mask = 0 ; for i = Int(Lu):Int(So) ; mask |= (1<<i) ; end
-    @eval const global GraphMask = $(Mask(mask))
-    @eval const global PrintMask = $(Mask(mask) | Zs)
+let mask = 0 ; for i = Int(Lu):Int(So) ; mask |= (1<<i) ; end
+    @eval const global Graph = $(Mask(mask))
+    @doc """Unicode Combined Categories: Graph (Lu through So; characters a printer would use ink for)""" Graph
+    @eval const global Print = $(Mask(mask) | Zs)
+    @doc """Unicode Combined Categories: Print (Graph plus the space separator, Zs)""" Print
 end
 
 end # module Cat
@@ -144,14 +138,14 @@ is_assigned_char(c) = charprop(Category.Code, c) != Category.Cn
 islower(c::Char)    = charprop(Category.Code, c) == Category.Ll
 
 # true for Unicode upper and mixed case
-isupper(c::Char)  = charprop(Category.Code, c) & Category.UpperMask
-isalpha(c::Char)  = charprop(Category.Code, c) & Category.AlphaMask
-isnumber(c::Char) = charprop(Category.Code, c) & Category.NumberMask
-isalnum(c::Char)  = charprop(Category.Code, c) & Category.AlphaNumericMask
-ispunct(c::Char)  = charprop(Category.Code, c) & Category.PunctuationMask
-isprint(c::Char)  = charprop(Category.Code, c) & Category.PrintMask
+isupper(c::Char)  = charprop(Category.Code, c) in Category.Upper
+isalpha(c::Char)  = charprop(Category.Code, c) in Category.Letter
+isnumber(c::Char) = charprop(Category.Code, c) in Category.Number
+isalnum(c::Char)  = charprop(Category.Code, c) in Category.AlphaNumeric
+ispunct(c::Char)  = charprop(Category.Code, c) in Category.Punctuation
+isprint(c::Char)  = charprop(Category.Code, c) in Category.Print
 # true in principle if a printer would use ink
-isgraph(c::Char)  = charprop(Category.Code, c) & Category.GraphMask
+isgraph(c::Char)  = charprop(Category.Code, c) in Category.Graph
 
 isdigit(c::Char)  = ('0' <= c <= '9')
 

From 2dd14d8579ae1be5a95b0aa7acdd0d6a07281807 Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Tue, 15 Dec 2015 18:14:34 -0500
Subject: [PATCH 7/7] Remove Category from exports

Updated tests to use Unicode.Category
---
 base/exports.jl            | 1 -
 test/unicode/properties.jl | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/base/exports.jl b/base/exports.jl
index fb0d3b39beeca..f07a8441669ff 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -22,7 +22,6 @@ export
     Serializer,
     Docs,
     Markdown,
-    Category,
     Unicode,
 
 # Types
diff --git a/test/unicode/properties.jl b/test/unicode/properties.jl
index 58170c1328d43..66b010176f2c4 100644
--- a/test/unicode/properties.jl
+++ b/test/unicode/properties.jl
@@ -143,9 +143,9 @@ end
 
 # check handling of CN category constants
 let c_ll = 'β', c_cn = '\u038B'
-    @test charprop(Category.Code, c_ll) == Category.Ll
+    @test charprop(Unicode.Category.Code, c_ll) == Unicode.Category.Ll
     # check codepoint with category code CN
-    @test charprop(Category.Code, c_cn) == Category.Cn
+    @test charprop(Unicode.Category.Code, c_cn) == Unicode.Category.Cn
 end
 
 # Make sure fastplus is called for coverage