From 8365c8467010328f4d6838759c22deaee6574ab5 Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Sat, 12 Dec 2015 19:59:05 -0500 Subject: [PATCH 1/7] Unicode character properties --- base/exports.jl | 5 + base/io.jl | 4 +- base/unicode.jl | 17 ++- base/unicode/UnicodeError.jl | 43 ++++---- base/unicode/checkstring.jl | 40 +++---- base/unicode/properties.jl | 157 ++++++++++++++++++++++++++ base/unicode/types.jl | 21 ++-- base/unicode/utf16.jl | 16 +-- base/unicode/utf32.jl | 4 +- base/unicode/utf8.jl | 12 +- base/unicode/utf8proc.jl | 208 ++++++++++------------------------- test/unicode.jl | 3 +- test/unicode/UnicodeError.jl | 2 +- test/unicode/properties.jl | 164 +++++++++++++++++++++++++++ test/unicode/utf8proc.jl | 163 --------------------------- 15 files changed, 474 insertions(+), 385 deletions(-) create mode 100644 base/unicode/properties.jl create mode 100644 test/unicode/properties.jl diff --git a/base/exports.jl b/base/exports.jl index e05c856bd9a3f..00d714ceb74be 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -22,6 +22,7 @@ export Serializer, Docs, Markdown, + Unicode, # Types AbstractChannel, @@ -40,6 +41,8 @@ export CartesianIndex, CartesianRange, Channel, + CharCategory, + CharCategoryCode, Cmd, Colon, Complex, @@ -116,6 +119,7 @@ export SymTridiagonal, Timer, Tridiagonal, + UnicodeProperty, UnitRange, UpperTriangular, UTF16String, @@ -818,6 +822,7 @@ export bits, bytes2hex, bytestring, + charprop, charwidth, chomp, chop, diff --git a/base/io.jl b/base/io.jl index d2f17595b5f33..eaf25d7bae5d1 100644 --- a/base/io.jl +++ b/base/io.jl @@ -176,7 +176,7 @@ function read(s::IO, ::Type{Char}) end # mimic utf8.next function - trailing = Base.utf8_trailing[ch+1] + trailing = Unicode.utf8_trailing[ch+1] c::UInt32 = 0 for j = 1:trailing c += ch @@ -184,7 +184,7 @@ function read(s::IO, ::Type{Char}) ch = read(s, UInt8) end c += ch - c -= Base.utf8_offset[trailing+1] + c -= Unicode.utf8_offset[trailing+1] Char(c) end diff --git a/base/unicode.jl 
b/base/unicode.jl index e0ed8b5d1b0a8..a0f6096e75289 100644 --- a/base/unicode.jl +++ b/base/unicode.jl @@ -1,10 +1,25 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license +module Unicode +import Base: string, convert, write, length, endof, next, reverseind, lastidx, reverse, isvalid, + sizeof, unsafe_convert, map, getindex, search, rsearch, pointer, containsnul, + lowercase, uppercase, eltype, isless, promote_rule, == + +export UnicodeError, UTF16String, UTF32String, unsafe_checkstring, checkstring, + utf8, utf16, utf32, containsnul, WString, wstring, + charprop, CharCategoryCode, UnicodeProperty, CharCategory, CatLetter, CatMark, CatNumber, + CatPunctuation, CatSymbol, CatSeparator, CatOther, CatUpper, + is_assigned_char, islower, isupper, isdigit, isalpha, isnumber, isalnum, iscntrl, + ispunct, isspace, isprint, isgraph, + isgraphemebreak, GraphemeIterator, normalize_string, graphemes, charwidth + include("unicode/UnicodeError.jl") include("unicode/types.jl") include("unicode/checkstring.jl") include("unicode/utf8.jl") include("unicode/utf16.jl") include("unicode/utf32.jl") +include("unicode/properties.jl") include("unicode/utf8proc.jl") -importall .UTF8proc +end +importall .Unicode diff --git a/base/unicode/UnicodeError.jl b/base/unicode/UnicodeError.jl index 5b9002729ccf3..c2626f5167af5 100644 --- a/base/unicode/UnicodeError.jl +++ b/base/unicode/UnicodeError.jl @@ -2,30 +2,31 @@ ## Error messages for Unicode / UTF support -const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)" -const UTF_ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)" -const UTF_ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)" -const UTF_ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)" -const UTF_ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> 
(0x<<2>>)" -const UTF_ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>" -const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)" -const UTF_ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)" -const UTF_ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)" -const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated" -const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated" -const UTF_ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>" -const UTF_ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>" -const UTF_ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)" -const UTF_ERR_INVALID_8 = "invalid UTF-8 data" -const UTF_ERR_INVALID_16 = "invalid UTF-16 data" -const UTF_ERR_INVALID_INDEX = "invalid character index" -const UTF_ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead" +const ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)" +const ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)" +const ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)" +const ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)" +const ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)" +const ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>" +const ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)" +const ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)" +const ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)" 
+const ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated" +const ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated" +const ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>" +const ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>" +const ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)" +const ERR_INVALID_8 = "invalid UTF-8 data" +const ERR_INVALID_16 = "invalid UTF-16 data" +const ERR_INVALID_INDEX = "invalid character index" +const ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead" type UnicodeError <: Exception - errmsg::AbstractString ##< A UTF_ERR_ message + errmsg::AbstractString ##< A Unicode.ERR_ message errpos::Int32 ##< Position of invalid character errchr::UInt32 ##< Invalid character end -show(io::IO, exc::UnicodeError) = print(io, replace(replace(string("UnicodeError: ",exc.errmsg), - "<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr))) +Base.show(io::IO, exc::UnicodeError) = + print(io, replace(replace(string("UnicodeError: ",exc.errmsg), + "<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr))) diff --git a/base/unicode/checkstring.jl b/base/unicode/checkstring.jl index 8b9f344831f95..df185ede675f3 100644 --- a/base/unicode/checkstring.jl +++ b/base/unicode/checkstring.jl @@ -20,7 +20,7 @@ const UTF_SURROGATE = 32 ##< surrogate pairs present ## Get a UTF-8 continuation byte, give error if invalid, return updated character value @inline function get_continuation(ch::UInt32, byt::UInt8, pos) if !is_valid_continuation(byt) - throw(UnicodeError(UTF_ERR_CONT, pos, byt)) + throw(UnicodeError(ERR_CONT, pos, byt)) end (ch << 6) | (byt & 0x3f) end @@ -73,7 +73,7 @@ function unsafe_checkstring(dat::Vector{UInt8}, # Check UTF-8 encoding if ch < 0xe0 # 2-byte UTF-8 sequence (i.e. 
characters 0x80-0x7ff) - (pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) + (pos > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch)) byt, pos = next(dat, pos) ch = get_continuation(ch & 0x3f, byt, pos) if ch > 0x7f @@ -84,28 +84,28 @@ function unsafe_checkstring(dat::Vector{UInt8}, elseif (ch == 0) && accept_long_null flags |= UTF_LONG else - throw(UnicodeError(UTF_ERR_LONG, pos, ch)) + throw(UnicodeError(ERR_LONG, pos, ch)) end elseif ch < 0xf0 # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff) - (pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) + (pos + 1 > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch)) byt, pos = next(dat, pos) ch = get_continuation(ch & 0x0f, byt, pos) byt, pos = next(dat, pos) ch = get_continuation(ch, byt, pos) # check for surrogate pairs, make sure correct if is_surrogate_codeunit(ch) - !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch)) + !is_surrogate_lead(ch) && throw(UnicodeError(ERR_NOT_LEAD, pos-2, ch)) # next character *must* be a trailing surrogate character - (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch)) + (pos + 2 > endpos) && throw(UnicodeError(ERR_MISSING_SURROGATE, pos-2, ch)) byt, pos = next(dat, pos) - (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt)) + (byt != 0xed) && throw(UnicodeError(ERR_NOT_TRAIL, pos, byt)) byt, pos = next(dat, pos) surr = get_continuation(0x0000d, byt, pos) byt, pos = next(dat, pos) surr = get_continuation(surr, byt, pos) - !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr)) - !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr)) + !is_surrogate_trail(surr) && throw(UnicodeError(ERR_NOT_TRAIL, pos-2, surr)) + !accept_surrogates && throw(UnicodeError(ERR_SURROGATE, pos-2, surr)) flags |= UTF_SURROGATE num4byte += 1 elseif ch > 0x07ff @@ -114,11 +114,11 @@ function unsafe_checkstring(dat::Vector{UInt8}, flags |= UTF_LONG num2byte += 1 else - 
throw(UnicodeError(UTF_ERR_LONG, pos-2, ch)) + throw(UnicodeError(ERR_LONG, pos-2, ch)) end elseif ch < 0xf5 # 4-byte UTF-8 sequence (i.e. characters > 0xffff) - (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) + (pos + 2 > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch)) byt, pos = next(dat, pos) ch = get_continuation(ch & 0x07, byt, pos) byt, pos = next(dat, pos) @@ -126,11 +126,11 @@ function unsafe_checkstring(dat::Vector{UInt8}, byt, pos = next(dat, pos) ch = get_continuation(ch, byt, pos) if ch > 0x10ffff - throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch)) + throw(UnicodeError(ERR_INVALID, pos-3, ch)) elseif ch > 0xffff num4byte += 1 elseif is_surrogate_codeunit(ch) - throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch)) + throw(UnicodeError(ERR_SURROGATE, pos-3, ch)) elseif accept_long_char # This is an overly long encoded character flags |= UTF_LONG @@ -140,10 +140,10 @@ function unsafe_checkstring(dat::Vector{UInt8}, num2byte += 1 end else - throw(UnicodeError(UTF_ERR_LONG, pos-2, ch)) + throw(UnicodeError(ERR_LONG, pos-2, ch)) end else - throw(UnicodeError(UTF_ERR_INVALID, pos, ch)) + throw(UnicodeError(ERR_INVALID, pos, ch)) end end end @@ -174,22 +174,22 @@ function unsafe_checkstring{T <: Union{Vector{UInt16}, Vector{UInt32}, AbstractS num2byte += 1 flags |= UTF_UNICODE2 elseif ch > 0x0ffff - (ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, pos, ch)) + (ch > 0x10ffff) && throw(UnicodeError(ERR_INVALID, pos, ch)) num4byte += 1 elseif !is_surrogate_codeunit(ch) num3byte += 1 elseif is_surrogate_lead(ch) - pos > endpos && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, ch)) + pos > endpos && throw(UnicodeError(ERR_MISSING_SURROGATE, pos, ch)) # next character *must* be a trailing surrogate character ch, pos = next(dat, pos) - !is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch)) + !is_surrogate_trail(ch) && throw(UnicodeError(ERR_NOT_TRAIL, pos, ch)) num4byte += 1 if T != Vector{UInt16} - !accept_surrogates && 
throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch)) + !accept_surrogates && throw(UnicodeError(ERR_SURROGATE, pos, ch)) flags |= UTF_SURROGATE end else - throw(UnicodeError(UTF_ERR_NOT_LEAD, pos, ch)) + throw(UnicodeError(ERR_NOT_LEAD, pos, ch)) end end end diff --git a/base/unicode/properties.jl b/base/unicode/properties.jl new file mode 100644 index 0000000000000..e2cf4e99b6a00 --- /dev/null +++ b/base/unicode/properties.jl @@ -0,0 +1,157 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +# Unicode properties, such as General Category +# Unix/C is* convenience functions (for now) + +# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff +isvalid(::Type{Char}, ch::Unsigned) = !((ch - 0xd800 < 0x800) | (ch > 0x10ffff)) +isvalid(::Type{Char}, ch::Integer) = isvalid(Char, Unsigned(ch)) +isvalid(::Type{Char}, ch::Char) = isvalid(Char, UInt32(ch)) + +isvalid(ch::Char) = isvalid(Char, ch) + +# Unicode General Category constants + +"""Unicode character properties""" +abstract UnicodeProperty +"""Unicode character categories""" +abstract CharCategory <: UnicodeProperty + +"""Unicode letter character category""" +abstract CatLetter <: CharCategory +"""Unicode Mark character category""" +abstract CatMark <: CharCategory +"""Unicode Numeric character category""" +abstract CatNumber <: CharCategory +"""Unicode Punctuation character category""" +abstract CatPunctuation <: CharCategory +"""Unicode Symbol character category""" +abstract CatSymbol <: CharCategory +"""Unicode Separator character category""" +abstract CatSeparator <: CharCategory +"""Unicode Other character category""" +abstract CatOther <: CharCategory + +"""Unicode uppercase & titlecase letters""" +abstract CatUpper <: CatLetter + +"""Unicode Character Category Code (0-29)""" +bitstype 8 CharCategoryCode + +convert(::Type{CharCategoryCode}, x::Integer) = reinterpret(CharCategoryCode, x%UInt8) +convert{T<:Integer}(::Type{T}, x::CharCategoryCode) = 
convert(T, reinterpret(UInt8, x)) +promote_rule{T<:Integer}(::Type{T}, ::Type{CharCategoryCode}) = T +isless(x::CharCategoryCode, y::CharCategoryCode) = isless(UInt32(x), UInt32(y)) +isless(x::CharCategoryCode, y::Integer) = isless(UInt32(x), y) +isless(x::Integer, y::CharCategoryCode) = isless(x, UInt32(y)) + +for (nam, val, cat, typ, des) in + ((:Cn, 0, :NotAssignedChar, CatOther, "Other, Not assigned"), + (:Lu, 1, :UpperCase, CatUpper, "Letter, uppercase"), + (:Ll, 2, :LowerCase, CatLetter, "Letter, lowercase"), + (:Lt, 3, :TitleCase, CatUpper, "Letter, titlecase"), + (:Lm, 4, :ModifierLetter, CatLetter, "Letter, modifier"), + (:Lo, 5, :OtherLetter, CatLetter, "Letter, other"), + (:Mn, 6, :NonSpacingMark, CatMark, "Mark, nonspacing"), + (:Mc, 7, :CombiningMark, CatMark, "Mark, spacing combining"), + (:Me, 8, :EnclosingMark, CatMark, "Mark, enclosing"), + (:Nd, 9, :DecimalDigit, CatNumber, "Number, decimal digit"), + (:Nl, 10, :NumericLetter, CatNumber, "Number, letter"), + (:No, 11, :OtherNumber, CatNumber, "Number, other"), + (:Pc, 12, :ConnectorPunct, CatPunctuation, "Punctuation, connector"), + (:Pd, 13, :DashPunct, CatPunctuation, "Punctuation, dash"), + (:Ps, 14, :OpenPunct, CatPunctuation, "Punctuation, open"), + (:Pe, 15, :ClosePunct, CatPunctuation, "Punctuation, close"), + (:Pi, 16, :BegQuotePunct, CatPunctuation, "Punctuation, initial quote"), + (:Pf, 17, :EndQuotePunct, CatPunctuation, "Punctuation, final quote"), + (:Po, 18, :OtherPunct, CatPunctuation, "Punctuation, other"), + (:Sm, 19, :MathSymbol, CatSymbol, "Symbol, math"), + (:Sc, 20, :CurrencySymbol, CatSymbol, "Symbol, currency"), + (:Sk, 21, :ModifierSymbol, CatSymbol, "Symbol, modifier"), + (:So, 22, :OtherSymbol, CatSymbol, "Symbol, other"), + (:Zs, 23, :SpaceSeparator, CatSeparator, "Separator, space"), + (:Zl, 24, :LineSeparator, CatSeparator, "Separator, line"), + (:Zp, 25, :ParagraphSeparator, CatSeparator, "Separator, paragraph"), + (:Cc, 26, :ControlChar, CatOther, "Other, control"), 
+ (:Cf, 27, :FormatChar, CatOther, "Other, format"), + (:Cs, 28, :SurrogateChar, CatOther, "Other, surrogate"), + (:Co, 29, :PrivateUseChar, CatOther, "Other, private use")) + @eval const global $nam = CharCategoryCode($val) + @eval export $cat + @eval abstract $cat <: $typ + @eval @doc $(string("Unicode Category Code: ",des)) $nam + @eval @doc $(string("Unicode Category Type: ",des)) $cat +end + +const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, OtherLetter, + NonSpacingMark, CombiningMark, EnclosingMark, + DecimalDigit, NumericLetter, OtherNumber, + ConnectorPunct, DashPunct, OpenPunct, ClosePunct, + BegQuotePunct, EndQuotePunct, OtherPunct, + MathSymbol, CurrencySymbol, ModifierSymbol, OtherSymbol, + SpaceSeparator, LineSeparator, ParagraphSeparator, + ControlChar, FormatChar, SurrogateChar, PrivateUseChar] + +############################################################################ + + +""" +Return various Unicode properties for character +""" +function charprop end + +charprop(::Type{CharCategory}, c) = c2t[Int(charprop(CharCategoryCode, c))+1] + +is_assigned_char(c) = charprop(CharCategoryCode, c) != Cn + +## libc character class predicates ## + +islower(c::Char) = charprop(CharCategoryCode, c) == Ll + +# true for Unicode upper and mixed case +isupper(c::Char) = (ccode = charprop(CharCategoryCode, c)) == Lu || ccode == Lt + +isdigit(c::Char) = ('0' <= c <= '9') +isalpha(c::Char) = (Lu <= charprop(CharCategoryCode, c) <= Lo) +isnumber(c::Char) = (Nd <= charprop(CharCategoryCode, c) <= No) +isalnum(c::Char) = (Lu <= (ccode = charprop(CharCategoryCode, c)) <= Lo) || (Nd <= ccode <= No) + +# These are about 3 times slower, because the isa method +# is much slower than checking if an integer is within range (or two ranges) +# If that is sped up, then these, which are more readable, could replace the other forms. 
+#= +isalpha(c::Char) = charprop(CharCategory, c) <: CatLetter +isnumber(c::Char) = charprop(CharCategory, c) <: CatNumber +isupper(c::Char) = charprop(CharCategory, c) <: CatUpper +isalnum(c::Char) = charprop(CharCategory, c) <: Union{CatLetter, CatNumber} +ispunct(c::Char) = charprop(CharCategory, c) <: CatPunctuation +=# + +# following C++ only control characters from the Latin-1 subset return true +iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f)) + +ispunct(c::Char) = (Pc <= charprop(CharCategoryCode, c) <= Po) + +# \u85 is the Unicode Next Line (NEL) character +# the check for \ufffd allows for branch removal on ASCIIStrings +@inline isspace(c::Char) = + (c == ' ' || '\t' <= c <='\r' || c == '\u85' || + ('\ua0' <= c && c != '\ufffd' && charprop(CharCategoryCode, c) == Zs)) + +isprint(c::Char) = (Lu <= charprop(CharCategoryCode, c) <= Zs) + +# true in principle if a printer would use ink +isgraph(c::Char) = (Lu <= charprop(CharCategoryCode, c) <= So) + +for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph", + "lower", "print", "punct", "space", "upper") + f = symbol("is",name) + @eval begin + function $f(s::AbstractString) + for c in s + $f(c) || return false + end + return true + end + end +end diff --git a/base/unicode/types.jl b/base/unicode/types.jl index 52765a853303b..9c6dc990921e4 100644 --- a/base/unicode/types.jl +++ b/base/unicode/types.jl @@ -1,29 +1,31 @@ # This file is a part of Julia. 
License is MIT: http://julialang.org/license -##\brief Base UTF16String type, has 16-bit NULL termination word after data, native byte order -# -# \throws UnicodeError +""" +Base UTF16String type, has 16-bit NULL termination word after data, native byte order +Throws: UnicodeError +""" immutable UTF16String <: AbstractString data::Vector{UInt16} # includes 16-bit NULL termination after string chars function UTF16String(data::Vector{UInt16}) if length(data) < 1 || data[end] != 0 - throw(UnicodeError(UTF_ERR_NULL_16_TERMINATE, 0, 0)) + throw(UnicodeError(ERR_NULL_16_TERMINATE, 0, 0)) end new(data) end end -##\brief Base UTF32String type, has 32-bit NULL termination word after data, native byte order -# -# \throws UnicodeError +""" +Base UTF32String type, has 32-bit NULL termination word after data, native byte order +Throws: UnicodeError +""" immutable UTF32String <: DirectIndexString data::Vector{UInt32} # includes 32-bit NULL termination after string chars function UTF32String(data::Vector{UInt32}) if length(data) < 1 || data[end] != 0 - throw(UnicodeError(UTF_ERR_NULL_32_TERMINATE, 0, 0)) + throw(UnicodeError(ERR_NULL_32_TERMINATE, 0, 0)) end new(data) end @@ -31,4 +33,5 @@ end UTF32String(data::Vector{Char}) = UTF32String(reinterpret(UInt32, data)) isvalid{T<:Union{ASCIIString,UTF8String,UTF16String,UTF32String}}(str::T) = isvalid(T, str.data) -isvalid{T<:Union{ASCIIString,UTF8String,UTF16String,UTF32String}}(::Type{T}, str::T) = isvalid(T, str.data) +isvalid{T<:Union{ASCIIString,UTF8String,UTF16String,UTF32String}}(::Type{T}, str::T) = + isvalid(T, str.data) diff --git a/base/unicode/utf16.jl b/base/unicode/utf16.jl index 712adbb75a896..551b8f97b3f25 100644 --- a/base/unicode/utf16.jl +++ b/base/unicode/utf16.jl @@ -54,10 +54,10 @@ function next(s::UTF16String, i::Int) ch = s.data[i] !is_surrogate_codeunit(ch) && return (Char(ch), i+1) # check length, account for terminating \0 - i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, 
UInt32(ch))) - !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch)) + i >= (length(s.data)-1) && throw(UnicodeError(ERR_MISSING_SURROGATE, i, UInt32(ch))) + !is_surrogate_lead(ch) && throw(UnicodeError(ERR_NOT_LEAD, i, ch)) ct = s.data[i+1] - !is_surrogate_trail(ct) && throw((UTF_ERR_NOT_TRAIL, i, ch)) + !is_surrogate_trail(ct) && throw(UnicodeError(ERR_NOT_TRAIL, i, ch)) Char(get_supplementary(ch, ct)), i+2 end @@ -222,7 +222,7 @@ end function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) isempty(bytes) && return UTF16String(UInt16[0]) - isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0)) + isodd(length(bytes)) && throw(UnicodeError(ERR_ODD_BYTES_16, length(bytes), 0)) data = reinterpret(UInt16, bytes) # check for byte-order mark (BOM): if data[1] == 0xfeff # native byte order @@ -238,7 +238,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) copy!(d,1, data,1, length(data)) # assume native byte order end d[end] = 0 # NULL terminate - !isvalid(UTF16String, d) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0)) + !isvalid(UTF16String, d) && throw(UnicodeError(ERR_INVALID_16,0,0)) UTF16String(d) end @@ -257,19 +257,19 @@ function map(fun, str::UTF16String) for ch in str c2 = fun(ch) if !isa(c2, Char) - throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0)) + throw(UnicodeError(ERR_MAP_CHAR, 0, 0)) end uc = UInt32(c2) if uc < 0x10000 if is_surrogate_codeunit(UInt16(uc)) - throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc)) + throw(UnicodeError(ERR_INVALID_CHAR, 0, uc)) end push!(buf, UInt16(uc)) elseif uc <= 0x10ffff push!(buf, UInt16(0xd7c0 + (uc >> 10))) push!(buf, UInt16(0xdc00 + (uc & 0x3ff))) else - throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc)) + throw(UnicodeError(ERR_INVALID_CHAR, 0, uc)) end end push!(buf, 0) diff --git a/base/unicode/utf32.jl b/base/unicode/utf32.jl index 4b9ebeee4f278..b53fff9450e34 100644 --- a/base/unicode/utf32.jl +++ b/base/unicode/utf32.jl @@ -149,7 +149,7 @@
unsafe_convert{T<:Union{UInt32,Int32,Char}}(::Type{Ptr{T}}, s::UTF32String) = function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8}) isempty(bytes) && return empty_utf32 - length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0)) + length(bytes) & 3 != 0 && throw(UnicodeError(ERR_ODD_BYTES_32,0,0)) data = reinterpret(UInt32, bytes) # check for byte-order mark (BOM): if data[1] == 0x0000feff # native byte order @@ -194,7 +194,7 @@ function map(f, s::UTF32String) @inbounds for i = 1:(length(d)-1) c2 = f(Char(d[i])) if !isa(c2, Char) - throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0)) + throw(UnicodeError(ERR_MAP_CHAR, 0, 0)) end out[i] = (c2::Char) end diff --git a/base/unicode/utf8.jl b/base/unicode/utf8.jl index 5f278c0e18b4b..4f8d89b7e1d20 100644 --- a/base/unicode/utf8.jl +++ b/base/unicode/utf8.jl @@ -62,7 +62,7 @@ function next(s::UTF8String, i::Int) d = s.data b = d[i] if is_valid_continuation(b) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i])) + throw(UnicodeError(ERR_INVALID_INDEX, i, d[i])) end trailing = utf8_trailing[b+1] if length(d) < i + trailing @@ -114,7 +114,7 @@ function getindex(s::UTF8String, r::UnitRange{Int}) throw(BoundsError(s, i)) end if is_valid_continuation(d[i]) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i])) + throw(UnicodeError(ERR_INVALID_INDEX, i, d[i])) end if j > length(d) throw(BoundsError()) @@ -130,7 +130,7 @@ function search(s::UTF8String, c::Char, i::Integer) end d = s.data if is_valid_continuation(d[i]) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i])) + throw(UnicodeError(ERR_INVALID_INDEX, i, d[i])) end c < Char(0x80) && return search(d, c%UInt8, i) while true @@ -203,16 +203,16 @@ function reverse(s::UTF8String) ch = dat[pos] if ch > 0xdf if ch < 0xf0 - (out -= 3) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) + (out -= 3) < 0 && throw(UnicodeError(ERR_SHORT, pos, ch)) buf[out + 1], buf[out + 2], buf[out + 3] = ch, dat[pos + 1], dat[pos + 2] pos += 3 else - (out -= 4) < 0 && 
throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) + (out -= 4) < 0 && throw(UnicodeError(ERR_SHORT, pos, ch)) buf[out+1], buf[out+2], buf[out+3], buf[out+4] = ch, dat[pos+1], dat[pos+2], dat[pos+3] pos += 4 end elseif ch > 0x7f - (out -= 2) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) + (out -= 2) < 0 && throw(UnicodeError(ERR_SHORT, pos, ch)) buf[out + 1], buf[out + 2] = ch, dat[pos + 1] pos += 2 else diff --git a/base/unicode/utf8proc.jl b/base/unicode/utf8proc.jl index 4b8ee196cb9fb..7968d27a8c276 100644 --- a/base/unicode/utf8proc.jl +++ b/base/unicode/utf8proc.jl @@ -1,70 +1,22 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license # Various Unicode functionality from the utf8proc library -module UTF8proc - -import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert, isvalid, lowercase, uppercase - -export isgraphemebreak - -# also exported by Base: -export normalize_string, graphemes, is_assigned_char, charwidth, isvalid, - islower, isupper, isalpha, isdigit, isnumber, isalnum, - iscntrl, ispunct, isspace, isprint, isgraph, isblank - -# whether codepoints are valid Unicode scalar values, i.e. 
0-0xd7ff, 0xe000-0x10ffff -isvalid(::Type{Char}, ch::Unsigned) = !((ch - 0xd800 < 0x800) | (ch > 0x10ffff)) -isvalid(::Type{Char}, ch::Integer) = isvalid(Char, Unsigned(ch)) -isvalid(::Type{Char}, ch::Char) = isvalid(Char, UInt32(ch)) - -isvalid(ch::Char) = isvalid(Char, ch) - -# utf8 category constants -const UTF8PROC_CATEGORY_CN = 0 -const UTF8PROC_CATEGORY_LU = 1 -const UTF8PROC_CATEGORY_LL = 2 -const UTF8PROC_CATEGORY_LT = 3 -const UTF8PROC_CATEGORY_LM = 4 -const UTF8PROC_CATEGORY_LO = 5 -const UTF8PROC_CATEGORY_MN = 6 -const UTF8PROC_CATEGORY_MC = 7 -const UTF8PROC_CATEGORY_ME = 8 -const UTF8PROC_CATEGORY_ND = 9 -const UTF8PROC_CATEGORY_NL = 10 -const UTF8PROC_CATEGORY_NO = 11 -const UTF8PROC_CATEGORY_PC = 12 -const UTF8PROC_CATEGORY_PD = 13 -const UTF8PROC_CATEGORY_PS = 14 -const UTF8PROC_CATEGORY_PE = 15 -const UTF8PROC_CATEGORY_PI = 16 -const UTF8PROC_CATEGORY_PF = 17 -const UTF8PROC_CATEGORY_PO = 18 -const UTF8PROC_CATEGORY_SM = 19 -const UTF8PROC_CATEGORY_SC = 20 -const UTF8PROC_CATEGORY_SK = 21 -const UTF8PROC_CATEGORY_SO = 22 -const UTF8PROC_CATEGORY_ZS = 23 -const UTF8PROC_CATEGORY_ZL = 24 -const UTF8PROC_CATEGORY_ZP = 25 -const UTF8PROC_CATEGORY_CC = 26 -const UTF8PROC_CATEGORY_CF = 27 -const UTF8PROC_CATEGORY_CS = 28 -const UTF8PROC_CATEGORY_CO = 29 - -const UTF8PROC_STABLE = (1<<1) -const UTF8PROC_COMPAT = (1<<2) -const UTF8PROC_COMPOSE = (1<<3) -const UTF8PROC_DECOMPOSE = (1<<4) -const UTF8PROC_IGNORE = (1<<5) -const UTF8PROC_REJECTNA = (1<<6) -const UTF8PROC_NLF2LS = (1<<7) -const UTF8PROC_NLF2PS = (1<<8) -const UTF8PROC_NLF2LF = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS) -const UTF8PROC_STRIPCC = (1<<9) -const UTF8PROC_CASEFOLD = (1<<10) -const UTF8PROC_CHARBOUND = (1<<11) -const UTF8PROC_LUMP = (1<<12) -const UTF8PROC_STRIPMARK = (1<<13) + +const STABLE = (1<<1) +const COMPAT = (1<<2) +const COMPOSE = (1<<3) +const DECOMPOSE = (1<<4) +const IGNORE = (1<<5) +const REJECTNA = (1<<6) +const NLF2LS = (1<<7) +const NLF2PS = (1<<8) +const STRIPCC = (1<<9) 
+const CASEFOLD = (1<<10) +const CHARBOUND = (1<<11) +const LUMP = (1<<12) +const STRIPMARK = (1<<13) + +const NLF2LF = (NLF2LS | NLF2PS) ############################################################################ @@ -80,106 +32,61 @@ end utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(bytestring(s), flags) -function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false) +function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=false, + compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, + rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, + newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, + lump::Bool=false, stripmark::Bool=false) flags = 0 - stable && (flags = flags | UTF8PROC_STABLE) - compat && (flags = flags | UTF8PROC_COMPAT) + stable && (flags = flags | STABLE) + compat && (flags = flags | COMPAT) if decompose - flags = flags | UTF8PROC_DECOMPOSE + flags = flags | DECOMPOSE elseif compose - flags = flags | UTF8PROC_COMPOSE + flags = flags | COMPOSE elseif compat || stripmark throw(ArgumentError("compat=true or stripmark=true require compose=true or decompose=true")) end - stripignore && (flags = flags | UTF8PROC_IGNORE) - rejectna && (flags = flags | UTF8PROC_REJECTNA) - newline2ls + newline2ps + newline2lf > 1 && throw(ArgumentError("only one newline conversion may be specified")) - newline2ls && (flags = flags | UTF8PROC_NLF2LS) - newline2ps && (flags = flags | UTF8PROC_NLF2PS) - newline2lf && (flags = flags | UTF8PROC_NLF2LF) - stripcc && (flags = flags | UTF8PROC_STRIPCC) - casefold && (flags = flags | UTF8PROC_CASEFOLD) - lump && (flags = flags | UTF8PROC_LUMP) - stripmark && (flags = flags | 
UTF8PROC_STRIPMARK) + stripignore && (flags = flags | IGNORE) + rejectna && (flags = flags | REJECTNA) + newline2ls + newline2ps + newline2lf > 1 && + throw(ArgumentError("only one newline conversion may be specified")) + newline2ls && (flags = flags | NLF2LS) + newline2ps && (flags = flags | NLF2PS) + newline2lf && (flags = flags | NLF2LF) + stripcc && (flags = flags | STRIPCC) + casefold && (flags = flags | CASEFOLD) + lump && (flags = flags | LUMP) + stripmark && (flags = flags | STRIPMARK) utf8proc_map(s, flags) end -function normalize_string(s::AbstractString, nf::Symbol) - utf8proc_map(s, nf == :NFC ? (UTF8PROC_STABLE | UTF8PROC_COMPOSE) : - nf == :NFD ? (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE) : - nf == :NFKC ? (UTF8PROC_STABLE | UTF8PROC_COMPOSE - | UTF8PROC_COMPAT) : - nf == :NFKD ? (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE - | UTF8PROC_COMPAT) : +normalize_string(s::AbstractString, nf::Symbol) = + utf8proc_map(s, nf == :NFC ? (STABLE | COMPOSE) : + nf == :NFD ? (STABLE | DECOMPOSE) : + nf == :NFKC ? (STABLE | COMPOSE | COMPAT) : + nf == :NFKD ? (STABLE | DECOMPOSE | COMPAT) : throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))) -end ############################################################################ charwidth(c::Char) = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c)) -lowercase(c::Char) = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) : Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c)) -uppercase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c)) +lowercase(c::Char) = (isascii(c) + ? ('A' <= c <= 'Z' ? c + 0x20 : c) + : Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))) +uppercase(c::Char) = (isascii(c) + ? ('a' <= c <= 'z' ? 
c - 0x20 : c) + : Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))) ############################################################################ -# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category -function category_code(c) - return ccall(:utf8proc_category, Cint, (UInt32,), c) -end - -is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN - -## libc character class predicates ## - -islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL) - -# true for Unicode upper and mixed case -function isupper(c::Char) - ccode = category_code(c) - return ccode == UTF8PROC_CATEGORY_LU || ccode == UTF8PROC_CATEGORY_LT -end - -isdigit(c::Char) = ('0' <= c <= '9') -isalpha(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO) -isnumber(c::Char) = (UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO) - -function isalnum(c::Char) - ccode = category_code(c) - return (UTF8PROC_CATEGORY_LU <= ccode <= UTF8PROC_CATEGORY_LO) || - (UTF8PROC_CATEGORY_ND <= ccode <= UTF8PROC_CATEGORY_NO) -end - -# following C++ only control characters from the Latin-1 subset return true -iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f)) - -ispunct(c::Char) = (UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO) - -# \u85 is the Unicode Next Line (NEL) character -# the check for \ufffd allows for branch removal on ASCIIStrings -@inline isspace(c::Char) = c == ' ' || '\t' <= c <='\r' || c == '\u85' || '\ua0' <= c && c != '\ufffd' && category_code(c)==UTF8PROC_CATEGORY_ZS - -isprint(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS) - -# true in principal if a printer would use ink -isgraph(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO) - -for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph", - "lower", "print", "punct", "space", "upper") - f = symbol("is",name) - @eval begin - function $f(s::AbstractString) - for c in s - if !$f(c) - return 
false - end - end - return true - end - end -end +# returns CharCategoryCode (enum values 0:29) giving Unicode category +charprop(::Type{CharCategoryCode}, c) = + CharCategoryCode(ccall(:utf8proc_category, Cint, (UInt32,), c)) ############################################################################ + # iterators for grapheme segmentation isgraphemebreak(c1::Char, c2::Char) = @@ -190,7 +97,7 @@ immutable GraphemeIterator{S<:AbstractString} end graphemes(s::AbstractString) = GraphemeIterator{typeof(s)}(s) -eltype{S}(::Type{GraphemeIterator{S}}) = SubString{S} +Base.eltype{S}(::Type{GraphemeIterator{S}}) = SubString{S} function length(g::GraphemeIterator) c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this) @@ -202,8 +109,8 @@ function length(g::GraphemeIterator) return n end -start(g::GraphemeIterator) = start(g.s) -done(g::GraphemeIterator, i) = done(g.s, i) +Base.start(g::GraphemeIterator) = start(g.s) +Base.done(g::GraphemeIterator, i) = done(g.s, i) function next(g::GraphemeIterator, i) s = g.s @@ -220,13 +127,12 @@ function next(g::GraphemeIterator, i) end ==(g1::GraphemeIterator, g2::GraphemeIterator) = g1.s == g2.s -hash(g::GraphemeIterator, h::UInt) = hash(g.s, h) +Base.hash(g::GraphemeIterator, h::UInt) = hash(g.s, h) isless(g1::GraphemeIterator, g2::GraphemeIterator) = isless(g1.s, g2.s) convert{S<:AbstractString}(::Type{S}, g::GraphemeIterator) = convert(S, g.s) -show{S}(io::IO, g::GraphemeIterator{S}) = print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"") +Base.show{S}(io::IO, g::GraphemeIterator{S}) = + print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"") ############################################################################ - -end # module diff --git a/test/unicode.jl b/test/unicode.jl index 21f3dd7d48fb4..3f0004c7ac737 100644 --- a/test/unicode.jl +++ b/test/unicode.jl @@ -1,9 +1,10 @@ # This file is a part of Julia. 
License is MIT: http://julialang.org/license - include("unicode/UnicodeError.jl") include("unicode/types.jl") include("unicode/checkstring.jl") include("unicode/utf8.jl") include("unicode/utf16.jl") include("unicode/utf32.jl") +include("unicode/properties.jl") include("unicode/utf8proc.jl") + diff --git a/test/unicode/UnicodeError.jl b/test/unicode/UnicodeError.jl index 0f78ab85bb94d..272e3bffd6878 100644 --- a/test/unicode/UnicodeError.jl +++ b/test/unicode/UnicodeError.jl @@ -1,7 +1,7 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license let io = IOBuffer() - show(io, UnicodeError(Base.UTF_ERR_SHORT, 1, 10)) + show(io, UnicodeError(Unicode.ERR_SHORT, 1, 10)) check = "UnicodeError: invalid UTF-8 sequence starting at index 1 (0xa) missing one or more continuation bytes)" @test takebuf_string(io) == check end diff --git a/test/unicode/properties.jl b/test/unicode/properties.jl new file mode 100644 index 0000000000000..735eed3d04fbe --- /dev/null +++ b/test/unicode/properties.jl @@ -0,0 +1,164 @@ +# This file is a part of Julia. 
License is MIT: http://julialang.org/license + +#issue #5939 uft8proc/libmojibake character predicates +let + alower=['a', 'd', 'j', 'y', 'z'] + ulower=['α', 'β', 'γ', 'δ', 'ф', 'я'] + for c in vcat(alower,ulower) + @test islower(c) == true + @test isupper(c) == false + @test isdigit(c) == false + @test isnumber(c) == false + end + + aupper=['A', 'D', 'J', 'Y', 'Z'] + uupper= ['Δ', 'Γ', 'Π', 'Ψ', 'Dž', 'Ж', 'Д'] + + for c in vcat(aupper,uupper) + @test islower(c) == false + @test isupper(c) == true + @test isdigit(c) == false + @test isnumber(c) == false + end + + nocase=['א','ﺵ'] + alphas=vcat(alower,ulower,aupper,uupper,nocase) + + for c in alphas + @test isalpha(c) == true + @test isnumber(c) == false + end + + + anumber=['0', '1', '5', '9'] + unumber=['٣', '٥', '٨', '¹', 'ⅳ' ] + + for c in anumber + @test isdigit(c) == true + @test isnumber(c) == true + end + for c in unumber + @test isdigit(c) == false + @test isnumber(c) == true + end + + alnums=vcat(alphas,anumber,unumber) + for c in alnums + @test isalnum(c) == true + @test ispunct(c) == false + end + + asymbol = ['(',')', '~', '$' ] + usymbol = ['∪', '∩', '⊂', '⊃', '√', '€', '¥', '↰', '△', '§'] + + apunct =['.',',',';',':','&'] + upunct =['‡', '؟', '჻' ] + + for c in vcat(apunct,upunct) + @test ispunct(c) == true + @test isalnum(c) == false + end + + for c in vcat(alnums,asymbol,usymbol,apunct,upunct) + @test isprint(c) == true + @test isgraph(c) == true + @test isspace(c) == false + @test iscntrl(c) == false + end + + NBSP = Char(0x0000A0) + ENSPACE = Char(0x002002) + EMSPACE = Char(0x002003) + THINSPACE = Char(0x002009) + ZWSPACE = Char(0x002060) + + uspace = [ENSPACE, EMSPACE, THINSPACE] + aspace = [' '] + acntrl_space = ['\t', '\n', '\v', '\f', '\r'] + for c in vcat(aspace,uspace) + @test isspace(c) == true + @test isprint(c) == true + @test isgraph(c) == false + end + + for c in vcat(acntrl_space) + @test isspace(c) == true + @test isprint(c) == false + @test isgraph(c) == false + end + + @test 
isspace(ZWSPACE) == false # zero-width space + + acontrol = [ Char(0x001c), Char(0x001d), Char(0x001e), Char(0x001f)] + latincontrol = [ Char(0x0080), Char(0x0085) ] + ucontrol = [ Char(0x200E), Char(0x202E) ] + + for c in vcat(acontrol, acntrl_space, latincontrol) + @test iscntrl(c) == true + @test isalnum(c) == false + @test isprint(c) == false + @test isgraph(c) == false + end + + for c in ucontrol #non-latin1 controls + if c!=Char(0x0085) + @test iscntrl(c) == false + @test isspace(c) == false + @test isalnum(c) == false + @test isprint(c) == false + @test isgraph(c) == false + end + end + +end + +@test isspace(" \t \n \r ")==true +@test isgraph(" \t \n \r ")==false +@test isprint(" \t \n \r ")==false +@test isalpha(" \t \n \r ")==false +@test isnumber(" \t \n \r ")==false +@test ispunct(" \t \n \r ")==false + +@test isspace("ΣβΣβ")==false +@test isalpha("ΣβΣβ")==true +@test isgraph("ΣβΣβ")==true +@test isprint("ΣβΣβ")==true +@test isupper("ΣβΣβ")==false +@test islower("ΣβΣβ")==false +@test isnumber("ΣβΣβ")==false +@test iscntrl("ΣβΣβ")==false +@test ispunct("ΣβΣβ")==false + +@test isnumber("23435")==true +@test isdigit("23435")==true +@test isalnum("23435")==true +@test isalpha("23435")==false +@test iscntrl( string(Char(0x0080))) == true +@test ispunct( "‡؟჻") ==true + +@test isxdigit('0') == true +@test isxdigit("0") == true +@test isxdigit("a") == true +@test isxdigit("g") == false + +# check handling of CN category constants +let c_ll = 'β', c_cn = '\u038B' + @test charprop(CharCategoryCode, c_ll) == Unicode.Ll + # check codepoint with category code CN + @test charprop(CharCategoryCode, c_cn) == Unicode.Cn +end + +# Make sure fastplus is called for coverage +@test lowercase('A') == 'a' +@test uppercase('a') == 'A' + +@test is_assigned_char('A') + +# Get full coverage of isspace function +@test isspace(' ') +@test isspace('\t') +@test isspace('\r') +@test isspace('\u85') +@test isspace('\ua0') +@test !isspace('\ufffd') +@test !isspace('\U10ffff') diff --git 
a/test/unicode/utf8proc.jl b/test/unicode/utf8proc.jl index 4f979c347b721..c54e1ac65b41d 100644 --- a/test/unicode/utf8proc.jl +++ b/test/unicode/utf8proc.jl @@ -75,154 +75,6 @@ end @test normalize_string("\U1e9b\U0323", :NFKD) == "s\U0323\U0307" @test normalize_string("\U1e9b\U0323", :NFKC) == "\U1e69" -#issue #5939 uft8proc/libmojibake character predicates -let - alower=['a', 'd', 'j', 'y', 'z'] - ulower=['α', 'β', 'γ', 'δ', 'ф', 'я'] - for c in vcat(alower,ulower) - @test islower(c) == true - @test isupper(c) == false - @test isdigit(c) == false - @test isnumber(c) == false - end - - aupper=['A', 'D', 'J', 'Y', 'Z'] - uupper= ['Δ', 'Γ', 'Π', 'Ψ', 'Dž', 'Ж', 'Д'] - - for c in vcat(aupper,uupper) - @test islower(c) == false - @test isupper(c) == true - @test isdigit(c) == false - @test isnumber(c) == false - end - - nocase=['א','ﺵ'] - alphas=vcat(alower,ulower,aupper,uupper,nocase) - - for c in alphas - @test isalpha(c) == true - @test isnumber(c) == false - end - - - anumber=['0', '1', '5', '9'] - unumber=['٣', '٥', '٨', '¹', 'ⅳ' ] - - for c in anumber - @test isdigit(c) == true - @test isnumber(c) == true - end - for c in unumber - @test isdigit(c) == false - @test isnumber(c) == true - end - - alnums=vcat(alphas,anumber,unumber) - for c in alnums - @test isalnum(c) == true - @test ispunct(c) == false - end - - asymbol = ['(',')', '~', '$' ] - usymbol = ['∪', '∩', '⊂', '⊃', '√', '€', '¥', '↰', '△', '§'] - - apunct =['.',',',';',':','&'] - upunct =['‡', '؟', '჻' ] - - for c in vcat(apunct,upunct) - @test ispunct(c) == true - @test isalnum(c) == false - end - - for c in vcat(alnums,asymbol,usymbol,apunct,upunct) - @test isprint(c) == true - @test isgraph(c) == true - @test isspace(c) == false - @test iscntrl(c) == false - end - - NBSP = Char(0x0000A0) - ENSPACE = Char(0x002002) - EMSPACE = Char(0x002003) - THINSPACE = Char(0x002009) - ZWSPACE = Char(0x002060) - - uspace = [ENSPACE, EMSPACE, THINSPACE] - aspace = [' '] - acntrl_space = ['\t', '\n', '\v', '\f', 
'\r'] - for c in vcat(aspace,uspace) - @test isspace(c) == true - @test isprint(c) == true - @test isgraph(c) == false - end - - for c in vcat(acntrl_space) - @test isspace(c) == true - @test isprint(c) == false - @test isgraph(c) == false - end - - @test isspace(ZWSPACE) == false # zero-width space - - acontrol = [ Char(0x001c), Char(0x001d), Char(0x001e), Char(0x001f)] - latincontrol = [ Char(0x0080), Char(0x0085) ] - ucontrol = [ Char(0x200E), Char(0x202E) ] - - for c in vcat(acontrol, acntrl_space, latincontrol) - @test iscntrl(c) == true - @test isalnum(c) == false - @test isprint(c) == false - @test isgraph(c) == false - end - - for c in ucontrol #non-latin1 controls - if c!=Char(0x0085) - @test iscntrl(c) == false - @test isspace(c) == false - @test isalnum(c) == false - @test isprint(c) == false - @test isgraph(c) == false - end - end - -end - -@test isspace(" \t \n \r ")==true -@test isgraph(" \t \n \r ")==false -@test isprint(" \t \n \r ")==false -@test isalpha(" \t \n \r ")==false -@test isnumber(" \t \n \r ")==false -@test ispunct(" \t \n \r ")==false - -@test isspace("ΣβΣβ")==false -@test isalpha("ΣβΣβ")==true -@test isgraph("ΣβΣβ")==true -@test isprint("ΣβΣβ")==true -@test isupper("ΣβΣβ")==false -@test islower("ΣβΣβ")==false -@test isnumber("ΣβΣβ")==false -@test iscntrl("ΣβΣβ")==false -@test ispunct("ΣβΣβ")==false - -@test isnumber("23435")==true -@test isdigit("23435")==true -@test isalnum("23435")==true -@test isalpha("23435")==false -@test iscntrl( string(Char(0x0080))) == true -@test ispunct( "‡؟჻") ==true - -@test isxdigit('0') == true -@test isxdigit("0") == true -@test isxdigit("a") == true -@test isxdigit("g") == false - -# check utf8proc handling of CN category constants -let c_ll = 'β', c_cn = '\u038B' - @test Base.UTF8proc.category_code(c_ll) == Base.UTF8proc.UTF8PROC_CATEGORY_LL - # check codepoint with category code CN - @test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN -end - # graphemes let grphtest = 
(("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h", "β","l","a","h", @@ -269,21 +121,6 @@ end @test_throws ArgumentError normalize_string("\u006e\u0303", compose=false, compat=true) @test_throws ArgumentError normalize_string("\u006e\u0303", compose=false, stripmark=true) -# Make sure fastplus is called for coverage -@test lowercase('A') == 'a' -@test uppercase('a') == 'A' - -@test is_assigned_char('A') - -# Get full coverage of isspace function -@test isspace(' ') -@test isspace('\t') -@test isspace('\r') -@test isspace('\u85') -@test isspace('\ua0') -@test !isspace('\ufffd') -@test !isspace('\U10ffff') - # Get full coverage of grapheme iterator functions let str = ascii("This is a test") g = graphemes(str) From 2c53e526d8d0582ec77cd70c92ff72aa17c982a5 Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Sun, 13 Dec 2015 17:19:53 -0500 Subject: [PATCH 2/7] Update to use submodule --- base/exports.jl | 5 +- base/unicode.jl | 4 +- base/unicode/properties.jl | 156 +++++++++++++++++++------------------ base/unicode/utf8proc.jl | 5 +- test/unicode/properties.jl | 4 +- 5 files changed, 89 insertions(+), 85 deletions(-) diff --git a/base/exports.jl b/base/exports.jl index 00d714ceb74be..803a41125ff33 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -22,6 +22,7 @@ export Serializer, Docs, Markdown, + Cat, Unicode, # Types @@ -41,8 +42,8 @@ export CartesianIndex, CartesianRange, Channel, - CharCategory, - CharCategoryCode, + CharCode, + CharType, Cmd, Colon, Complex, diff --git a/base/unicode.jl b/base/unicode.jl index a0f6096e75289..fc94993410638 100644 --- a/base/unicode.jl +++ b/base/unicode.jl @@ -6,9 +6,7 @@ import Base: string, convert, write, length, endof, next, reverseind, lastidx, r lowercase, uppercase, eltype, isless, promote_rule, == export UnicodeError, UTF16String, UTF32String, unsafe_checkstring, checkstring, - utf8, utf16, utf32, containsnul, WString, wstring, - charprop, CharCategoryCode, UnicodeProperty, CharCategory, CatLetter, CatMark, CatNumber, - 
CatPunctuation, CatSymbol, CatSeparator, CatOther, CatUpper, + utf8, utf16, utf32, containsnul, WString, wstring, charprop, Cat, CharType, CharCode, is_assigned_char, islower, isupper, isdigit, isalpha, isnumber, isalnum, iscntrl, ispunct, isspace, isprint, isgraph, isgraphemebreak, GraphemeIterator, normalize_string, graphemes, charwidth diff --git a/base/unicode/properties.jl b/base/unicode/properties.jl index e2cf4e99b6a00..82db6cafc6861 100644 --- a/base/unicode/properties.jl +++ b/base/unicode/properties.jl @@ -12,71 +12,78 @@ isvalid(ch::Char) = isvalid(Char, ch) # Unicode General Category constants +module Cat +export Property, CharType, CharCode + """Unicode character properties""" -abstract UnicodeProperty -"""Unicode character categories""" -abstract CharCategory <: UnicodeProperty - -"""Unicode letter character category""" -abstract CatLetter <: CharCategory -"""Unicode Mark character category""" -abstract CatMark <: CharCategory -"""Unicode Numeric character category""" -abstract CatNumber <: CharCategory -"""Unicode Punctuation character category""" -abstract CatPunctuation <: CharCategory -"""Unicode Symbol character category""" -abstract CatSymbol <: CharCategory -"""Unicode Separator character category""" -abstract CatSeparator <: CharCategory -"""Unicode Other character category""" -abstract CatOther <: CharCategory +abstract Property + +"""Unicode character category type""" +abstract CharType <: Property + +"""Unicode 'Letter' character category""" +abstract Letter <: CharType +"""Unicode 'Mark' character category""" +abstract Mark <: CharType +"""Unicode 'Number' character category""" +abstract Number <: CharType +"""Unicode 'Punctuation' character category""" +abstract Punctuation <: CharType +"""Unicode 'Symbol' character category""" +abstract Symbol <: CharType +"""Unicode 'Separator' character category""" +abstract Separator <: CharType +"""Unicode 'Other' character category""" +abstract Other <: CharType """Unicode uppercase & titlecase 
letters""" -abstract CatUpper <: CatLetter +abstract Upper <: Letter + +"""Unicode character category code (0-29)""" +bitstype 8 CharCode -"""Unicode Character Category Code (0-29)""" -bitstype 8 CharCategoryCode +end # module Cat +import .Cat: Property, CharType, CharCode -convert(::Type{CharCategoryCode}, x::Integer) = reinterpret(CharCategoryCode, x%UInt8) -convert{T<:Integer}(::Type{T}, x::CharCategoryCode) = convert(T, reinterpret(UInt8, x)) -promote_rule{T<:Integer}(::Type{T}, ::Type{CharCategoryCode}) = T -isless(x::CharCategoryCode, y::CharCategoryCode) = isless(UInt32(x), UInt32(y)) -isless(x::CharCategoryCode, y::Integer) = isless(UInt32(x), y) -isless(x::Integer, y::CharCategoryCode) = isless(x, UInt32(y)) +convert(::Type{CharCode}, x::Integer) = reinterpret(CharCode, x%UInt8) +convert{T<:Integer}(::Type{T}, x::CharCode) = convert(T, reinterpret(UInt8, x)) +promote_rule{T<:Integer}(::Type{T}, ::Type{CharCode}) = T +isless(x::CharCode, y::CharCode) = isless(UInt8(x), UInt8(y)) +isless(x::CharCode, y::Integer) = isless(UInt8(x), y) +isless(x::Integer, y::CharCode) = isless(x, UInt8(y)) for (nam, val, cat, typ, des) in - ((:Cn, 0, :NotAssignedChar, CatOther, "Other, Not assigned"), - (:Lu, 1, :UpperCase, CatUpper, "Letter, uppercase"), - (:Ll, 2, :LowerCase, CatLetter, "Letter, lowercase"), - (:Lt, 3, :TitleCase, CatUpper, "Letter, titlecase"), - (:Lm, 4, :ModifierLetter, CatLetter, "Letter, modifier"), - (:Lo, 5, :OtherLetter, CatLetter, "Letter, other"), - (:Mn, 6, :NonSpacingMark, CatMark, "Mark, nonspacing"), - (:Mc, 7, :CombiningMark, CatMark, "Mark, spacing combining"), - (:Me, 8, :EnclosingMark, CatMark, "Mark, enclosing"), - (:Nd, 9, :DecimalDigit, CatNumber, "Number, decimal digit"), - (:Nl, 10, :NumericLetter, CatNumber, "Number, letter"), - (:No, 11, :OtherNumber, CatNumber, "Number, other"), - (:Pc, 12, :ConnectorPunct, CatPunctuation, "Punctuation, connector"), - (:Pd, 13, :DashPunct, CatPunctuation, "Punctuation, dash"), - (:Ps, 14, 
:OpenPunct, CatPunctuation, "Punctuation, open"), - (:Pe, 15, :ClosePunct, CatPunctuation, "Punctuation, close"), - (:Pi, 16, :BegQuotePunct, CatPunctuation, "Punctuation, initial quote"), - (:Pf, 17, :EndQuotePunct, CatPunctuation, "Punctuation, final quote"), - (:Po, 18, :OtherPunct, CatPunctuation, "Punctuation, other"), - (:Sm, 19, :MathSymbol, CatSymbol, "Symbol, math"), - (:Sc, 20, :CurrencySymbol, CatSymbol, "Symbol, currency"), - (:Sk, 21, :ModifierSymbol, CatSymbol, "Symbol, modifier"), - (:So, 22, :OtherSymbol, CatSymbol, "Symbol, other"), - (:Zs, 23, :SpaceSeparator, CatSeparator, "Separator, space"), - (:Zl, 24, :LineSeparator, CatSeparator, "Separator, line"), - (:Zp, 25, :ParagraphSeparator, CatSeparator, "Separator, paragraph"), - (:Cc, 26, :ControlChar, CatOther, "Other, control"), - (:Cf, 27, :FormatChar, CatOther, "Other, format"), - (:Cs, 28, :SurrogateChar, CatOther, "Other, surrogate"), - (:Co, 29, :PrivateUseChar, CatOther, "Other, private use")) - @eval const global $nam = CharCategoryCode($val) + ((:Cn, 0, :NotAssignedChar, Cat.Other, "Other, Not assigned"), + (:Lu, 1, :UpperCase, Cat.Upper, "Letter, uppercase"), + (:Ll, 2, :LowerCase, Cat.Letter, "Letter, lowercase"), + (:Lt, 3, :TitleCase, Cat.Upper, "Letter, titlecase"), + (:Lm, 4, :ModifierLetter, Cat.Letter, "Letter, modifier"), + (:Lo, 5, :OtherLetter, Cat.Letter, "Letter, other"), + (:Mn, 6, :NonSpacingMark, Cat.Mark, "Mark, nonspacing"), + (:Mc, 7, :CombiningMark, Cat.Mark, "Mark, spacing combining"), + (:Me, 8, :EnclosingMark, Cat.Mark, "Mark, enclosing"), + (:Nd, 9, :DecimalDigit, Cat.Number, "Number, decimal digit"), + (:Nl, 10, :NumericLetter, Cat.Number, "Number, letter"), + (:No, 11, :OtherNumber, Cat.Number, "Number, other"), + (:Pc, 12, :ConnectorPunct, Cat.Punctuation, "Punctuation, connector"), + (:Pd, 13, :DashPunct, Cat.Punctuation, "Punctuation, dash"), + (:Ps, 14, :OpenPunct, Cat.Punctuation, "Punctuation, open"), + (:Pe, 15, :ClosePunct, Cat.Punctuation, "Punctuation, 
close"), + (:Pi, 16, :BegQuotePunct, Cat.Punctuation, "Punctuation, initial quote"), + (:Pf, 17, :EndQuotePunct, Cat.Punctuation, "Punctuation, final quote"), + (:Po, 18, :OtherPunct, Cat.Punctuation, "Punctuation, other"), + (:Sm, 19, :MathSymbol, Cat.Symbol, "Symbol, math"), + (:Sc, 20, :CurrencySymbol, Cat.Symbol, "Symbol, currency"), + (:Sk, 21, :ModifierSymbol, Cat.Symbol, "Symbol, modifier"), + (:So, 22, :OtherSymbol, Cat.Symbol, "Symbol, other"), + (:Zs, 23, :SpaceSeparator, Cat.Separator, "Separator, space"), + (:Zl, 24, :LineSeparator, Cat.Separator, "Separator, line"), + (:Zp, 25, :ParagraphSeparator, Cat.Separator, "Separator, paragraph"), + (:Cc, 26, :ControlChar, Cat.Other, "Other, control"), + (:Cf, 27, :FormatChar, Cat.Other, "Other, format"), + (:Cs, 28, :SurrogateChar, Cat.Other, "Other, surrogate"), + (:Co, 29, :PrivateUseChar, Cat.Other, "Other, private use")) + @eval const global $nam = CharCode($val) @eval export $cat @eval abstract $cat <: $typ @eval @doc $(string("Unicode Category Code: ",des)) $nam @@ -94,54 +101,53 @@ const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, O ############################################################################ - """ Return various Unicode properties for character """ function charprop end -charprop(::Type{CharCategory}, c) = c2t[Int(charprop(CharCategoryCode, c))+1] +charprop(::Type{CharType}, c) = c2t[Int(charprop(CharCode, c))+1] -is_assigned_char(c) = charprop(CharCategoryCode, c) != Cn +is_assigned_char(c) = charprop(CharCode, c) != Cn ## libc character class predicates ## -islower(c::Char) = charprop(CharCategoryCode, c) == Ll +islower(c::Char) = charprop(CharCode, c) == Ll # true for Unicode upper and mixed case -isupper(c::Char) = (ccode = charprop(CharCategoryCode, c)) == Lu || ccode == Lt +isupper(c::Char) = (ccode = charprop(CharCode, c)) == Lu || ccode == Lt isdigit(c::Char) = ('0' <= c <= '9') -isalpha(c::Char) = (Lu <= charprop(CharCategoryCode, c) <= Lo) 
-isnumber(c::Char) = (Nd <= charprop(CharCategoryCode, c) <= No) -isalnum(c::Char) = (Lu <= (ccode = charprop(CharCategoryCode, c)) <= Lo) || (Nd <= ccode <= No) +isalpha(c::Char) = (Lu <= charprop(CharCode, c) <= Lo) +isnumber(c::Char) = (Nd <= charprop(CharCode, c) <= No) +isalnum(c::Char) = (Lu <= (ccode = charprop(CharCode, c)) <= Lo) || (Nd <= ccode <= No) # These are about 3 times slower, because the isa method # is much slower than checking if an integer is within range (or two ranges) # If that is sped up, then these, which are more readable, could replace the other forms. #= -isalpha(c::Char) = charprop(CharCategory, c) <: CatLetter -isnumber(c::Char) = charprop(CharCategory, c) <: CatNumber -isupper(c::Char) = charprop(CharCategory, c) <: CatUpper -isalnum(c::Char) = charprop(CharCategory, c) <: Union{CatLetter, CatNumber} -ispunct(c::Char) = charprop(CharCategory, c) <: CatPunctuation +isalpha(c::Char) = charprop(CharType, c) <: CatLetter +isnumber(c::Char) = charprop(CharType, c) <: CatNumber +isupper(c::Char) = charprop(CharType, c) <: CatUpper +isalnum(c::Char) = charprop(CharType, c) <: Union{CatLetter, CatNumber} +ispunct(c::Char) = charprop(CharType, c) <: CatPunctuation =# # following C++ only control characters from the Latin-1 subset return true iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f)) -ispunct(c::Char) = (Pc <= charprop(CharCategoryCode, c) <= Po) +ispunct(c::Char) = (Pc <= charprop(CharCode, c) <= Po) # \u85 is the Unicode Next Line (NEL) character # the check for \ufffd allows for branch removal on ASCIIStrings @inline isspace(c::Char) = (c == ' ' || '\t' <= c <='\r' || c == '\u85' || - ('\ua0' <= c && c != '\ufffd' && charprop(CharCategoryCode, c) == Zs)) + ('\ua0' <= c && c != '\ufffd' && charprop(CharCode, c) == Zs)) -isprint(c::Char) = (Lu <= charprop(CharCategoryCode, c) <= Zs) +isprint(c::Char) = (Lu <= charprop(CharCode, c) <= Zs) # true in principle if a printer would use ink -isgraph(c::Char) = (Lu <= 
charprop(CharCategoryCode, c) <= So) +isgraph(c::Char) = (Lu <= charprop(CharCode, c) <= So) for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph", "lower", "print", "punct", "space", "upper") diff --git a/base/unicode/utf8proc.jl b/base/unicode/utf8proc.jl index 7968d27a8c276..b8dc1c1a888ed 100644 --- a/base/unicode/utf8proc.jl +++ b/base/unicode/utf8proc.jl @@ -81,9 +81,8 @@ uppercase(c::Char) = (isascii(c) ############################################################################ -# returns CharCategoryCode (enum values 0:29) giving Unicode category -charprop(::Type{CharCategoryCode}, c) = - CharCategoryCode(ccall(:utf8proc_category, Cint, (UInt32,), c)) +# returns CharCode (values 0:29) giving Unicode category +charprop(::Type{CharCode}, c) = CharCode(ccall(:utf8proc_category, Cint, (UInt32,), c)) ############################################################################ diff --git a/test/unicode/properties.jl b/test/unicode/properties.jl index 735eed3d04fbe..79a4ad3c5c809 100644 --- a/test/unicode/properties.jl +++ b/test/unicode/properties.jl @@ -143,9 +143,9 @@ end # check handling of CN category constants let c_ll = 'β', c_cn = '\u038B' - @test charprop(CharCategoryCode, c_ll) == Unicode.Ll + @test charprop(CharCode, c_ll) == Unicode.Ll # check codepoint with category code CN - @test charprop(CharCategoryCode, c_cn) == Unicode.Cn + @test charprop(CharCode, c_cn) == Unicode.Cn end # Make sure fastplus is called for coverage From 8b23e470bcaa02b71f6bb1f7f104bdef407bf079 Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Mon, 14 Dec 2015 07:56:55 -0500 Subject: [PATCH 3/7] Clean up naming, don't want any Cat fights! 
--- base/exports.jl | 6 +- base/unicode.jl | 3 +- base/unicode/properties.jl | 161 +++++++++++++++++++------------------ base/unicode/utf8proc.jl | 4 +- test/unicode/properties.jl | 4 +- 5 files changed, 91 insertions(+), 87 deletions(-) diff --git a/base/exports.jl b/base/exports.jl index 803a41125ff33..db08c05f81d54 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -22,7 +22,7 @@ export Serializer, Docs, Markdown, - Cat, + Category, Unicode, # Types @@ -42,8 +42,8 @@ export CartesianIndex, CartesianRange, Channel, - CharCode, - CharType, + CategoryCode, + CategoryType, Cmd, Colon, Complex, diff --git a/base/unicode.jl b/base/unicode.jl index fc94993410638..2b119b8565e13 100644 --- a/base/unicode.jl +++ b/base/unicode.jl @@ -6,7 +6,8 @@ import Base: string, convert, write, length, endof, next, reverseind, lastidx, r lowercase, uppercase, eltype, isless, promote_rule, == export UnicodeError, UTF16String, UTF32String, unsafe_checkstring, checkstring, - utf8, utf16, utf32, containsnul, WString, wstring, charprop, Cat, CharType, CharCode, + utf8, utf16, utf32, containsnul, WString, wstring, charprop, Category, + CategoryType, CategoryCode, is_assigned_char, islower, isupper, isdigit, isalpha, isnumber, isalnum, iscntrl, ispunct, isspace, isprint, isgraph, isgraphemebreak, GraphemeIterator, normalize_string, graphemes, charwidth diff --git a/base/unicode/properties.jl b/base/unicode/properties.jl index 82db6cafc6861..883055f192012 100644 --- a/base/unicode/properties.jl +++ b/base/unicode/properties.jl @@ -10,81 +10,85 @@ isvalid(::Type{Char}, ch::Char) = isvalid(Char, UInt32(ch)) isvalid(ch::Char) = isvalid(Char, ch) -# Unicode General Category constants - -module Cat -export Property, CharType, CharCode - """Unicode character properties""" abstract Property +""" +Return various Unicode properties for character +""" +function charprop end + +# Unicode General Category constants + +module Category +export CategoryType, CategoryCode + """Unicode character category 
type""" -abstract CharType <: Property +abstract CategoryType <: Unicode.Property """Unicode 'Letter' character category""" -abstract Letter <: CharType +abstract Letter <: CategoryType """Unicode 'Mark' character category""" -abstract Mark <: CharType +abstract Mark <: CategoryType """Unicode 'Number' character category""" -abstract Number <: CharType +abstract Number <: CategoryType """Unicode 'Punctuation' character category""" -abstract Punctuation <: CharType +abstract Punctuation <: CategoryType """Unicode 'Symbol' character category""" -abstract Symbol <: CharType +abstract Symbol <: CategoryType """Unicode 'Separator' character category""" -abstract Separator <: CharType +abstract Separator <: CategoryType """Unicode 'Other' character category""" -abstract Other <: CharType +abstract Other <: CategoryType """Unicode uppercase & titlecase letters""" abstract Upper <: Letter -"""Unicode character category code (0-29)""" -bitstype 8 CharCode +"""Unicode alphabetic and numeric""" +typealias AlphaNumeric Union{Letter, Number} -end # module Cat -import .Cat: Property, CharType, CharCode +"""Unicode character category code (0-29)""" +bitstype 8 CategoryCode -convert(::Type{CharCode}, x::Integer) = reinterpret(CharCode, x%UInt8) -convert{T<:Integer}(::Type{T}, x::CharCode) = convert(T, reinterpret(UInt8, x)) -promote_rule{T<:Integer}(::Type{T}, ::Type{CharCode}) = T -isless(x::CharCode, y::CharCode) = isless(UInt8(x), UInt8(y)) -isless(x::CharCode, y::Integer) = isless(UInt8(x), y) -isless(x::Integer, y::CharCode) = isless(x, UInt8(y)) +Base.convert(::Type{CategoryCode}, x::Integer) = reinterpret(CategoryCode, x%UInt8) +Base.convert{T<:Integer}(::Type{T}, x::CategoryCode) = convert(T, reinterpret(UInt8, x)) +Base.promote_rule{T<:Integer}(::Type{T}, ::Type{CategoryCode}) = T +Base.isless(x::CategoryCode, y::CategoryCode) = isless(UInt8(x), UInt8(y)) +Base.isless(x::CategoryCode, y::Integer) = isless(UInt8(x), y) +Base.isless(x::Integer, y::CategoryCode) = isless(x, 
UInt8(y)) for (nam, val, cat, typ, des) in - ((:Cn, 0, :NotAssignedChar, Cat.Other, "Other, Not assigned"), - (:Lu, 1, :UpperCase, Cat.Upper, "Letter, uppercase"), - (:Ll, 2, :LowerCase, Cat.Letter, "Letter, lowercase"), - (:Lt, 3, :TitleCase, Cat.Upper, "Letter, titlecase"), - (:Lm, 4, :ModifierLetter, Cat.Letter, "Letter, modifier"), - (:Lo, 5, :OtherLetter, Cat.Letter, "Letter, other"), - (:Mn, 6, :NonSpacingMark, Cat.Mark, "Mark, nonspacing"), - (:Mc, 7, :CombiningMark, Cat.Mark, "Mark, spacing combining"), - (:Me, 8, :EnclosingMark, Cat.Mark, "Mark, enclosing"), - (:Nd, 9, :DecimalDigit, Cat.Number, "Number, decimal digit"), - (:Nl, 10, :NumericLetter, Cat.Number, "Number, letter"), - (:No, 11, :OtherNumber, Cat.Number, "Number, other"), - (:Pc, 12, :ConnectorPunct, Cat.Punctuation, "Punctuation, connector"), - (:Pd, 13, :DashPunct, Cat.Punctuation, "Punctuation, dash"), - (:Ps, 14, :OpenPunct, Cat.Punctuation, "Punctuation, open"), - (:Pe, 15, :ClosePunct, Cat.Punctuation, "Punctuation, close"), - (:Pi, 16, :BegQuotePunct, Cat.Punctuation, "Punctuation, initial quote"), - (:Pf, 17, :EndQuotePunct, Cat.Punctuation, "Punctuation, final quote"), - (:Po, 18, :OtherPunct, Cat.Punctuation, "Punctuation, other"), - (:Sm, 19, :MathSymbol, Cat.Symbol, "Symbol, math"), - (:Sc, 20, :CurrencySymbol, Cat.Symbol, "Symbol, currency"), - (:Sk, 21, :ModifierSymbol, Cat.Symbol, "Symbol, modifier"), - (:So, 22, :OtherSymbol, Cat.Symbol, "Symbol, other"), - (:Zs, 23, :SpaceSeparator, Cat.Separator, "Separator, space"), - (:Zl, 24, :LineSeparator, Cat.Separator, "Separator, line"), - (:Zp, 25, :ParagraphSeparator, Cat.Separator, "Separator, paragraph"), - (:Cc, 26, :ControlChar, Cat.Other, "Other, control"), - (:Cf, 27, :FormatChar, Cat.Other, "Other, format"), - (:Cs, 28, :SurrogateChar, Cat.Other, "Other, surrogate"), - (:Co, 29, :PrivateUseChar, Cat.Other, "Other, private use")) - @eval const global $nam = CharCode($val) - @eval export $cat + ((:Cn, 0, :NotAssignedChar, 
:Other, "Other, Not assigned"), + (:Lu, 1, :UpperCase, :Upper, "Letter, uppercase"), + (:Ll, 2, :LowerCase, :Letter, "Letter, lowercase"), + (:Lt, 3, :TitleCase, :Upper, "Letter, titlecase"), + (:Lm, 4, :ModifierLetter, :Letter, "Letter, modifier"), + (:Lo, 5, :OtherLetter, :Letter, "Letter, other"), + (:Mn, 6, :NonSpacingMark, :Mark, "Mark, nonspacing"), + (:Mc, 7, :CombiningMark, :Mark, "Mark, spacing combining"), + (:Me, 8, :EnclosingMark, :Mark, "Mark, enclosing"), + (:Nd, 9, :DecimalDigit, :Number, "Number, decimal digit"), + (:Nl, 10, :NumericLetter, :Number, "Number, letter"), + (:No, 11, :OtherNumber, :Number, "Number, other"), + (:Pc, 12, :ConnectorPunctuation, :Punctuation, "Punctuation, connector"), + (:Pd, 13, :DashPunctuation, :Punctuation, "Punctuation, dash"), + (:Ps, 14, :OpenPunctuation, :Punctuation, "Punctuation, open"), + (:Pe, 15, :ClosePunctuation, :Punctuation, "Punctuation, close"), + (:Pi, 16, :InitialQuotePunctuation, :Punctuation, "Punctuation, initial quote"), + (:Pf, 17, :FinalQuotePunctuation, :Punctuation, "Punctuation, final quote"), + (:Po, 18, :OtherPunctuation, :Punctuation, "Punctuation, other"), + (:Sm, 19, :MathSymbol, :Symbol, "Symbol, math"), + (:Sc, 20, :CurrencySymbol, :Symbol, "Symbol, currency"), + (:Sk, 21, :ModifierSymbol, :Symbol, "Symbol, modifier"), + (:So, 22, :OtherSymbol, :Symbol, "Symbol, other"), + (:Zs, 23, :SpaceSeparator, :Separator, "Separator, space"), + (:Zl, 24, :LineSeparator, :Separator, "Separator, line"), + (:Zp, 25, :ParagraphSeparator, :Separator, "Separator, paragraph"), + (:Cc, 26, :ControlChar, :Other, "Other, control"), + (:Cf, 27, :FormatChar, :Other, "Other, format"), + (:Cs, 28, :SurrogateChar, :Other, "Other, surrogate"), + (:Co, 29, :PrivateUseChar, :Other, "Other, private use")) + @eval const global $nam = CategoryCode($val) @eval abstract $cat <: $typ @eval @doc $(string("Unicode Category Code: ",des)) $nam @eval @doc $(string("Unicode Category Type: ",des)) $cat @@ -93,61 +97,60 @@ end 
const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, OtherLetter, NonSpacingMark, CombiningMark, EnclosingMark, DecimalDigit, NumericLetter, OtherNumber, - ConnectorPunct, DashPunct, OpenPunct, ClosePunct, - BegQuotePunct, EndQuotePunct, OtherPunct, + ConnectorPunctuation, DashPunctuation, OpenPunctuation, ClosePunctuation, + InitialQuotePunctuation, FinalQuotePunctuation, OtherPunctuation, MathSymbol, CurrencySymbol, ModifierSymbol, OtherSymbol, SpaceSeparator, LineSeparator, ParagraphSeparator, - ControlChar, FormatChar, SurrogateChar, PrivateUseChar] + ControlChar, FormatChar, SurrogateChar, PrivateUseChar] -############################################################################ +charprop(::Type{CategoryType}, c) = c2t[Int(charprop(CategoryCode, c))+1] -""" -Return various Unicode properties for character -""" -function charprop end +end # module Cat +importall .Category -charprop(::Type{CharType}, c) = c2t[Int(charprop(CharCode, c))+1] +############################################################################ -is_assigned_char(c) = charprop(CharCode, c) != Cn +is_assigned_char(c) = charprop(CategoryCode, c) != Category.Cn ## libc character class predicates ## -islower(c::Char) = charprop(CharCode, c) == Ll +islower(c::Char) = charprop(CategoryCode, c) == Category.Ll # true for Unicode upper and mixed case -isupper(c::Char) = (ccode = charprop(CharCode, c)) == Lu || ccode == Lt +isupper(c::Char) = (ccode = charprop(CategoryCode, c)) == Category.Lu || ccode == Category.Lt isdigit(c::Char) = ('0' <= c <= '9') -isalpha(c::Char) = (Lu <= charprop(CharCode, c) <= Lo) -isnumber(c::Char) = (Nd <= charprop(CharCode, c) <= No) -isalnum(c::Char) = (Lu <= (ccode = charprop(CharCode, c)) <= Lo) || (Nd <= ccode <= No) +isalpha(c::Char) = (Category.Lu <= charprop(CategoryCode, c) <= Category.Lo) +isnumber(c::Char) = (Category.Nd <= charprop(CategoryCode, c) <= Category.No) +isalnum(c::Char) = ((Category.Lu <= (ccode = charprop(CategoryCode, 
c)) <= Category.Lo) || + (Category.Nd <= ccode <= Category.No)) # These are about 3 times slower, because the isa method # is much slower than checking if an integer is within range (or two ranges) # If that is sped up, then these, which are more readable, could replace the other forms. #= -isalpha(c::Char) = charprop(CharType, c) <: CatLetter -isnumber(c::Char) = charprop(CharType, c) <: CatNumber -isupper(c::Char) = charprop(CharType, c) <: CatUpper -isalnum(c::Char) = charprop(CharType, c) <: Union{CatLetter, CatNumber} -ispunct(c::Char) = charprop(CharType, c) <: CatPunctuation +isalpha(c::Char) = charprop(CategoryType, c) <: Category.Letter +isnumber(c::Char) = charprop(CategoryType, c) <: Category.Number +isupper(c::Char) = charprop(CategoryType, c) <: Category.Upper +isalnum(c::Char) = charprop(CategoryType, c) <: Category.AlphaNumeric +ispunct(c::Char) = charprop(CategoryType, c) <: Category.Punctuation =# # following C++ only control characters from the Latin-1 subset return true iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f)) -ispunct(c::Char) = (Pc <= charprop(CharCode, c) <= Po) +ispunct(c::Char) = (Category.Pc <= charprop(CategoryCode, c) <= Category.Po) # \u85 is the Unicode Next Line (NEL) character # the check for \ufffd allows for branch removal on ASCIIStrings @inline isspace(c::Char) = (c == ' ' || '\t' <= c <='\r' || c == '\u85' || - ('\ua0' <= c && c != '\ufffd' && charprop(CharCode, c) == Zs)) + ('\ua0' <= c && c != '\ufffd' && charprop(CategoryCode, c) == Category.Zs)) -isprint(c::Char) = (Lu <= charprop(CharCode, c) <= Zs) +isprint(c::Char) = (Category.Lu <= charprop(CategoryCode, c) <= Category.Zs) # true in principle if a printer would use ink -isgraph(c::Char) = (Lu <= charprop(CharCode, c) <= So) +isgraph(c::Char) = (Category.Lu <= charprop(CategoryCode, c) <= Category.So) for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph", "lower", "print", "punct", "space", "upper") diff --git 
a/base/unicode/utf8proc.jl b/base/unicode/utf8proc.jl index b8dc1c1a888ed..107e61135ccfc 100644 --- a/base/unicode/utf8proc.jl +++ b/base/unicode/utf8proc.jl @@ -81,8 +81,8 @@ uppercase(c::Char) = (isascii(c) ############################################################################ -# returns CharCode (values 0:29) giving Unicode category -charprop(::Type{CharCode}, c) = CharCode(ccall(:utf8proc_category, Cint, (UInt32,), c)) +# returns CategoryCode (values 0:29) giving Unicode category +charprop(::Type{CategoryCode}, c) = CategoryCode(ccall(:utf8proc_category, Cint, (UInt32,), c)) ############################################################################ diff --git a/test/unicode/properties.jl b/test/unicode/properties.jl index 79a4ad3c5c809..3929cceb030e5 100644 --- a/test/unicode/properties.jl +++ b/test/unicode/properties.jl @@ -143,9 +143,9 @@ end # check handling of CN category constants let c_ll = 'β', c_cn = '\u038B' - @test charprop(CharCode, c_ll) == Unicode.Ll + @test charprop(CategoryCode, c_ll) == Category.Ll # check codepoint with category code CN - @test charprop(CharCode, c_cn) == Unicode.Cn + @test charprop(CategoryCode, c_cn) == Category.Cn end # Make sure fastplus is called for coverage From c05d46389d9290294f7ea8d05402fdc39a1f527d Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Mon, 14 Dec 2015 11:12:17 -0500 Subject: [PATCH 4/7] Add category masks --- base/exports.jl | 2 - base/unicode.jl | 1 - base/unicode/properties.jl | 92 +++++++++++++++++++------------------- base/unicode/utf8proc.jl | 4 +- test/unicode/properties.jl | 4 +- 5 files changed, 51 insertions(+), 52 deletions(-) diff --git a/base/exports.jl b/base/exports.jl index db08c05f81d54..fb0d3b39beeca 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -42,8 +42,6 @@ export CartesianIndex, CartesianRange, Channel, - CategoryCode, - CategoryType, Cmd, Colon, Complex, diff --git a/base/unicode.jl b/base/unicode.jl index 2b119b8565e13..770219f0c8beb 100644 --- a/base/unicode.jl +++ 
b/base/unicode.jl @@ -7,7 +7,6 @@ import Base: string, convert, write, length, endof, next, reverseind, lastidx, r export UnicodeError, UTF16String, UTF32String, unsafe_checkstring, checkstring, utf8, utf16, utf32, containsnul, WString, wstring, charprop, Category, - CategoryType, CategoryCode, is_assigned_char, islower, isupper, isdigit, isalpha, isnumber, isalnum, iscntrl, ispunct, isspace, isprint, isgraph, isgraphemebreak, GraphemeIterator, normalize_string, graphemes, charwidth diff --git a/base/unicode/properties.jl b/base/unicode/properties.jl index 883055f192012..2b5da80452c54 100644 --- a/base/unicode/properties.jl +++ b/base/unicode/properties.jl @@ -21,25 +21,24 @@ function charprop end # Unicode General Category constants module Category -export CategoryType, CategoryCode """Unicode character category type""" -abstract CategoryType <: Unicode.Property +abstract General <: Unicode.Property """Unicode 'Letter' character category""" -abstract Letter <: CategoryType +abstract Letter <: General """Unicode 'Mark' character category""" -abstract Mark <: CategoryType +abstract Mark <: General """Unicode 'Number' character category""" -abstract Number <: CategoryType +abstract Number <: General """Unicode 'Punctuation' character category""" -abstract Punctuation <: CategoryType +abstract Punctuation <: General """Unicode 'Symbol' character category""" -abstract Symbol <: CategoryType +abstract Symbol <: General """Unicode 'Separator' character category""" -abstract Separator <: CategoryType +abstract Separator <: General """Unicode 'Other' character category""" -abstract Other <: CategoryType +abstract Other <: General """Unicode uppercase & titlecase letters""" abstract Upper <: Letter @@ -48,14 +47,17 @@ abstract Upper <: Letter typealias AlphaNumeric Union{Letter, Number} """Unicode character category code (0-29)""" -bitstype 8 CategoryCode +bitstype 8 Code -Base.convert(::Type{CategoryCode}, x::Integer) = reinterpret(CategoryCode, x%UInt8) 
-Base.convert{T<:Integer}(::Type{T}, x::CategoryCode) = convert(T, reinterpret(UInt8, x)) -Base.promote_rule{T<:Integer}(::Type{T}, ::Type{CategoryCode}) = T -Base.isless(x::CategoryCode, y::CategoryCode) = isless(UInt8(x), UInt8(y)) -Base.isless(x::CategoryCode, y::Integer) = isless(UInt8(x), y) -Base.isless(x::Integer, y::CategoryCode) = isless(x, UInt8(y)) +"""Unicode character category mask""" +typealias Mask UInt32 + +Base.convert(::Type{Code}, x::Integer) = reinterpret(Code, x%UInt8) +Base.convert{T<:Integer}(::Type{T}, x::Code) = convert(T, reinterpret(UInt8, x)) +Base.promote_rule{T<:Integer}(::Type{T}, ::Type{Code}) = T +Base.isless(x::Code, y::Code) = isless(UInt8(x), UInt8(y)) +Base.isless(x::Code, y::Integer) = isless(UInt8(x), y) +Base.isless(x::Integer, y::Code) = isless(x, UInt8(y)) for (nam, val, cat, typ, des) in ((:Cn, 0, :NotAssignedChar, :Other, "Other, Not assigned"), @@ -88,8 +90,9 @@ for (nam, val, cat, typ, des) in (:Cf, 27, :FormatChar, :Other, "Other, format"), (:Cs, 28, :SurrogateChar, :Other, "Other, surrogate"), (:Co, 29, :PrivateUseChar, :Other, "Other, private use")) - @eval const global $nam = CategoryCode($val) + @eval const global $nam = $(Code(val)) @eval abstract $cat <: $typ + @eval Base.convert(::Type{Code}, ct::$cat) = $(Code(val)) @eval @doc $(string("Unicode Category Code: ",des)) $nam @eval @doc $(string("Unicode Category Type: ",des)) $cat end @@ -101,56 +104,55 @@ const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, O InitialQuotePunctuation, FinalQuotePunctuation, OtherPunctuation, MathSymbol, CurrencySymbol, ModifierSymbol, OtherSymbol, SpaceSeparator, LineSeparator, ParagraphSeparator, - ControlChar, FormatChar, SurrogateChar, PrivateUseChar] + ControlChar, FormatChar, SurrogateChar, PrivateUseChar] + +Base.convert(::Type{General}, cat::Code) = c2t[Int(cat)+1] + +Unicode.charprop(Mask, c) = Mask(1< Date: Mon, 14 Dec 2015 11:39:41 -0500 Subject: [PATCH 5/7] Update per comments on use of ?: Add 
newline Fix indentation (Emacs and tabs) --- base/unicode/properties.jl | 114 ++++++++++++++++++++----------------- base/unicode/utf8proc.jl | 21 +++++-- 2 files changed, 78 insertions(+), 57 deletions(-) diff --git a/base/unicode/properties.jl b/base/unicode/properties.jl index 2b5da80452c54..da80417bdc3bc 100644 --- a/base/unicode/properties.jl +++ b/base/unicode/properties.jl @@ -59,44 +59,49 @@ Base.isless(x::Code, y::Code) = isless(UInt8(x), UInt8(y)) Base.isless(x::Code, y::Integer) = isless(UInt8(x), y) Base.isless(x::Integer, y::Code) = isless(x, UInt8(y)) -for (nam, val, cat, typ, des) in - ((:Cn, 0, :NotAssignedChar, :Other, "Other, Not assigned"), - (:Lu, 1, :UpperCase, :Upper, "Letter, uppercase"), - (:Ll, 2, :LowerCase, :Letter, "Letter, lowercase"), - (:Lt, 3, :TitleCase, :Upper, "Letter, titlecase"), - (:Lm, 4, :ModifierLetter, :Letter, "Letter, modifier"), - (:Lo, 5, :OtherLetter, :Letter, "Letter, other"), - (:Mn, 6, :NonSpacingMark, :Mark, "Mark, nonspacing"), - (:Mc, 7, :CombiningMark, :Mark, "Mark, spacing combining"), - (:Me, 8, :EnclosingMark, :Mark, "Mark, enclosing"), - (:Nd, 9, :DecimalDigit, :Number, "Number, decimal digit"), - (:Nl, 10, :NumericLetter, :Number, "Number, letter"), - (:No, 11, :OtherNumber, :Number, "Number, other"), - (:Pc, 12, :ConnectorPunctuation, :Punctuation, "Punctuation, connector"), - (:Pd, 13, :DashPunctuation, :Punctuation, "Punctuation, dash"), - (:Ps, 14, :OpenPunctuation, :Punctuation, "Punctuation, open"), - (:Pe, 15, :ClosePunctuation, :Punctuation, "Punctuation, close"), - (:Pi, 16, :InitialQuotePunctuation, :Punctuation, "Punctuation, initial quote"), - (:Pf, 17, :FinalQuotePunctuation, :Punctuation, "Punctuation, final quote"), - (:Po, 18, :OtherPunctuation, :Punctuation, "Punctuation, other"), - (:Sm, 19, :MathSymbol, :Symbol, "Symbol, math"), - (:Sc, 20, :CurrencySymbol, :Symbol, "Symbol, currency"), - (:Sk, 21, :ModifierSymbol, :Symbol, "Symbol, modifier"), - (:So, 22, :OtherSymbol, :Symbol, "Symbol, 
other"), - (:Zs, 23, :SpaceSeparator, :Separator, "Separator, space"), - (:Zl, 24, :LineSeparator, :Separator, "Separator, line"), - (:Zp, 25, :ParagraphSeparator, :Separator, "Separator, paragraph"), - (:Cc, 26, :ControlChar, :Other, "Other, control"), - (:Cf, 27, :FormatChar, :Other, "Other, format"), - (:Cs, 28, :SurrogateChar, :Other, "Other, surrogate"), - (:Co, 29, :PrivateUseChar, :Other, "Other, private use")) - @eval const global $nam = $(Code(val)) - @eval abstract $cat <: $typ - @eval Base.convert(::Type{Code}, ct::$cat) = $(Code(val)) - @eval @doc $(string("Unicode Category Code: ",des)) $nam - @eval @doc $(string("Unicode Category Type: ",des)) $cat +let c2t = DataType[] + for (nam, val, cat, typ, des) in + ((:Cn, 0, :NotAssignedChar, :Other, "Other, Not assigned"), + (:Lu, 1, :UpperCase, :Upper, "Letter, uppercase"), + (:Ll, 2, :LowerCase, :Letter, "Letter, lowercase"), + (:Lt, 3, :TitleCase, :Upper, "Letter, titlecase"), + (:Lm, 4, :ModifierLetter, :Letter, "Letter, modifier"), + (:Lo, 5, :OtherLetter, :Letter, "Letter, other"), + (:Mn, 6, :NonSpacingMark, :Mark, "Mark, nonspacing"), + (:Mc, 7, :CombiningMark, :Mark, "Mark, spacing combining"), + (:Me, 8, :EnclosingMark, :Mark, "Mark, enclosing"), + (:Nd, 9, :DecimalDigit, :Number, "Number, decimal digit"), + (:Nl, 10, :NumericLetter, :Number, "Number, letter"), + (:No, 11, :OtherNumber, :Number, "Number, other"), + (:Pc, 12, :ConnectorPunctuation, :Punctuation, "Punctuation, connector"), + (:Pd, 13, :DashPunctuation, :Punctuation, "Punctuation, dash"), + (:Ps, 14, :OpenPunctuation, :Punctuation, "Punctuation, open"), + (:Pe, 15, :ClosePunctuation, :Punctuation, "Punctuation, close"), + (:Pi, 16, :InitialQuotePunctuation, :Punctuation, "Punctuation, initial quote"), + (:Pf, 17, :FinalQuotePunctuation, :Punctuation, "Punctuation, final quote"), + (:Po, 18, :OtherPunctuation, :Punctuation, "Punctuation, other"), + (:Sm, 19, :MathSymbol, :Symbol, "Symbol, math"), + (:Sc, 20, :CurrencySymbol, :Symbol, 
"Symbol, currency"), + (:Sk, 21, :ModifierSymbol, :Symbol, "Symbol, modifier"), + (:So, 22, :OtherSymbol, :Symbol, "Symbol, other"), + (:Zs, 23, :SpaceSeparator, :Separator, "Separator, space"), + (:Zl, 24, :LineSeparator, :Separator, "Separator, line"), + (:Zp, 25, :ParagraphSeparator, :Separator, "Separator, paragraph"), + (:Cc, 26, :ControlChar, :Other, "Other, control"), + (:Cf, 27, :FormatChar, :Other, "Other, format"), + (:Cs, 28, :SurrogateChar, :Other, "Other, surrogate"), + (:Co, 29, :PrivateUseChar, :Other, "Other, private use")) + @eval const global $nam = $(Code(val)) + @eval abstract $cat <: $typ + @eval push!($c2t, $cat) + @eval Base.convert(::Type{Code}, ct::$cat) = $(Code(val)) + @eval @doc $(string("Unicode Category Code: ",des)) $nam + @eval @doc $(string("Unicode Category Type: ",des)) $cat + end + @eval const global code2general = $c2t end +#= const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, OtherLetter, NonSpacingMark, CombiningMark, EnclosingMark, DecimalDigit, NumericLetter, OtherNumber, @@ -104,22 +109,29 @@ const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, O InitialQuotePunctuation, FinalQuotePunctuation, OtherPunctuation, MathSymbol, CurrencySymbol, ModifierSymbol, OtherSymbol, SpaceSeparator, LineSeparator, ParagraphSeparator, - ControlChar, FormatChar, SurrogateChar, PrivateUseChar] + ControlChar, FormatChar, SurrogateChar, PrivateUseChar] +=# -Base.convert(::Type{General}, cat::Code) = c2t[Int(cat)+1] +Base.convert(::Type{General}, cat::Code) = code2general[Int(cat)+1] Unicode.charprop(Mask, c) = Mask(1< Date: Tue, 15 Dec 2015 14:01:08 -0500 Subject: [PATCH 6/7] Remove types for general categories --- base/unicode/properties.jl | 192 ++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 99 deletions(-) diff --git a/base/unicode/properties.jl b/base/unicode/properties.jl index da80417bdc3bc..4297a40d86386 100644 --- a/base/unicode/properties.jl +++ 
b/base/unicode/properties.jl @@ -22,36 +22,9 @@ function charprop end module Category -"""Unicode character category type""" -abstract General <: Unicode.Property - -"""Unicode 'Letter' character category""" -abstract Letter <: General -"""Unicode 'Mark' character category""" -abstract Mark <: General -"""Unicode 'Number' character category""" -abstract Number <: General -"""Unicode 'Punctuation' character category""" -abstract Punctuation <: General -"""Unicode 'Symbol' character category""" -abstract Symbol <: General -"""Unicode 'Separator' character category""" -abstract Separator <: General -"""Unicode 'Other' character category""" -abstract Other <: General - -"""Unicode uppercase & titlecase letters""" -abstract Upper <: Letter - -"""Unicode alphabetic and numeric""" -typealias AlphaNumeric Union{Letter, Number} - """Unicode character category code (0-29)""" bitstype 8 Code -"""Unicode character category mask""" -typealias Mask UInt32 - Base.convert(::Type{Code}, x::Integer) = reinterpret(Code, x%UInt8) Base.convert{T<:Integer}(::Type{T}, x::Code) = convert(T, reinterpret(UInt8, x)) Base.promote_rule{T<:Integer}(::Type{T}, ::Type{Code}) = T @@ -59,79 +32,100 @@ Base.isless(x::Code, y::Code) = isless(UInt8(x), UInt8(y)) Base.isless(x::Code, y::Integer) = isless(UInt8(x), y) Base.isless(x::Integer, y::Code) = isless(x, UInt8(y)) -let c2t = DataType[] - for (nam, val, cat, typ, des) in - ((:Cn, 0, :NotAssignedChar, :Other, "Other, Not assigned"), - (:Lu, 1, :UpperCase, :Upper, "Letter, uppercase"), - (:Ll, 2, :LowerCase, :Letter, "Letter, lowercase"), - (:Lt, 3, :TitleCase, :Upper, "Letter, titlecase"), - (:Lm, 4, :ModifierLetter, :Letter, "Letter, modifier"), - (:Lo, 5, :OtherLetter, :Letter, "Letter, other"), - (:Mn, 6, :NonSpacingMark, :Mark, "Mark, nonspacing"), - (:Mc, 7, :CombiningMark, :Mark, "Mark, spacing combining"), - (:Me, 8, :EnclosingMark, :Mark, "Mark, enclosing"), - (:Nd, 9, :DecimalDigit, :Number, "Number, decimal digit"), - (:Nl, 10, 
:NumericLetter, :Number, "Number, letter"), - (:No, 11, :OtherNumber, :Number, "Number, other"), - (:Pc, 12, :ConnectorPunctuation, :Punctuation, "Punctuation, connector"), - (:Pd, 13, :DashPunctuation, :Punctuation, "Punctuation, dash"), - (:Ps, 14, :OpenPunctuation, :Punctuation, "Punctuation, open"), - (:Pe, 15, :ClosePunctuation, :Punctuation, "Punctuation, close"), - (:Pi, 16, :InitialQuotePunctuation, :Punctuation, "Punctuation, initial quote"), - (:Pf, 17, :FinalQuotePunctuation, :Punctuation, "Punctuation, final quote"), - (:Po, 18, :OtherPunctuation, :Punctuation, "Punctuation, other"), - (:Sm, 19, :MathSymbol, :Symbol, "Symbol, math"), - (:Sc, 20, :CurrencySymbol, :Symbol, "Symbol, currency"), - (:Sk, 21, :ModifierSymbol, :Symbol, "Symbol, modifier"), - (:So, 22, :OtherSymbol, :Symbol, "Symbol, other"), - (:Zs, 23, :SpaceSeparator, :Separator, "Separator, space"), - (:Zl, 24, :LineSeparator, :Separator, "Separator, line"), - (:Zp, 25, :ParagraphSeparator, :Separator, "Separator, paragraph"), - (:Cc, 26, :ControlChar, :Other, "Other, control"), - (:Cf, 27, :FormatChar, :Other, "Other, format"), - (:Cs, 28, :SurrogateChar, :Other, "Other, surrogate"), - (:Co, 29, :PrivateUseChar, :Other, "Other, private use")) - @eval const global $nam = $(Code(val)) - @eval abstract $cat <: $typ - @eval push!($c2t, $cat) - @eval Base.convert(::Type{Code}, ct::$cat) = $(Code(val)) - @eval @doc $(string("Unicode Category Code: ",des)) $nam - @eval @doc $(string("Unicode Category Type: ",des)) $cat - end - @eval const global code2general = $c2t +"""Unicode character category mask""" +bitstype 32 Mask + +Base.convert(::Type{Mask}, x::Integer) = reinterpret(Mask, x%UInt32) +Base.convert{T<:Integer}(::Type{T}, x::Mask) = convert(T, reinterpret(UInt32, x)) +Base.promote_rule{T<:Integer}(::Type{T}, ::Type{Mask}) = T + +Base.convert(::Type{Mask}, c::Code) = Mask(1< Date: Tue, 15 Dec 2015 18:14:34 -0500 Subject: [PATCH 7/7] Remove Category from exports Updated tests to use 
Unicode.Category --- base/exports.jl | 1 - test/unicode/properties.jl | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/base/exports.jl b/base/exports.jl index fb0d3b39beeca..f07a8441669ff 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -22,7 +22,6 @@ export Serializer, Docs, Markdown, - Category, Unicode, # Types diff --git a/test/unicode/properties.jl b/test/unicode/properties.jl index 58170c1328d43..66b010176f2c4 100644 --- a/test/unicode/properties.jl +++ b/test/unicode/properties.jl @@ -143,9 +143,9 @@ end # check handling of CN category constants let c_ll = 'β', c_cn = '\u038B' - @test charprop(Category.Code, c_ll) == Category.Ll + @test charprop(Unicode.Category.Code, c_ll) == Unicode.Category.Ll # check codepoint with category code CN - @test charprop(Category.Code, c_cn) == Category.Cn + @test charprop(Unicode.Category.Code, c_cn) == Unicode.Category.Cn end # Make sure fastplus is called for coverage