@@ -12,71 +12,78 @@ isvalid(ch::Char) = isvalid(Char, ch)
12
12
13
13
# Unicode General Category constants
14
14
15
+ module Cat
16
+ export Property, CharType, CharCode
17
+
15
18
""" Unicode character properties"""
16
- abstract UnicodeProperty
17
- """ Unicode character categories"""
18
- abstract CharCategory <: UnicodeProperty
19
-
20
- """ Unicode letter character category"""
21
- abstract CatLetter <: CharCategory
22
- """ Unicode Mark character category"""
23
- abstract CatMark <: CharCategory
24
- """ Unicode Numeric character category"""
25
- abstract CatNumber <: CharCategory
26
- """ Unicode Punctuation character category"""
27
- abstract CatPunctuation <: CharCategory
28
- """ Unicode Symbol character category"""
29
- abstract CatSymbol <: CharCategory
30
- """ Unicode Separator character category"""
31
- abstract CatSeparator <: CharCategory
32
- """ Unicode Other character category"""
33
- abstract CatOther <: CharCategory
19
+ abstract Property
20
+
21
+ """ Unicode character category type"""
22
+ abstract CharType <: Property
23
+
24
+ """ Unicode 'Letter' character category"""
25
+ abstract Letter <: CharType
26
+ """ Unicode 'Mark' character category"""
27
+ abstract Mark <: CharType
28
+ """ Unicode 'Number' character category"""
29
+ abstract Number <: CharType
30
+ """ Unicode 'Punctuation' character category"""
31
+ abstract Punctuation <: CharType
32
+ """ Unicode 'Symbol' character category"""
33
+ abstract Symbol <: CharType
34
+ """ Unicode 'Separator' character category"""
35
+ abstract Separator <: CharType
36
+ """ Unicode 'Other' character category"""
37
+ abstract Other <: CharType
34
38
35
39
""" Unicode uppercase & titlecase letters"""
36
- abstract CatUpper <: CatLetter
40
+ abstract Upper <: Letter
41
+
42
+ """ Unicode character category code (0-29)"""
43
+ bitstype 8 CharCode
37
44
38
- """ Unicode Character Category Code (0-29) """
39
- bitstype 8 CharCategoryCode
45
+ end # module Cat
46
+ import . Cat : Property, CharType, CharCode
40
47
41
- convert (:: Type{CharCategoryCode } , x:: Integer ) = reinterpret (CharCategoryCode , x% UInt8)
42
- convert {T<:Integer} (:: Type{T} , x:: CharCategoryCode ) = convert (T, reinterpret (UInt8, x))
43
- promote_rule {T<:Integer} (:: Type{T} , :: Type{CharCategoryCode } ) = T
44
- isless (x:: CharCategoryCode , y:: CharCategoryCode ) = isless (UInt32 (x), UInt32 (y))
45
- isless (x:: CharCategoryCode , y:: Integer ) = isless (UInt32 (x), y)
46
- isless (x:: Integer , y:: CharCategoryCode ) = isless (x, UInt32 (y))
48
+ convert (:: Type{CharCode } , x:: Integer ) = reinterpret (CharCode , x% UInt8)
49
+ convert {T<:Integer} (:: Type{T} , x:: CharCode ) = convert (T, reinterpret (UInt8, x))
50
+ promote_rule {T<:Integer} (:: Type{T} , :: Type{CharCode } ) = T
51
+ isless (x:: CharCode , y:: CharCode ) = isless (UInt8 (x), UInt8 (y))
52
+ isless (x:: CharCode , y:: Integer ) = isless (UInt8 (x), y)
53
+ isless (x:: Integer , y:: CharCode ) = isless (x, UInt8 (y))
47
54
48
55
for (nam, val, cat, typ, des) in
49
- ((:Cn , 0 , :NotAssignedChar , CatOther , " Other, Not assigned" ),
50
- (:Lu , 1 , :UpperCase , CatUpper , " Letter, uppercase" ),
51
- (:Ll , 2 , :LowerCase , CatLetter , " Letter, lowercase" ),
52
- (:Lt , 3 , :TitleCase , CatUpper , " Letter, titlecase" ),
53
- (:Lm , 4 , :ModifierLetter , CatLetter , " Letter, modifier" ),
54
- (:Lo , 5 , :OtherLetter , CatLetter , " Letter, other" ),
55
- (:Mn , 6 , :NonSpacingMark , CatMark , " Mark, nonspacing" ),
56
- (:Mc , 7 , :CombiningMark , CatMark , " Mark, spacing combining" ),
57
- (:Me , 8 , :EnclosingMark , CatMark , " Mark, enclosing" ),
58
- (:Nd , 9 , :DecimalDigit , CatNumber , " Number, decimal digit" ),
59
- (:Nl , 10 , :NumericLetter , CatNumber , " Number, letter" ),
60
- (:No , 11 , :OtherNumber , CatNumber , " Number, other" ),
61
- (:Pc , 12 , :ConnectorPunct , CatPunctuation , " Punctuation, connector" ),
62
- (:Pd , 13 , :DashPunct , CatPunctuation , " Punctuation, dash" ),
63
- (:Ps , 14 , :OpenPunct , CatPunctuation , " Punctuation, open" ),
64
- (:Pe , 15 , :ClosePunct , CatPunctuation , " Punctuation, close" ),
65
- (:Pi , 16 , :BegQuotePunct , CatPunctuation , " Punctuation, initial quote" ),
66
- (:Pf , 17 , :EndQuotePunct , CatPunctuation , " Punctuation, final quote" ),
67
- (:Po , 18 , :OtherPunct , CatPunctuation , " Punctuation, other" ),
68
- (:Sm , 19 , :MathSymbol , CatSymbol , " Symbol, math" ),
69
- (:Sc , 20 , :CurrencySymbol , CatSymbol , " Symbol, currency" ),
70
- (:Sk , 21 , :ModifierSymbol , CatSymbol , " Symbol, modifier" ),
71
- (:So , 22 , :OtherSymbol , CatSymbol , " Symbol, other" ),
72
- (:Zs , 23 , :SpaceSeparator , CatSeparator , " Separator, space" ),
73
- (:Zl , 24 , :LineSeparator , CatSeparator , " Separator, line" ),
74
- (:Zp , 25 , :ParagraphSeparator , CatSeparator , " Separator, paragraph" ),
75
- (:Cc , 26 , :ControlChar , CatOther , " Other, control" ),
76
- (:Cf , 27 , :FormatChar , CatOther , " Other, format" ),
77
- (:Cs , 28 , :SurrogateChar , CatOther , " Other, surrogate" ),
78
- (:Co , 29 , :PrivateUseChar , CatOther , " Other, private use" ))
79
- @eval const global $ nam = CharCategoryCode ($ val)
56
+ ((:Cn , 0 , :NotAssignedChar , Cat . Other , " Other, Not assigned" ),
57
+ (:Lu , 1 , :UpperCase , Cat . Upper , " Letter, uppercase" ),
58
+ (:Ll , 2 , :LowerCase , Cat . Letter , " Letter, lowercase" ),
59
+ (:Lt , 3 , :TitleCase , Cat . Upper , " Letter, titlecase" ),
60
+ (:Lm , 4 , :ModifierLetter , Cat . Letter , " Letter, modifier" ),
61
+ (:Lo , 5 , :OtherLetter , Cat . Letter , " Letter, other" ),
62
+ (:Mn , 6 , :NonSpacingMark , Cat . Mark , " Mark, nonspacing" ),
63
+ (:Mc , 7 , :CombiningMark , Cat . Mark , " Mark, spacing combining" ),
64
+ (:Me , 8 , :EnclosingMark , Cat . Mark , " Mark, enclosing" ),
65
+ (:Nd , 9 , :DecimalDigit , Cat . Number , " Number, decimal digit" ),
66
+ (:Nl , 10 , :NumericLetter , Cat . Number , " Number, letter" ),
67
+ (:No , 11 , :OtherNumber , Cat . Number , " Number, other" ),
68
+ (:Pc , 12 , :ConnectorPunct , Cat . Punctuation , " Punctuation, connector" ),
69
+ (:Pd , 13 , :DashPunct , Cat . Punctuation , " Punctuation, dash" ),
70
+ (:Ps , 14 , :OpenPunct , Cat . Punctuation , " Punctuation, open" ),
71
+ (:Pe , 15 , :ClosePunct , Cat . Punctuation , " Punctuation, close" ),
72
+ (:Pi , 16 , :BegQuotePunct , Cat . Punctuation , " Punctuation, initial quote" ),
73
+ (:Pf , 17 , :EndQuotePunct , Cat . Punctuation , " Punctuation, final quote" ),
74
+ (:Po , 18 , :OtherPunct , Cat . Punctuation , " Punctuation, other" ),
75
+ (:Sm , 19 , :MathSymbol , Cat . Symbol , " Symbol, math" ),
76
+ (:Sc , 20 , :CurrencySymbol , Cat . Symbol , " Symbol, currency" ),
77
+ (:Sk , 21 , :ModifierSymbol , Cat . Symbol , " Symbol, modifier" ),
78
+ (:So , 22 , :OtherSymbol , Cat . Symbol , " Symbol, other" ),
79
+ (:Zs , 23 , :SpaceSeparator , Cat . Separator , " Separator, space" ),
80
+ (:Zl , 24 , :LineSeparator , Cat . Separator , " Separator, line" ),
81
+ (:Zp , 25 , :ParagraphSeparator , Cat . Separator , " Separator, paragraph" ),
82
+ (:Cc , 26 , :ControlChar , Cat . Other , " Other, control" ),
83
+ (:Cf , 27 , :FormatChar , Cat . Other , " Other, format" ),
84
+ (:Cs , 28 , :SurrogateChar , Cat . Other , " Other, surrogate" ),
85
+ (:Co , 29 , :PrivateUseChar , Cat . Other , " Other, private use" ))
86
+ @eval const global $ nam = CharCode ($ val)
80
87
@eval export $ cat
81
88
@eval abstract $ cat <: $typ
82
89
@eval @doc $ (string (" Unicode Category Code: " ,des)) $ nam
@@ -94,54 +101,53 @@ const c2t = [NotAssignedChar, UpperCase, LowerCase, TitleCase, ModifierLetter, O
94
101
95
102
# ###########################################################################
96
103
97
-
98
104
"""
99
105
Return various Unicode properties for character
100
106
"""
101
107
function charprop end
102
108
103
- charprop (:: Type{CharCategory } , c) = c2t[Int (charprop (CharCategoryCode , c))+ 1 ]
109
+ charprop (:: Type{CharType } , c) = c2t[Int (charprop (CharCode , c))+ 1 ]
104
110
105
- is_assigned_char (c) = charprop (CharCategoryCode , c) != Cn
111
+ is_assigned_char (c) = charprop (CharCode , c) != Cn
106
112
107
113
# # libc character class predicates ##
108
114
109
- islower (c:: Char ) = charprop (CharCategoryCode , c) == Ll
115
+ islower (c:: Char ) = charprop (CharCode , c) == Ll
110
116
111
117
# true for Unicode uppercase and titlecase letters
112
- isupper (c:: Char ) = (ccode = charprop (CharCategoryCode , c)) == Lu || ccode == Lt
118
+ isupper (c:: Char ) = (ccode = charprop (CharCode , c)) == Lu || ccode == Lt
113
119
114
120
isdigit (c:: Char ) = (' 0' <= c <= ' 9' )
115
- isalpha (c:: Char ) = (Lu <= charprop (CharCategoryCode , c) <= Lo)
116
- isnumber (c:: Char ) = (Nd <= charprop (CharCategoryCode , c) <= No)
117
- isalnum (c:: Char ) = (Lu <= (ccode = charprop (CharCategoryCode , c)) <= Lo) || (Nd <= ccode <= No)
121
+ isalpha (c:: Char ) = (Lu <= charprop (CharCode , c) <= Lo)
122
+ isnumber (c:: Char ) = (Nd <= charprop (CharCode , c) <= No)
123
+ isalnum (c:: Char ) = (Lu <= (ccode = charprop (CharCode , c)) <= Lo) || (Nd <= ccode <= No)
118
124
119
125
# These are about 3 times slower, because the subtype check (`<:`)
120
126
# is much slower than checking if an integer is within a range (or two ranges).
121
127
# If that is sped up, then these, which are more readable, could replace the other forms.
122
128
#=
123
- isalpha(c::Char) = charprop(CharCategory , c) <: CatLetter
124
- isnumber(c::Char) = charprop(CharCategory , c) <: CatNumber
125
- isupper(c::Char) = charprop(CharCategory , c) <: CatUpper
126
- isalnum(c::Char) = charprop(CharCategory , c) <: Union{CatLetter, CatNumber}
127
- ispunct(c::Char) = charprop(CharCategory , c) <: CatPunctuation
129
+ isalpha(c::Char) = charprop(CharType , c) <: Cat.Letter
130
+ isnumber(c::Char) = charprop(CharType , c) <: Cat.Number
131
+ isupper(c::Char) = charprop(CharType , c) <: Cat.Upper
132
+ isalnum(c::Char) = charprop(CharType , c) <: Union{Cat.Letter, Cat.Number}
133
+ ispunct(c::Char) = charprop(CharType , c) <: Cat.Punctuation
128
134
=#
129
135
130
136
# Following C++, only control characters from the Latin-1 subset return true
131
137
iscntrl (c:: Char ) = (c <= Char (0x1f ) || Char (0x7f ) <= c <= Char (0x9f ))
132
138
133
- ispunct (c:: Char ) = (Pc <= charprop (CharCategoryCode , c) <= Po)
139
+ ispunct (c:: Char ) = (Pc <= charprop (CharCode , c) <= Po)
134
140
135
141
# \u85 is the Unicode Next Line (NEL) character
136
142
# the check for \ufffd allows for branch removal on ASCIIStrings
137
143
@inline isspace (c:: Char ) =
138
144
(c == ' ' || ' \t ' <= c <= ' \r ' || c == ' \u 85' ||
139
- (' \u a0' <= c && c != ' \u fffd' && charprop (CharCategoryCode , c) == Zs))
145
+ (' \u a0' <= c && c != ' \u fffd' && charprop (CharCode , c) == Zs))
140
146
141
- isprint (c:: Char ) = (Lu <= charprop (CharCategoryCode , c) <= Zs)
147
+ isprint (c:: Char ) = (Lu <= charprop (CharCode , c) <= Zs)
142
148
143
149
# true in principle if a printer would use ink
144
- isgraph (c:: Char ) = (Lu <= charprop (CharCategoryCode , c) <= So)
150
+ isgraph (c:: Char ) = (Lu <= charprop (CharCode , c) <= So)
145
151
146
152
for name = (" alnum" , " alpha" , " cntrl" , " digit" , " number" , " graph" ,
147
153
" lower" , " print" , " punct" , " space" , " upper" )
0 commit comments