Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unicode character properties #14387

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ export
Serializer,
Docs,
Markdown,
Unicode,

# Types
AbstractChannel,
Expand Down Expand Up @@ -116,6 +117,7 @@ export
SymTridiagonal,
Timer,
Tridiagonal,
UnicodeProperty,
UnitRange,
UpperTriangular,
UTF16String,
Expand Down Expand Up @@ -818,6 +820,7 @@ export
bits,
bytes2hex,
bytestring,
charprop,
charwidth,
chomp,
chop,
Expand Down
4 changes: 2 additions & 2 deletions base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -176,15 +176,15 @@ function read(s::IO, ::Type{Char})
end

# mimic utf8.next function
trailing = Base.utf8_trailing[ch+1]
trailing = Unicode.utf8_trailing[ch+1]
c::UInt32 = 0
for j = 1:trailing
c += ch
c <<= 6
ch = read(s, UInt8)
end
c += ch
c -= Base.utf8_offset[trailing+1]
c -= Unicode.utf8_offset[trailing+1]
Char(c)
end

Expand Down
15 changes: 14 additions & 1 deletion base/unicode.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,23 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

# Module that gathers the Unicode / UTF string sources listed below under one namespace
# and exports their public names.
module Unicode
# Base functions imported by name (presumably so the included files can add methods
# to them — confirm against those files).
import Base: string, convert, write, length, endof, next, reverseind, lastidx, reverse, isvalid,
sizeof, unsafe_convert, map, getindex, search, rsearch, pointer, containsnul,
lowercase, uppercase, eltype, isless, promote_rule, ==

# Names made available to code that does `using`/`importall` on this module.
export UnicodeError, UTF16String, UTF32String, unsafe_checkstring, checkstring,
utf8, utf16, utf32, containsnul, WString, wstring, charprop, Category,
is_assigned_char, islower, isupper, isdigit, isalpha, isnumber, isalnum, iscntrl,
ispunct, isspace, isprint, isgraph,
isgraphemebreak, GraphemeIterator, normalize_string, graphemes, charwidth

# Source files are evaluated in this order inside the module.
include("unicode/UnicodeError.jl")
include("unicode/types.jl")
include("unicode/checkstring.jl")
include("unicode/utf8.jl")
include("unicode/utf16.jl")
include("unicode/utf32.jl")
include("unicode/properties.jl")
include("unicode/utf8proc.jl")
# Pull every name from the UTF8proc submodule into this module
# (presumably defined by unicode/utf8proc.jl — TODO confirm).
importall .UTF8proc
end
importall .Unicode
43 changes: 22 additions & 21 deletions base/unicode/UnicodeError.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,31 @@

## Error messages for Unicode / UTF support

const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)"
const UTF_ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
const UTF_ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
const UTF_ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
const UTF_ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
const UTF_ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>"
const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
const UTF_ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
const UTF_ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
const UTF_ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>"
const UTF_ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>"
const UTF_ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)"
const UTF_ERR_INVALID_8 = "invalid UTF-8 data"
const UTF_ERR_INVALID_16 = "invalid UTF-16 data"
const UTF_ERR_INVALID_INDEX = "invalid character index"
const UTF_ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
# Message templates stored in UnicodeError.errmsg.
# When a UnicodeError is shown, "<<1>>" is replaced with the error position and
# "<<2>>" with the offending value in hexadecimal, so every template that opens a
# parenthesis around "0x<<2>>" must also close it.
const ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)"
const ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
const ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
const ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
const ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
const ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>)"
const ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
const ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
const ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
const ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
const ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
const ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>"
const ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>"
const ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)"
const ERR_INVALID_8 = "invalid UTF-8 data"
const ERR_INVALID_16 = "invalid UTF-16 data"
const ERR_INVALID_INDEX = "invalid character index"
const ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"

# Exception type carrying a message template together with the position and the
# value that triggered the error.
# NOTE(review): `errmsg` is declared twice below; the two lines differ only in their
# trailing comment (old `UTF_ERR_` vs. new `Unicode.ERR_` naming). This looks like both
# sides of a diff — confirm that only one declaration survives in the real file.
type UnicodeError <: Exception
errmsg::AbstractString ##< A UTF_ERR_ message
errmsg::AbstractString ##< A Unicode.ERR_ message
errpos::Int32 ##< Position of invalid character
errchr::UInt32 ##< Invalid character
end

# Print a UnicodeError as "UnicodeError: " followed by its message template, with
# "<<1>>" replaced by string(exc.errpos) and "<<2>>" replaced by hex(exc.errchr).
# NOTE(review): two definitions with the same body appear here, one for an unqualified
# `show` and one for `Base.show`; presumably only the qualified one is meant to remain.
show(io::IO, exc::UnicodeError) = print(io, replace(replace(string("UnicodeError: ",exc.errmsg),
"<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr)))
Base.show(io::IO, exc::UnicodeError) =
print(io, replace(replace(string("UnicodeError: ",exc.errmsg),
"<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr)))
40 changes: 20 additions & 20 deletions base/unicode/checkstring.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ const UTF_SURROGATE = 32 ##< surrogate pairs present
## Get a UTF-8 continuation byte, give error if invalid, return updated character value
## `ch` is the value accumulated so far, `byt` is the candidate continuation byte, and
## `pos` is the position reported in the UnicodeError if `byt` is rejected.
@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
if !is_valid_continuation(byt)
throw(UnicodeError(UTF_ERR_CONT, pos, byt))
# NOTE(review): unreachable as written, because the throw above always fires first.
# The two lines differ only in the constant name (UTF_ERR_CONT vs. ERR_CONT); confirm
# which one is meant to remain.
throw(UnicodeError(ERR_CONT, pos, byt))
end
# Shift the accumulated value left by 6 bits and merge in the low 6 bits of `byt`.
(ch << 6) | (byt & 0x3f)
end
Expand Down Expand Up @@ -73,7 +73,7 @@ function unsafe_checkstring(dat::Vector{UInt8},
# Check UTF-8 encoding
if ch < 0xe0
# 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
(pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
(pos > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch))
byt, pos = next(dat, pos)
ch = get_continuation(ch & 0x3f, byt, pos)
if ch > 0x7f
Expand All @@ -84,28 +84,28 @@ function unsafe_checkstring(dat::Vector{UInt8},
elseif (ch == 0) && accept_long_null
flags |= UTF_LONG
else
throw(UnicodeError(UTF_ERR_LONG, pos, ch))
throw(UnicodeError(ERR_LONG, pos, ch))
end
elseif ch < 0xf0
# 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
(pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
(pos + 1 > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch))
byt, pos = next(dat, pos)
ch = get_continuation(ch & 0x0f, byt, pos)
byt, pos = next(dat, pos)
ch = get_continuation(ch, byt, pos)
# check for surrogate pairs, make sure correct
if is_surrogate_codeunit(ch)
!is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
!is_surrogate_lead(ch) && throw(UnicodeError(ERR_NOT_LEAD, pos-2, ch))
# next character *must* be a trailing surrogate character
(pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
(pos + 2 > endpos) && throw(UnicodeError(ERR_MISSING_SURROGATE, pos-2, ch))
byt, pos = next(dat, pos)
(byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
(byt != 0xed) && throw(UnicodeError(ERR_NOT_TRAIL, pos, byt))
byt, pos = next(dat, pos)
surr = get_continuation(0x0000d, byt, pos)
byt, pos = next(dat, pos)
surr = get_continuation(surr, byt, pos)
!is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
!accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
!is_surrogate_trail(surr) && throw(UnicodeError(ERR_NOT_TRAIL, pos-2, surr))
!accept_surrogates && throw(UnicodeError(ERR_SURROGATE, pos-2, surr))
flags |= UTF_SURROGATE
num4byte += 1
elseif ch > 0x07ff
Expand All @@ -114,23 +114,23 @@ function unsafe_checkstring(dat::Vector{UInt8},
flags |= UTF_LONG
num2byte += 1
else
throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
throw(UnicodeError(ERR_LONG, pos-2, ch))
end
elseif ch < 0xf5
# 4-byte UTF-8 sequence (i.e. characters > 0xffff)
(pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
(pos + 2 > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch))
byt, pos = next(dat, pos)
ch = get_continuation(ch & 0x07, byt, pos)
byt, pos = next(dat, pos)
ch = get_continuation(ch, byt, pos)
byt, pos = next(dat, pos)
ch = get_continuation(ch, byt, pos)
if ch > 0x10ffff
throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
throw(UnicodeError(ERR_INVALID, pos-3, ch))
elseif ch > 0xffff
num4byte += 1
elseif is_surrogate_codeunit(ch)
throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
throw(UnicodeError(ERR_SURROGATE, pos-3, ch))
elseif accept_long_char
# This is an overly long encoded character
flags |= UTF_LONG
Expand All @@ -140,10 +140,10 @@ function unsafe_checkstring(dat::Vector{UInt8},
num2byte += 1
end
else
throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
throw(UnicodeError(ERR_LONG, pos-2, ch))
end
else
throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
throw(UnicodeError(ERR_INVALID, pos, ch))
end
end
end
Expand Down Expand Up @@ -174,22 +174,22 @@ function unsafe_checkstring{T <: Union{Vector{UInt16}, Vector{UInt32}, AbstractS
num2byte += 1
flags |= UTF_UNICODE2
elseif ch > 0x0ffff
(ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
(ch > 0x10ffff) && throw(UnicodeError(ERR_INVALID, pos, ch))
num4byte += 1
elseif !is_surrogate_codeunit(ch)
num3byte += 1
elseif is_surrogate_lead(ch)
pos > endpos && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, ch))
pos > endpos && throw(UnicodeError(ERR_MISSING_SURROGATE, pos, ch))
# next character *must* be a trailing surrogate character
ch, pos = next(dat, pos)
!is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch))
!is_surrogate_trail(ch) && throw(UnicodeError(ERR_NOT_TRAIL, pos, ch))
num4byte += 1
if T != Vector{UInt16}
!accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch))
!accept_surrogates && throw(UnicodeError(ERR_SURROGATE, pos, ch))
flags |= UTF_SURROGATE
end
else
throw(UnicodeError(UTF_ERR_NOT_LEAD, pos, ch))
throw(UnicodeError(ERR_NOT_LEAD, pos, ch))
end
end
end
Expand Down
Loading