diff --git a/base/exports.jl b/base/exports.jl
index 7547a09d791a0..db68d7eea920b 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -165,6 +165,7 @@ export
     SystemError,
     TypeError,
     AssertionError,
+    UnicodeError,
 
 # Global constants and variables
     ARGS,
diff --git a/base/sysimg.jl b/base/sysimg.jl
index 73ead07c577f3..e299a2d0b90b0 100644
--- a/base/sysimg.jl
+++ b/base/sysimg.jl
@@ -84,6 +84,8 @@ include("iterator.jl")
 include("osutils.jl")
 
 # strings & printing
+include("utferror.jl")
+include("utftypes.jl")
 include("char.jl")
 include("ascii.jl")
 include("utf8.jl")
diff --git a/base/utf16.jl b/base/utf16.jl
index 59c1e37cc799a..11c22f9dde65e 100644
--- a/base/utf16.jl
+++ b/base/utf16.jl
@@ -1,15 +1,5 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
-immutable UTF16String <: AbstractString
-    data::Array{UInt16,1} # includes 16-bit NULL termination after string chars
-    function UTF16String(data::Vector{UInt16})
-        if length(data) < 1 || data[end] != 0
-            throw(ArgumentError("UTF16String data must be NULL-terminated"))
-        end
-        new(data)
-    end
-end
-
 utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
 utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
 utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
@@ -39,7 +29,7 @@ function next(s::UTF16String, i::Int)
     elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
         return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
     end
-    throw(ArgumentError("invalid UTF-16 character index"))
+    throw(UnicodeError(UTF_ERR_INVALID_INDEX,0,0))
 end
 
 function reverseind(s::UTF16String, i::Integer)
@@ -74,7 +64,7 @@ function encode16(s::AbstractString)
             push!(buf, UInt16(0xd7c0 + (c>>10)))
             push!(buf, UInt16(0xdc00 + (c & 0x3ff)))
         else
-            throw(ArgumentError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)"))
+            throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch))
         end
     end
     push!(buf, 0) # NULL termination
@@ -111,7 +101,7 @@ function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
 end
 
 function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
-    !isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
+    !isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
     len = length(data)
     d = Array(UInt16, len + 1)
     d[end] = 0 # NULL terminate
@@ -126,7 +116,7 @@ convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =
 
 function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
     isempty(bytes) && return UTF16String(UInt16[0])
-    isodd(length(bytes)) && throw(ArgumentError("odd number of bytes"))
+    isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
     data = reinterpret(UInt16, bytes)
     # check for byte-order mark (BOM):
     if data[1] == 0xfeff # native byte order
@@ -142,7 +132,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
         copy!(d,1, data,1, length(data)) # assume native byte order
     end
     d[end] = 0 # NULL terminate
-    !isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
+    !isvalid(UTF16String, d) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
     UTF16String(d)
 end
 
diff --git a/base/utf32.jl b/base/utf32.jl
index 419e104e33dfb..c12c396610131 100644
--- a/base/utf32.jl
+++ b/base/utf32.jl
@@ -1,19 +1,5 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
-## UTF-32 in the native byte order, i.e. plain old character arrays ##
-
-immutable UTF32String <: DirectIndexString
-    data::Vector{Char} # includes 32-bit NULL termination after string chars
-
-    function UTF32String(a::Vector{Char})
-        if length(a) < 1 || a[end] != Char(0)
-            throw(ArgumentError("UTF32String data must be NULL-terminated"))
-        end
-        new(a)
-    end
-end
-UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))
-
 next(s::UTF32String, i::Int) = (s.data[i], i+1)
 endof(s::UTF32String) = length(s.data) - 1
 length(s::UTF32String) = length(s.data) - 1
@@ -65,7 +51,7 @@ unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
 
 function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
     isempty(bytes) && return UTF32String(Char[0])
-    length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes"))
+    length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32, length(bytes), 0))
     data = reinterpret(Char, bytes)
     # check for byte-order mark (BOM):
     if data[1] == Char(0x0000feff) # native byte order
@@ -91,8 +77,6 @@ function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
     return true
 end
 isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
-isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
-isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)
 
 utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
 utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
@@ -110,7 +94,7 @@ function map(f, s::UTF32String)
     for i = 1:(length(d)-1)
         c2 = f(d[i])
         if !isa(c2, Char)
-            throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
+            throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
         end
         out[i] = (c2::Char)
     end
diff --git a/base/utf8.jl b/base/utf8.jl
index f288a1c0fa3dd..d029485d2fc34 100644
--- a/base/utf8.jl
+++ b/base/utf8.jl
@@ -72,7 +72,7 @@ function next(s::UTF8String, i::Int)
         end
         if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
             # b is a continuation byte of a valid UTF-8 character
-            throw(ArgumentError("invalid UTF-8 character index"))
+            throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[j]))
         end
         # move past 1 byte in case the data is actually Latin-1
         return '\ufffd', i+1
@@ -198,7 +198,7 @@ function reverse(s::UTF8String)
     out = similar(s.data)
     if ccall(:u8_reverse, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t),
              out, s.data, length(out)) == 1
-        throw(ArgumentError("invalid UTF-8 data"))
+        throw(UnicodeError(UTF_ERR_INVALID_8,0,0))
     end
     UTF8String(out)
 end
@@ -212,7 +212,7 @@ write(io::IO, s::UTF8String) = write(io, s.data)
 utf8(x) = convert(UTF8String, x)
 convert(::Type{UTF8String}, s::UTF8String) = s
 convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
-convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence"))
+convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8, 0, 0))
 function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
     l = length(a)
     idx = 1
diff --git a/base/utferror.jl b/base/utferror.jl
new file mode 100644
index 0000000000000..27b36e45b44fb
--- /dev/null
+++ b/base/utferror.jl
@@ -0,0 +1,35 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+##\brief Error messages for Unicode / UTF support
+#
+# Messages may contain the placeholders <<1>> and <<2>>; when a UnicodeError is
+# shown they are replaced by its errpos (decimal) and errchr (hexadecimal).
+
+const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)"
+const UTF_ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
+const UTF_ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
+const UTF_ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const UTF_ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const UTF_ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>)"
+const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
+const UTF_ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
+const UTF_ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
+const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
+const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
+const UTF_ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>"
+const UTF_ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>"
+const UTF_ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)"
+const UTF_ERR_INVALID_8 = "invalid UTF-8 data"
+const UTF_ERR_INVALID_16 = "invalid UTF-16 data"
+const UTF_ERR_INVALID_INDEX = "invalid character index"
+const UTF_ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
+
+##\brief Exception thrown for invalid Unicode / UTF data or character indices
+type UnicodeError <: Exception
+    errmsg::AbstractString ##< A UTF_ERR_ message
+    errpos::Int32 ##< Position of invalid character
+    errchr::UInt32 ##< Invalid character
+end
+
+# Fill the <<1>> and <<2>> placeholders of the message from errpos and errchr
+show(io::IO, exc::UnicodeError) = print(io, replace(replace(exc.errmsg,"<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr)))
diff --git a/base/utftypes.jl b/base/utftypes.jl
new file mode 100644
index 0000000000000..749f1bd774832
--- /dev/null
+++ b/base/utftypes.jl
@@ -0,0 +1,34 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+##\brief Base UTF16String type, has 16-bit NULL termination word after data, native byte order
+#
+# \throws UnicodeError
+
+immutable UTF16String <: AbstractString
+    data::Vector{UInt16} # includes 16-bit NULL termination after string chars
+    function UTF16String(data::Vector{UInt16})
+        if length(data) < 1 || data[end] != 0
+            throw(UnicodeError(UTF_ERR_NULL_16_TERMINATE, 0, 0))
+        end
+        new(data)
+    end
+end
+
+##\brief Base UTF32String type, has 32-bit NULL termination word after data, native byte order
+#
+# \throws UnicodeError
+
+immutable UTF32String <: DirectIndexString
+    data::Vector{Char} # includes 32-bit NULL termination after string chars
+
+    function UTF32String(data::Vector{Char})
+        if length(data) < 1 || data[end] != Char(0)
+            throw(UnicodeError(UTF_ERR_NULL_32_TERMINATE, 0, 0))
+        end
+        new(data)
+    end
+end
+UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))
+
+isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
+isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)
diff --git a/test/unicode.jl b/test/unicode.jl
index 5d3a64285ca83..6af8e8e63a527 100644
--- a/test/unicode.jl
+++ b/test/unicode.jl
@@ -10,7 +10,9 @@ u16 = utf16(u8)
 @test collect(u8) == collect(u16)
 @test u8 == utf16(u16.data[1:end-1]) == utf16(copy!(Array(UInt8, 18), 1, reinterpret(UInt8, u16.data), 1, 18))
 @test u8 == utf16(pointer(u16)) == utf16(convert(Ptr{Int16}, pointer(u16)))
-@test_throws ArgumentError utf16(utf32(Char(0x120000)))
+@test_throws UnicodeError utf16(utf32(Char(0x120000)))
+@test_throws UnicodeError utf16(UInt8[1,2,3])
+@test_throws UnicodeError utf8(UInt8[0xff])
 
 # UTF32
 u32 = utf32(u8)
@@ -21,6 +23,7 @@ u32 = utf32(u8)
 @test collect(u8) == collect(u32)
 @test u8 == utf32(u32.data[1:end-1]) == utf32(copy!(Array(UInt8, 20), 1, reinterpret(UInt8, u32.data), 1, 20))
 @test u8 == utf32(pointer(u32)) == utf32(convert(Ptr{Int32}, pointer(u32)))
+@test_throws UnicodeError utf32(UInt8[1,2,3])
 
 # Wstring
 w = wstring(u8)