Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Unicode related error messages #11573

Merged
merged 1 commit into from
Jun 5, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ export
SystemError,
TypeError,
AssertionError,
UnicodeError,

# Global constants and variables
ARGS,
Expand Down
2 changes: 2 additions & 0 deletions base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ include("iterator.jl")
include("osutils.jl")

# strings & printing
include("utferror.jl")
include("utftypes.jl")
include("char.jl")
include("ascii.jl")
include("utf8.jl")
Expand Down
20 changes: 5 additions & 15 deletions base/utf16.jl
Original file line number Diff line number Diff line change
@@ -1,15 +1,5 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

immutable UTF16String <: AbstractString
data::Array{UInt16,1} # includes 16-bit NULL termination after string chars
function UTF16String(data::Vector{UInt16})
if length(data) < 1 || data[end] != 0
throw(ArgumentError("UTF16String data must be NULL-terminated"))
end
new(data)
end
end

utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
Expand Down Expand Up @@ -39,7 +29,7 @@ function next(s::UTF16String, i::Int)
elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
end
throw(ArgumentError("invalid UTF-16 character index"))
throw(UnicodeError(UTF_ERR_INVALID_INDEX,0,0))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess the GC frame concern still apply here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yuyichao Please get that fixed! 😀

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've actually got a new idea this morning and would like to try out. (Sth like a throw local noinline). Maybe that can bring down the gap between #11284 and #11508 .

end

function reverseind(s::UTF16String, i::Integer)
Expand Down Expand Up @@ -74,7 +64,7 @@ function encode16(s::AbstractString)
push!(buf, UInt16(0xd7c0 + (c>>10)))
push!(buf, UInt16(0xdc00 + (c & 0x3ff)))
else
throw(ArgumentError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)"))
throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one's actually less information because of losing the > 0x10ffff, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, that was one that I added myself (my very first PR). I can put it back in.
(I was thinking of using the same error message for cases like 0xd800 <= ch <= 0xdfff,
but that's just a false economy)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you put in the extra info and don't mind taking it out, sounds fine by me.

end
end
push!(buf, 0) # NULL termination
Expand Down Expand Up @@ -111,7 +101,7 @@ function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
end

function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
!isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
!isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
len = length(data)
d = Array(UInt16, len + 1)
d[end] = 0 # NULL terminate
Expand All @@ -126,7 +116,7 @@ convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =

function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
isempty(bytes) && return UTF16String(UInt16[0])
isodd(length(bytes)) && throw(ArgumentError("odd number of bytes"))
isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
data = reinterpret(UInt16, bytes)
# check for byte-order mark (BOM):
if data[1] == 0xfeff # native byte order
Expand All @@ -142,7 +132,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
copy!(d,1, data,1, length(data)) # assume native byte order
end
d[end] = 0 # NULL terminate
!isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
!isvalid(UTF16String, d) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
UTF16String(d)
end

Expand Down
20 changes: 2 additions & 18 deletions base/utf32.jl
Original file line number Diff line number Diff line change
@@ -1,19 +1,5 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

## UTF-32 in the native byte order, i.e. plain old character arrays ##

immutable UTF32String <: DirectIndexString
data::Vector{Char} # includes 32-bit NULL termination after string chars

function UTF32String(a::Vector{Char})
if length(a) < 1 || a[end] != Char(0)
throw(ArgumentError("UTF32String data must be NULL-terminated"))
end
new(a)
end
end
UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))

next(s::UTF32String, i::Int) = (s.data[i], i+1)
endof(s::UTF32String) = length(s.data) - 1
length(s::UTF32String) = length(s.data) - 1
Expand Down Expand Up @@ -65,7 +51,7 @@ unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =

function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
isempty(bytes) && return UTF32String(Char[0])
length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes"))
length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0))
data = reinterpret(Char, bytes)
# check for byte-order mark (BOM):
if data[1] == Char(0x0000feff) # native byte order
Expand All @@ -91,8 +77,6 @@ function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
return true
end
isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)

utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
Expand All @@ -110,7 +94,7 @@ function map(f, s::UTF32String)
for i = 1:(length(d)-1)
c2 = f(d[i])
if !isa(c2, Char)
throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
end
out[i] = (c2::Char)
end
Expand Down
6 changes: 3 additions & 3 deletions base/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ function next(s::UTF8String, i::Int)
end
if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
# b is a continuation byte of a valid UTF-8 character
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know you didn't write this, but should this comment be referring to d instead of b?

nevermind

throw(ArgumentError("invalid UTF-8 character index"))
throw(UnicodeError(UTF_ERR_CONT, i, d[j]))
end
# move past 1 byte in case the data is actually Latin-1
return '\ufffd', i+1
Expand Down Expand Up @@ -198,7 +198,7 @@ function reverse(s::UTF8String)
out = similar(s.data)
if ccall(:u8_reverse, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t),
out, s.data, length(out)) == 1
throw(ArgumentError("invalid UTF-8 data"))
throw(UnicodeError(UTF_ERR_INVALID_8,0,0))
end
UTF8String(out)
end
Expand All @@ -212,7 +212,7 @@ write(io::IO, s::UTF8String) = write(io, s.data)
utf8(x) = convert(UTF8String, x)
convert(::Type{UTF8String}, s::UTF8String) = s
convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence"))
convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8))
function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
l = length(a)
idx = 1
Expand Down
30 changes: 30 additions & 0 deletions base/utferror.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

##\brief Error messages for Unicode / UTF support
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see what you did there. At least the backslash is less confusing here than @. @jakebolewski what do you think, can we live with this? Or merge anyway and delete it afterwards if we really don't like the doxygen-style comments?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I see the point about looking like a macro... that was done because in Python, for a docstring, you must use @ instead of \, because \ is an escape character inside a string.
Maybe for Julia, to put doxygen information into doc strings, we'd need to use a different start character, to avoid the confusion with macros, and then convert it to @ before passing them to
the doxygen processor.
The only difference now between "plain text" and working doxygen (it actually is the valid Python doxygen style), is the added \. Whether or not doxygen use becomes prevalent (see #11123) in the rest of julia, that information is very useful, so I'd hope you'd accept having a few extra \ characters...
If nobody starts documenting the code in Base, it never is going to happen!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comments are awesome. Extra annotations in the comments that don't do anything yet and don't mean anything to the majority of us who've probably never used doxygen are just noise.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, but to me, they help a lot when working on the code, even without the very nice stuff that doxygen builds from them... (have you ever tried doxygen on a very large project? It really is helpful...)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, but you're contributing to a pre-existing large project that doesn't use doxygen (yet, maybe that will change but that's not for this PR to worry about), and at least 2 core contributors have told you to knock it off with the doxygen annotations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@one-more-minute You've convinced me... (as a matter of fact, I've worked for a while doing just that, pulling information out of unstructured data... I should have thought of that!).
However, don't you need at least some standardization of tags, so you could pick out which part documents the parameters, which part the return value, which part the errors thrown (if any)?
For example: I've seen ### Arguments:, ### Returns:, Inputs:, Outputs:, ### Arguments, ### Keyword Arguments, etc... as you see, no consistency...
It doesn't even have to be complete consistency, it can be a many-to-one mapping... fairly free form, as long as there aren't too many options for the metadata parser to figure out, or for people to understand.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would at least give other Julia contributors time to evaluate the format, good or bad.

Well I've been evaluating by spending more time than I'd like reviewing your PR's, and so far I don't like the doxygen markup. But I don't care enough to let it block this PR which is otherwise worth merging, so I'm just going to do that now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And I was just going to change this to using something more along the lines of Mike's idea... 😀 I guess I can do that in a later PR, as soon as the dust as settled a bit on the parsing Markdown ideas... Thanks! (and thanks for taking the time, and pushing back on the doxygen format, because now I will push for something better for documentation, which will still allow me to generate output for doxygen, but which fits better with Julia, and which I hope you do like)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, don't you need at least some standardization of tags

Sure, I think it's completely reasonable to settle on some conventions eventually (these things usually happen naturally anyway). I think @catawbasam is right about side issues, though, so let's keep that discussion focussed in #8966 so that things can move forward here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tkelman and I've been spending more time than I'd like changing things so frequently 😀 but nobody has been holding a gun to either of our heads to make you review, or me to respond to your reviews. A long process yes, and I'm sorry for the time it has taken, but I think in the end, the final result for julia is much better. Thanks again for your time.


const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)"
const UTF_ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
const UTF_ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
const UTF_ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
const UTF_ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
const UTF_ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>"
const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
const UTF_ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
const UTF_ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
const UTF_ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>"
const UTF_ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>"
const UTF_ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)"
const UTF_ERR_INVALID_8 = "invalid UTF-8 data"
const UTF_ERR_INVALID_16 = "invalid UTF-16 data"
const UTF_ERR_INVALID_INDEX = "invalid character index"
const UTF_ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"

type UnicodeError <: Exception
errmsg::AbstractString ##< A UTF_ERR_ message
errpos::Int32 ##< Position of invalid character
errchr::UInt32 ##< Invalid character
end

show(io::IO, exc::UnicodeError) = print(io, replace(replace(exc.errmsg,"<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr)))
34 changes: 34 additions & 0 deletions base/utftypes.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

##\brief Base UTF16String type, has 16-bit NULL termination word after data, native byte order
#
# \throws UnicodeError

immutable UTF16String <: AbstractString
data::Vector{UInt16} # includes 16-bit NULL termination after string chars
function UTF16String(data::Vector{UInt16})
if length(data) < 1 || data[end] != 0
throw(UnicodeError(UTF_ERR_NULL_16_TERMINATE, 0, 0))
end
new(data)
end
end

##\brief Base UTF32String type, has 32-bit NULL termination word after data, native byte order
#
# \throws UnicodeError

immutable UTF32String <: DirectIndexString
data::Vector{Char} # includes 32-bit NULL termination after string chars

function UTF32String(data::Vector{Char})
if length(data) < 1 || data[end] != Char(0)
throw(UnicodeError(UTF_ERR_NULL_32_TERMINATE, 0, 0))
end
new(data)
end
end
UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))

isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)
4 changes: 3 additions & 1 deletion test/unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ u16 = utf16(u8)
@test collect(u8) == collect(u16)
@test u8 == utf16(u16.data[1:end-1]) == utf16(copy!(Array(UInt8, 18), 1, reinterpret(UInt8, u16.data), 1, 18))
@test u8 == utf16(pointer(u16)) == utf16(convert(Ptr{Int16}, pointer(u16)))
@test_throws ArgumentError utf16(utf32(Char(0x120000)))
@test_throws UnicodeError utf16(utf32(Char(0x120000)))
@test_throws UnicodeError utf16(UInt8[1,2,3])

# UTF32
u32 = utf32(u8)
Expand All @@ -21,6 +22,7 @@ u32 = utf32(u8)
@test collect(u8) == collect(u32)
@test u8 == utf32(u32.data[1:end-1]) == utf32(copy!(Array(UInt8, 20), 1, reinterpret(UInt8, u32.data), 1, 20))
@test u8 == utf32(pointer(u32)) == utf32(convert(Ptr{Int32}, pointer(u32)))
@test_throws UnicodeError utf32(UInt8[1,2,3])

# Wstring
w = wstring(u8)
Expand Down