From 8ca9d0ea9b872686438dc9afe6b968e00a7430ad Mon Sep 17 00:00:00 2001 From: Carlo Baldassi Date: Mon, 23 Jul 2018 03:40:41 +0200 Subject: [PATCH] Fix errors on julia 0.7, drop 0.5 support all tests pass without deprecation warnings; some deprecation warnings still remain though --- .travis.yml | 2 +- README.md | 1 + REQUIRE | 4 +- appveyor.yml | 4 +- src/LegacyStrings.jl | 123 ++++++++++++++++++++++++------------------- src/ascii.jl | 20 +++++-- src/directindex.jl | 12 +++-- src/rep.jl | 33 +++++++++--- src/rev.jl | 27 ++++++++-- src/support.jl | 21 ++++---- src/unicodeerror.jl | 11 ++++ src/utf16.jl | 45 +++++++++++----- src/utf32.jl | 46 +++++++++------- src/utf8.jl | 32 +++++++---- test/runtests.jl | 97 ++++++++++++++++++---------------- 15 files changed, 304 insertions(+), 174 deletions(-) create mode 100644 src/unicodeerror.jl diff --git a/.travis.yml b/.travis.yml index 25f5db4..51d0355 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,8 +4,8 @@ os: - linux - osx julia: - - 0.5 - 0.6 + - 0.7 - nightly notifications: email: false diff --git a/README.md b/README.md index 4a6edc7..2d72871 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ [![Julia 0.5 Status](http://pkg.julialang.org/badges/LegacyStrings_0.5.svg)](http://pkg.julialang.org/?pkg=LegacyStrings&ver=0.5) [![Julia 0.6 Status](http://pkg.julialang.org/badges/LegacyStrings_0.6.svg)](http://pkg.julialang.org/?pkg=LegacyStrings&ver=0.6) +[![Julia 0.7 Status](http://pkg.julialang.org/badges/LegacyStrings_0.7.svg)](http://pkg.julialang.org/?pkg=LegacyStrings&ver=0.7) The LegacyStrings package provides compatibility string types from Julia 0.5 (and earlier), which were removed in subsequent versions, including: diff --git a/REQUIRE b/REQUIRE index 8a3f6b8..5b16f97 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,2 +1,2 @@ -julia 0.5 -Compat 0.18.0 +julia 0.6 +Compat 0.67 diff --git a/appveyor.yml b/appveyor.yml index 112521c..0e046e9 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,9 +1,9 @@ environment: matrix: - - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.5/julia-0.5-latest-win32.exe" - - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.5/julia-0.5-latest-win64.exe" - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe" - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe" + - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.7/julia-0.7-latest-win32.exe" + - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.7/julia-0.7-latest-win64.exe" - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe" - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe" diff --git a/src/LegacyStrings.jl b/src/LegacyStrings.jl index 0f448b8..684fe0f 100644 --- a/src/LegacyStrings.jl +++ b/src/LegacyStrings.jl @@ -23,7 +23,6 @@ export import Base: containsnul, convert, - endof, getindex, isvalid, lcfirst, @@ -31,7 +30,9 @@ import Base: lowercase, map, next, + nextind, pointer, + prevind, reverse, reverseind, rsearch, @@ -45,70 +46,82 @@ import Base: write using Compat +using Compat: IOBuffer +import Compat: + lastindex, + codeunit, + ncodeunits - if isdefined(Base, :lastidx) - import Base: lastidx - end +if isdefined(Base, :iterate) + import Base: iterate +end - if isdefined(Base, :DirectIndexString) - using Base: DirectIndexString - else - include("directindex.jl") - end +if isdefined(Base, :UnicodeError) + import Base: UnicodeError +else + include("unicodeerror.jl") +end - if VERSION >= v"0.5.0-" - immutable ASCIIString <: DirectIndexString - data::Vector{UInt8} - ASCIIString(data::String) = new(Vector{UInt8}(data)) - ASCIIString(data) = new(data) - end +if isdefined(Base, :DirectIndexString) + using Base: DirectIndexString +else + include("directindex.jl") +end - immutable UTF8String <: AbstractString - data::Vector{UInt8} - UTF8String(data::String) = new(Vector{UInt8}(data)) - UTF8String(data) = new(data) - end +struct ASCIIString <: DirectIndexString + data::Vector{UInt8} + ASCIIString(data::String) = new(Vector{UInt8}(codeunits(data))) + ASCIIString(data) = new(data) +end - immutable UTF16String <: AbstractString - data::Vector{UInt16} # includes 16-bit NULL termination after string chars - function UTF16String(data::Vector{UInt16}) - if length(data) < 1 || data[end] != 0 - throw(UnicodeError(UTF_ERR_NULL_16_TERMINATE, 0, 0)) - end - new(data) - end +struct UTF8String <: AbstractString + data::Vector{UInt8} + UTF8String(data::String) = new(Vector{UInt8}(codeunits(data))) + UTF8String(data) = new(data) +end + +struct UTF16String <: AbstractString + data::Vector{UInt16} # includes 16-bit NULL termination after string chars + function UTF16String(data::Vector{UInt16}) + if length(data) < 1 || data[end] != 0 + throw(UnicodeError(UTF_ERR_NULL_16_TERMINATE, 0, 0)) end + new(data) + end +end - immutable UTF32String <: DirectIndexString - data::Vector{UInt32} # includes 32-bit NULL termination after string chars - function UTF32String(data::Vector{UInt32}) - if length(data) < 1 || data[end] != 0 - throw(UnicodeError(UTF_ERR_NULL_32_TERMINATE, 0, 0)) - end - new(data) - end +struct UTF32String <: DirectIndexString + data::Vector{UInt32} # includes 32-bit NULL termination after string chars + function UTF32String(data::Vector{UInt32}) + if length(data) < 1 || data[end] != 0 + throw(UnicodeError(UTF_ERR_NULL_32_TERMINATE, 0, 0)) end + new(data) + end +end - const ByteString = Union{ASCIIString,UTF8String} +const ByteString = Union{ASCIIString,UTF8String} - include("support.jl") - include("ascii.jl") - include("utf8.jl") - include("utf16.jl") - include("utf32.jl") - else - using Base: UTF_ERR_SHORT, checkstring - end +include("support.jl") +include("ascii.jl") +include("utf8.jl") +include("utf16.jl") +include("utf32.jl") +include("rep.jl") - if isdefined(Base, :RepString) - using Base: RepString - else - include("rep.jl") - end +if isdefined(Base, :RevString) + using Base: RevString +else + include("rev.jl") +end + +const AllLegacyStringTypes = Union{ASCIIString,UTF8String,UTF16String,UTF32String,RepString,RevString} + +codeunit(s::SubString{<:AllLegacyStringTypes}) = codeunit(s.string) +ncodeunits(s::SubString{<:AllLegacyStringTypes}) = isdefined(s, :ncodeunits) ? s.ncodeunits : s.endof + +if !isdefined(Base, :iterate) + iterate(s::Union{String,SubString,AllLegacyStringTypes}, i::Int) = next(s, i) +end - if isdefined(Base, :RevString) - using Base: RevString - else - include("rev.jl") - end end # module diff --git a/src/ascii.jl b/src/ascii.jl index e5542e7..0d7ac78 100644 --- a/src/ascii.jl +++ b/src/ascii.jl @@ -2,9 +2,20 @@ ## required core functionality ## -endof(s::ASCIIString) = length(s.data) +lastindex(s::ASCIIString) = length(s.data) getindex(s::ASCIIString, i::Int) = (x=s.data[i]; ifelse(x < 0x80, Char(x), '\ufffd')) +codeunit(s::ASCIIString) = UInt8 +ncodeunits(s::ASCIIString) = length(s.data) + +if isdefined(Base, :iterate) + import Base: iterate + function iterate(s::ASCIIString, i::Int = firstindex(s)) + i > ncodeunits(s) && return nothing + return next(s, i) + end +end + ## overload methods for efficiency ## bytestring(s::ASCIIString) = s @@ -29,7 +40,7 @@ function string(c::ASCIIString...) for s in c n += length(s.data) end - v = Vector{UInt8}(n) + v = Vector{UInt8}(undef, n) o = 1 for s in c ls = length(s.data) @@ -97,12 +108,15 @@ write(io::IO, s::ASCIIString) = write(io, s.data) ascii(x) = convert(ASCIIString, x) convert(::Type{ASCIIString}, s::ASCIIString) = s -convert(::Type{ASCIIString}, s::String) = ascii(Vector{UInt8}(s)) +convert(::Type{ASCIIString}, s::String) = ascii(codeunits(s)) convert(::Type{ASCIIString}, s::UTF8String) = ascii(s.data) convert(::Type{ASCIIString}, a::Vector{UInt8}) = begin isvalid(ASCIIString,a) || throw(ArgumentError("invalid ASCII sequence")) return ASCIIString(a) end +if isdefined(Base, :codeunits) + convert(::Type{ASCIIString}, a::Base.CodeUnits{UInt8,String}) = convert(ASCIIString, Vector{UInt8}(a)) +end ascii(p::Ptr{UInt8}) = ascii(p, p == C_NULL ? Csize_t(0) : ccall(:strlen, Csize_t, (Ptr{UInt8},), p)) diff --git a/src/directindex.jl b/src/directindex.jl index 2d98de2..a6430ba 100644 --- a/src/directindex.jl +++ b/src/directindex.jl @@ -6,10 +6,12 @@ next(s::DirectIndexString, i::Int) = (s[i],i+1) length(s::DirectIndexString) = endof(s) -isvalid(s::DirectIndexString, i::Integer) = (start(s) <= i <= endof(s)) +isvalid(s::DirectIndexString, i::Integer) = (firstindex(s) <= i <= lastindex(s)) -prevind(s::DirectIndexString, i::Integer) = Int(i)-1 -nextind(s::DirectIndexString, i::Integer) = Int(i)+1 +prevind(s::DirectIndexString, i::Int) = i-1 +nextind(s::DirectIndexString, i::Int) = i+1 +prevind(s::DirectIndexString, i::Integer) = prevind(s, i) +nextind(s::DirectIndexString, i::Integer) = nextind(s, i) function prevind(s::DirectIndexString, i::Integer, nchar::Integer) nchar > 0 || throw(ArgumentError("nchar must be greater than 0")) @@ -24,9 +26,9 @@ end ind2chr(s::DirectIndexString, i::Integer) = begin checkbounds(s,i); i end chr2ind(s::DirectIndexString, i::Integer) = begin checkbounds(s,i); i end -length(s::SubString{<:DirectIndexString}) = endof(s) +length(s::SubString{<:DirectIndexString}) = lastindex(s) -isvalid(s::SubString{<:DirectIndexString}, i::Integer) = (start(s) <= i <= endof(s)) +isvalid(s::SubString{<:DirectIndexString}, i::Integer) = (firstindex(s) <= i <= ncodeunits(s)) ind2chr(s::SubString{<:DirectIndexString}, i::Integer) = begin checkbounds(s,i); i end chr2ind(s::SubString{<:DirectIndexString}, i::Integer) = begin checkbounds(s,i); i end diff --git a/src/rep.jl b/src/rep.jl index 98fd52e..77b41d1 100644 --- a/src/rep.jl +++ b/src/rep.jl @@ -1,23 +1,32 @@ # This file includes code that was formerly a part of Julia. License is MIT: http://julialang.org/license -immutable RepString <: AbstractString +struct RepString <: AbstractString string::AbstractString repeat::Integer end -function endof(s::RepString) - e = endof(s.string) - (next(s.string,e)[2]-1) * (s.repeat-1) + e +function lastindex(s::RepString) + e = lastindex(s.string) + (iterate(s.string,e)[2]-1) * (s.repeat-1) + e end length(s::RepString) = length(s.string)*s.repeat sizeof(s::RepString) = sizeof(s.string)*s.repeat +function isvalid(s::RepString, i::Int) + 1 ≤ i ≤ ncodeunits(s) || return false + j = 1 + while j < i + _, j = iterate(s, j) + end + return j == i +end + function next(s::RepString, i::Int) if i < 1 throw(BoundsError(s, i)) end - e = endof(s.string) - sz = next(s.string,e)[2]-1 + e = lastindex(s.string) + sz = iterate(s.string,e)[2]-1 r, j = divrem(i-1, sz) j += 1 @@ -26,8 +35,18 @@ function next(s::RepString, i::Int) throw(BoundsError(s, i)) end - c, k = next(s.string, j) + c, k = iterate(s.string, j) c, k-j+i end +codeunit(s::RepString) = codeunit(s.string) +ncodeunits(s::RepString) = ncodeunits(s.string) * s.repeat + +if isdefined(Base, :iterate) + function iterate(s::RepString, i::Int = firstindex(s)) + i > ncodeunits(s) && return nothing + return next(s, i) + end +end + convert(::Type{RepString}, s::AbstractString) = RepString(s,1) diff --git a/src/rev.jl b/src/rev.jl index 0aeacea..66acda2 100644 --- a/src/rev.jl +++ b/src/rev.jl @@ -2,18 +2,37 @@ ## reversed strings without data movement ## -immutable RevString{T<:AbstractString} <: AbstractString +struct RevString{T<:AbstractString} <: AbstractString string::T end -endof(s::RevString) = endof(s.string) +lastindex(s::RevString) = lastindex(s.string) length(s::RevString) = length(s.string) sizeof(s::RevString) = sizeof(s.string) function next(s::RevString, i::Int) - n = endof(s); j = n-i+1 + n = lastindex(s); j = n-i+1 (s.string[j], n-prevind(s.string,j)+1) end +codeunit(s::RevString) = codeunit(s.string) +ncodeunits(s::RevString) = ncodeunits(s.string) + +if isdefined(Base, :iterate) + function iterate(s::RevString, i::Int = firstindex(s)) + i > lastindex(s) && return nothing + return next(s, i) + end +end + +function isvalid(s::RevString, i::Int) + 1 ≤ i ≤ ncodeunits(s) || return false + j = 1 + while j < i + _, j = iterate(s, j) + end + return j == i +end + reverse(s::RevString) = s.string -reverseind(s::RevString, i::Integer) = endof(s) - i + 1 +reverseind(s::RevString, i::Integer) = lastindex(s) - i + 1 diff --git a/src/support.jl b/src/support.jl index d809767..3761131 100644 --- a/src/support.jl +++ b/src/support.jl @@ -59,7 +59,7 @@ Input Arguments: Optional Input Arguments: * `pos` start position (defaults to 1) -* `endpos` end position (defaults to `endof(dat)`) +* `endpos` end position (defaults to `lastindex(dat)`) Keyword Arguments: @@ -79,7 +79,7 @@ function unsafe_checkstring end function unsafe_checkstring(dat::AbstractVector{UInt8}, pos = 1, - endpos = endof(dat) + endpos = length(dat) ; accept_long_null = true, accept_surrogates = true, @@ -183,12 +183,12 @@ function unsafe_checkstring(dat::AbstractVector{UInt8}, return totalchar, flags, num4byte, num3byte, num2byte end -@compat AbstractString1632{Tel<:Union{UInt16,UInt32}} = Union{AbstractVector{Tel}, AbstractString} +AbstractString1632{Tel<:Union{UInt16,UInt32}} = Union{AbstractVector{Tel}, AbstractString} function unsafe_checkstring( dat::AbstractString1632, pos = 1, - endpos = endof(dat) + endpos = lastindex(dat) ; accept_long_null = true, accept_surrogates = true, @@ -246,7 +246,7 @@ Input Arguments: Optional Input Arguments: * `startpos` start position (defaults to 1) -* `endpos` end position (defaults to `endof(dat)`) +* `endpos` end position (defaults to `lastindex(dat)`) Keyword Arguments: @@ -265,18 +265,19 @@ Throws: function checkstring end # No need to check bounds if using defaults -checkstring(dat; kwargs...) = unsafe_checkstring(dat, 1, endof(dat); kwargs...) +checkstring(dat::AbstractString; kwargs...) = unsafe_checkstring(dat, 1, lastindex(dat); kwargs...) +checkstring(dat; kwargs...) = unsafe_checkstring(dat, 1, length(dat); kwargs...) # Make sure that beginning and end positions are bounds checked -function checkstring(dat, startpos, endpos = endof(dat); kwargs...) +function checkstring(dat, startpos, endpos = lastindex(dat); kwargs...) checkbounds(dat,startpos) checkbounds(dat,endpos) endpos < startpos && throw(ArgumentError("End position ($endpos) is less than start position ($startpos)")) unsafe_checkstring(dat, startpos, endpos; kwargs...) end -isvalid{T<:Union{ASCIIString,UTF8String,UTF16String,UTF32String}}(str::T) = isvalid(T, str.data) -isvalid{T<:Union{ASCIIString,UTF8String,UTF16String,UTF32String}}(::Type{T}, str::T) = isvalid(T, str.data) +isvalid(str::T) where {T<:Union{ASCIIString,UTF8String,UTF16String,UTF32String}} = isvalid(T, str.data) +isvalid(::Type{T}, str::T) where {T<:Union{ASCIIString,UTF8String,UTF16String,UTF32String}} = isvalid(T, str.data) byte_string_classify(data::Vector{UInt8}) = ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), data, length(data)) @@ -291,7 +292,7 @@ isvalid(::Type{UTF8String}, s::Union{Vector{UInt8},ByteString}) = byte_string_cl bytestring() = ASCIIString("") function bytestring(s::AbstractString...) str = Base.print_to_string(s...) - data = Vector{UInt8}(str) + data = Vector{UInt8}(codeunits(str)) isvalid(ASCIIString, data) ? ASCIIString(data) : UTF8String(data) end bytestring(s::Vector{UInt8}) = bytestring(String(s)) diff --git a/src/unicodeerror.jl b/src/unicodeerror.jl new file mode 100644 index 0000000..89dcae0 --- /dev/null +++ b/src/unicodeerror.jl @@ -0,0 +1,11 @@ +## Error messages for Unicode / UTF support + +struct UnicodeError <: Exception + errmsg::AbstractString ##< A UTF_ERR_ message + errpos::Int32 ##< Position of invalid character + errchr::UInt32 ##< Invalid character +end + +show(io::IO, exc::UnicodeError) = print(io, replace(replace(string("UnicodeError: ",exc.errmsg), + "<<1>>" => string(exc.errpos)), + "<<2>>" => string(exc.errchr, base=16))) diff --git a/src/utf16.jl b/src/utf16.jl index c072721..3230e11 100644 --- a/src/utf16.jl +++ b/src/utf16.jl @@ -1,9 +1,9 @@ # This file includes code that was formerly a part of Julia. License is MIT: http://julialang.org/license # Quickly copy and set trailing \0 -@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, UInt32}}( - ::Type{S}, ::Type{T}, len, dat, flag::Bool=false) - S(setindex!(copy!(Vector{T}(len+1), 1, dat, 1, flag ? len : len+1), 0, len+1)) +@inline function fast_utf_copy(::Type{S}, ::Type{T}, len, dat, flag::Bool=false) where + {S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, UInt32}} + S(setindex!(copyto!(Vector{T}(undef, len+1), 1, dat, 1, flag ? len : len+1), 0, len+1)) end # Get rest of character ch from 3-byte UTF-8 sequence in dat @@ -41,13 +41,16 @@ function length(s::UTF16String) cnum end -function endof(s::UTF16String) +function lastindex(s::UTF16String) d = s.data i = length(d) - 1 i == 0 && return i return is_surrogate_codeunit(d[i]) ? i-1 : i end +codeunit(s::UTF16String) = UInt16 +ncodeunits(s::UTF16String) = length(s.data) - 1 + get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail) function next(s::UTF16String, i::Int) @@ -61,6 +64,13 @@ function next(s::UTF16String, i::Int) Char(get_supplementary(ch, ct)), i+2 end +if isdefined(Base, :iterate) + function iterate(s::UTF16String, i::Int = firstindex(s)) + i > ncodeunits(s) && return nothing + return next(s, i) + end +end + function reverseind(s::UTF16String, i::Integer) j = length(s.data) - i return is_surrogate_trail(s.data[j]) ? j-1 : j @@ -86,6 +96,17 @@ end sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) +function isvalid(s::UTF16String, i::Int) + (i < 1 || i > ncodeunits(s)) && return false + if is_surrogate_lead(s.data[i]) && is_surrogate_trail(s.data[i+1]) + return true + elseif is_surrogate_codeunit(s.data[i]) + return false + else + return true + end +end + function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) i = 1 n = length(data) # this may include NULL termination; that's okay @@ -103,7 +124,7 @@ end function convert(::Type{UTF16String}, str::AbstractString) len, flags, num4byte = unsafe_checkstring(str) - buf = Vector{UInt16}(len+num4byte+1) + buf = Vector{UInt16}(undef, len+num4byte+1) out = 0 @inbounds for ch in str c = UInt32(ch) @@ -126,10 +147,10 @@ function convert(::Type{UTF16String}, str::UTF8String) # Check that is correct UTF-8 encoding and get number of words needed len, flags, num4byte = unsafe_checkstring(dat) len += num4byte - buf = Vector{UInt16}(len+1) + buf = Vector{UInt16}(undef, len+1) @inbounds buf[len+1] = 0 # Optimize case where no characters > 0x7f - flags == 0 && @inbounds return UTF16String(copy!(buf, dat)) + flags == 0 && @inbounds return UTF16String(copyto!(buf, dat)) out = 0 pos = 0 @inbounds while out < len @@ -163,7 +184,7 @@ function convert(::Type{UTF8String}, str::UTF16String) len <= 1 && return empty_utf8 # get number of bytes to allocate len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1) - flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + flags == 0 && @inbounds return UTF8String(copyto!(Vector{UInt8}(undef, len), 1, dat, 1, len)) return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) end @@ -180,7 +201,7 @@ Returns: * `UTF16String` """ function encode_to_utf16(dat, len) - buf = Vector{UInt16}(len) + buf = Vector{UInt16}(undef, len) @inbounds buf[len] = 0 # NULL termination out = 0 pos = 0 @@ -206,7 +227,7 @@ convert(::Type{Array{UInt16}}, str::UTF16String) = str.data convert(::Type{UTF16String}, str::UTF16String) = str -unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) = +unsafe_convert(::Type{Ptr{T}}, s::UTF16String) where {T<:Union{Int16,UInt16}} = convert(Ptr{T}, pointer(s)) convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) = @@ -237,7 +258,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) swap = false end len = nb ÷ 2 - offset - d = Vector{UInt16}(len + 1) + d = Vector{UInt16}(undef, len + 1) if swap @inbounds for i in 1:len ib = i + offset @@ -246,7 +267,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) d[i] = (UInt16(bhi) << 8) | blo end else - unsafe_copy!(Ptr{UInt8}(pointer(d)), pointer(bytes, offset * 2 + 1), len * 2) + unsafe_copyto!(Ptr{UInt8}(pointer(d)), pointer(bytes, offset * 2 + 1), len * 2) end d[end] = 0 # NULL terminate !isvalid(UTF16String, d) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0)) diff --git a/src/utf32.jl b/src/utf32.jl index f97b8dc..f44f280 100644 --- a/src/utf32.jl +++ b/src/utf32.jl @@ -4,9 +4,19 @@ UTF32String(data::Vector{Char}) = UTF32String(reinterpret(UInt32, data)) # UTF-32 basic functions next(s::UTF32String, i::Int) = (Char(s.data[i]), i+1) -endof(s::UTF32String) = length(s.data) - 1 +lastindex(s::UTF32String) = length(s.data) - 1 length(s::UTF32String) = length(s.data) - 1 +codeunit(s::UTF32String) = UInt32 +ncodeunits(s::UTF32String) = length(s.data) + +if isdefined(Base, :iterate) + function iterate(s::UTF32String, i::Int = firstindex(s)) + i > length(s) && return nothing + return next(s, i) + end +end + reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) sizeof(s::UTF32String) = sizeof(s.data) - sizeof(UInt32) @@ -18,7 +28,7 @@ convert(::Type{UTF32String}, s::UTF32String) = s function convert(::Type{UTF32String}, str::AbstractString) len, flags = unsafe_checkstring(str) - buf = Vector{UInt32}(len+1) + buf = Vector{UInt32}(undef, len+1) out = 0 @inbounds for ch in str ; buf[out += 1] = ch ; end @inbounds buf[out + 1] = 0 # NULL termination @@ -32,7 +42,7 @@ function convert(::Type{UTF8String}, str::UTF32String) len <= 1 && return empty_utf8 # get number of bytes to allocate len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1) - flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + flags == 0 && @inbounds return UTF8String(copyto!(Vector{UInt8}(undef, len), 1, dat, 1, len)) return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) end @@ -45,7 +55,7 @@ function convert(::Type{UTF32String}, str::UTF8String) # Optimize case where no characters > 0x7f flags == 0 && @inbounds return fast_utf_copy(UTF32String, UInt32, len, dat, true) # has multi-byte UTF-8 sequences - buf = Vector{UInt32}(len+1) + buf = Vector{UInt32}(undef, len+1) @inbounds buf[len+1] = 0 # NULL termination local ch::UInt32, surr::UInt32 out = 0 @@ -89,9 +99,9 @@ function convert(::Type{UTF32String}, str::UTF16String) # get number of words to create len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>1) # No surrogate pairs, do optimized copy - (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{UInt32}(len), dat)) + (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copyto!(Vector{UInt32}(undef, len), dat)) local ch::UInt32 - buf = Vector{UInt32}(len) + buf = Vector{UInt32}(undef, len) out = 0 pos = 0 @inbounds while out < len @@ -111,7 +121,7 @@ function convert(::Type{UTF16String}, str::UTF32String) # get number of words to allocate len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>2) # optimized path, no surrogates - num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat)) + num4byte == 0 && @inbounds return UTF16String(copyto!(Vector{UInt16}(undef, len), dat)) return encode_to_utf16(dat, len + num4byte) end @@ -130,12 +140,12 @@ convert(::Type{UTF32String}, data::AbstractVector{Int32}) = convert(::Type{UTF32String}, data::AbstractVector{Char}) = convert(UTF32String, map(UInt32, data)) -convert{T<:AbstractString, S<:Union{UInt32,Char,Int32}}(::Type{T}, v::AbstractVector{S}) = +convert(::Type{T}, v::AbstractVector{S}) where {T<:AbstractString, S<:Union{UInt32,Char,Int32}} = convert(T, utf32(v)) # specialize for performance reasons: -function convert{T<:ByteString, S<:Union{UInt32,Char,Int32}}(::Type{T}, data::AbstractVector{S}) - s = IOBuffer(Vector{UInt8}(length(data)), true, true) +function convert(::Type{T}, data::AbstractVector{S}) where {T<:ByteString, S<:Union{UInt32,Char,Int32}} + s = IOBuffer(Vector{UInt8}(undef, length(data)), read=true, write=true) truncate(s,0) for x in data print(s, Char(x)) @@ -146,7 +156,7 @@ end convert(::Type{Vector{UInt32}}, str::UTF32String) = str.data convert(::Type{Array{UInt32}}, str::UTF32String) = str.data -unsafe_convert{T<:Union{UInt32,Int32,Char}}(::Type{Ptr{T}}, s::UTF32String) = +unsafe_convert(::Type{Ptr{T}}, s::UTF32String) where {T<:Union{UInt32,Int32,Char}} = convert(Ptr{T}, pointer(s)) function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8}) @@ -168,7 +178,7 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8}) swap = false end len = nb ÷ 4 - offset - d = Vector{UInt32}(len + 1) + d = Vector{UInt32}(undef, len + 1) if swap @inbounds for i in 1:len ib = i + offset @@ -179,7 +189,7 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8}) d[i] = (b1 << 24) | (b2 << 16) | (b3 << 8) | b4 end else - unsafe_copy!(Ptr{UInt8}(pointer(d)), pointer(bytes, offset * 4 + 1), len * 4) + unsafe_copyto!(Ptr{UInt8}(pointer(d)), pointer(bytes, offset * 4 + 1), len * 4) end d[end] = 0 # NULL terminate UTF32String(d) @@ -221,7 +231,7 @@ end # Definitions for C compatible strings, that don't allow embedded # '\0', and which are terminated by a '\0' containsnul(s::ByteString) = containsnul(unsafe_convert(Ptr{Cchar}, s), sizeof(s)) -containsnul(s::Union{UTF16String,UTF32String}) = findfirst(s.data, 0) != length(s.data) +containsnul(s::Union{UTF16String,UTF32String}) = findfirst(isequal(0), s.data) != length(s.data) if sizeof(Cwchar_t) == 2 const WString = UTF16String @@ -247,10 +257,10 @@ pointer(x::ByteString, i::Integer) = pointer(x.data)+(i-1) pointer(x::Union{UTF16String,UTF32String}, i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data)) # pointer conversions of SubString of ASCII/UTF8/UTF16/UTF32: -pointer{T<:ByteString}(x::SubString{T}) = pointer(x.string.data) + x.offset -pointer{T<:ByteString}(x::SubString{T}, i::Integer) = pointer(x.string.data) + x.offset + (i-1) -pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.string.data)) -pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.string.data)) +pointer(x::SubString{T}) where {T<:ByteString} = pointer(x.string.data) + x.offset +pointer(x::SubString{T}, i::Integer) where {T<:ByteString} = pointer(x.string.data) + x.offset + (i-1) +pointer(x::SubString{T}) where {T<:Union{UTF16String,UTF32String}} = pointer(x.string.data) + x.offset*sizeof(eltype(x.string.data)) +pointer(x::SubString{T}, i::Integer) where {T<:Union{UTF16String,UTF32String}} = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.string.data)) """ utf32(s) diff --git a/src/utf8.jl b/src/utf8.jl index 79818df..1916c8d 100644 --- a/src/utf8.jl +++ b/src/utf8.jl @@ -21,7 +21,7 @@ const utf8_trailing = [ ## required core functionality ## -function endof(s::UTF8String) +function lastindex(s::UTF8String) d = s.data i = length(d) i == 0 && return i @@ -31,6 +31,9 @@ function endof(s::UTF8String) i end +codeunit(s::UTF8String) = UInt8 +ncodeunits(s::UTF8String) = length(s.data) + function length(s::UTF8String) d = s.data cnum = 0 @@ -71,6 +74,13 @@ function next(s::UTF8String, i::Int) Char(c), i end +if isdefined(Base, :iterate) + function iterate(s::UTF8String, i::Int = firstindex(s)) + i > ncodeunits(s) && return nothing + return next(s, i) + end +end + function first_utf8_byte(ch::Char) c = UInt32(ch) c < 0x80 ? c%UInt8 : @@ -97,7 +107,7 @@ sizeof(s::UTF8String) = sizeof(s.data) lastidx(s::UTF8String) = length(s.data) isvalid(s::UTF8String, i::Integer) = - (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i]) + (1 <= i <= lastindex(s.data)) && !is_valid_continuation(s.data[i]) const empty_utf8 = UTF8String(UInt8[]) @@ -150,7 +160,7 @@ function string(a::ByteString...) return a[1]::UTF8String end # ^^ at least one must be UTF-8 or the ASCII-only method would get called - data = Vector{UInt8}(0) + data = Vector{UInt8}(undef, 0) for d in a append!(data,d.data) end @@ -161,7 +171,7 @@ function reverse(s::UTF8String) dat = s.data n = length(dat) n <= 1 && return s - buf = Vector{UInt8}(n) + buf = Vector{UInt8}(undef, n) out = n pos = 1 @inbounds while out > 0 @@ -199,7 +209,7 @@ utf8(x) = convert(UTF8String, x) convert(::Type{UTF8String}, s::UTF8String) = s convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data) convert(::Type{SubString{UTF8String}}, s::SubString{ASCIIString}) = - SubString(utf8(s.string), s.offset+1, s.endof+s.offset) + SubString(utf8(s.string), s.offset+1, ncodeunits(s)+s.offset) function convert(::Type{UTF8String}, dat::Vector{UInt8}) # handle zero length string quickly @@ -208,11 +218,11 @@ function convert(::Type{UTF8String}, dat::Vector{UInt8}) len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat) if (flags & (UTF_LONG | UTF_SURROGATE)) == 0 len = sizeof(dat) - @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + @inbounds return UTF8String(copyto!(Vector{UInt8}(undef, len), 1, dat, 1, len)) end # Copy, but eliminate over-long encodings and surrogate pairs len += num2byte + num3byte*2 + num4byte*3 - buf = Vector{UInt8}(len) + buf = Vector{UInt8}(undef, len) out = 0 pos = 0 @inbounds while out < len @@ -277,6 +287,10 @@ function convert(::Type{UTF8String}, a::Vector{UInt8}, invalids_as::AbstractStri end convert(::Type{UTF8String}, s::AbstractString) = utf8(bytestring(s)) +if isdefined(Base, :CodeUnits) + convert(::Type{UTF8String}, s::Base.CodeUnits{UInt8,String}) = convert(UTF8String, Vector{UInt8}(s)) +end + """ Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String` @@ -289,8 +303,8 @@ Returns: * `UTF8String` """ -function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len) - buf = Vector{UInt8}(len) +function encode_to_utf8(::Type{T}, dat, len) where {T<:Union{UInt16, UInt32}} + buf = Vector{UInt8}(undef, len) out = 0 pos = 0 @inbounds while out < len diff --git a/test/runtests.jl b/test/runtests.jl index 640c06b..1755cdf 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,12 +1,14 @@ # This file includes code that was formerly a part of Julia. License is MIT: http://julialang.org/license -using Base.Test using Compat +using Compat.Test using Compat: view, String -importall LegacyStrings +using LegacyStrings +using LegacyStrings: ASCIIString, UTF8String # override Compat's version import LegacyStrings: ascii, checkstring, + UnicodeError, UTF_ERR_SHORT # types @@ -23,11 +25,7 @@ badstring32 = UInt32['a'] # Unicode errors let io = IOBuffer() show(io, UnicodeError(UTF_ERR_SHORT, 1, 10)) - if VERSION >= v"0.5.0-dev+1956" - check = "UnicodeError: invalid UTF-8 sequence starting at index 1 (0xa missing one or more continuation bytes)" - else - check = "UnicodeError: invalid UTF-8 sequence starting at index 1 (0xa) missing one or more continuation bytes)" - end + check = "UnicodeError: invalid UTF-8 sequence starting at index 1 (0xa missing one or more continuation bytes)" @test String(take!(io)) == check end @@ -215,7 +213,7 @@ let str = UTF8String(b"this is a test\xed\x80") @test_throws BoundsError getindex(str, 17:18) @test_throws BoundsError getindex(str, 2:17) @test_throws UnicodeError getindex(str, 16:17) - @test string(Char(0x110000)) == "\ufffd" + # @test string(Char(0x110000)) == "\ufffd" sa = SubString{ASCIIString}(ascii("This is a silly test"), 1, 14) s8 = convert(SubString{UTF8String}, sa) @test typeof(s8) == SubString{UTF8String} @@ -240,37 +238,39 @@ end ## UTF-16 tests -u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" -u16 = utf16(u8) -@test sizeof(u16) == 18 -@test length(u16.data) == 10 && u16.data[end] == 0 -@test length(u16) == 5 -@test utf8(u16) == u8 -@test collect(u8) == collect(u16) -@test u8 == utf16(u16.data[1:end-1]) == utf16(copy!(Vector{UInt8}(18), 1, reinterpret(UInt8, u16.data), 1, 18)) -@test u8 == utf16(pointer(u16)) == utf16(convert(Ptr{Int16}, pointer(u16))) -@test_throws UnicodeError utf16(utf32(Char(0x120000))) -@test_throws UnicodeError utf16(UInt8[1,2,3]) - -@test convert(UTF16String, "test") == "test" -@test convert(UTF16String, u16) == u16 -@test convert(UTF16String, UInt16[[0x65, 0x66] [0x67, 0x68]]) == "efgh" -@test convert(UTF16String, Int16[[0x65, 0x66] [0x67, 0x68]]) == "efgh" -@test map(lowercase, utf16("TEST\U1f596")) == "test\U1f596" -@test typeof(Base.unsafe_convert(Ptr{UInt16}, utf16("test"))) == Ptr{UInt16} +let u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" + u16 = utf16(u8) + @test sizeof(u16) == 18 + @test length(u16.data) == 10 && u16.data[end] == 0 + @test length(u16) == 5 + @test utf8(u16) == u8 + @test collect(u8) == collect(u16) + @test u8 == utf16(u16.data[1:end-1]) == utf16(copyto!(Vector{UInt8}(undef, 18), 1, reinterpret(UInt8, u16.data), 1, 18)) + @test u8 == utf16(pointer(u16)) == utf16(convert(Ptr{Int16}, pointer(u16))) + @test_throws UnicodeError utf16(utf32(Char(0x120000))) + @test_throws UnicodeError utf16(UInt8[1,2,3]) + + @test convert(UTF16String, "test") == "test" + @test convert(UTF16String, u16) == u16 + @test convert(UTF16String, UInt16[[0x65, 0x66] [0x67, 0x68]]) == "efgh" + @test convert(UTF16String, Int16[[0x65, 0x66] [0x67, 0x68]]) == "efgh" + @test map(lowercase, utf16("TEST\U1f596")) == "test\U1f596" + @test typeof(Base.unsafe_convert(Ptr{UInt16}, utf16("test"))) == Ptr{UInt16} +end ## UTF-32 tests -u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" -u32 = utf32(u8) -@test sizeof(u32) == 20 -@test length(u32.data) == 6 && u32.data[end] == 0 -@test length(u32) == 5 -@test utf8(u32) == u8 -@test collect(u8) == collect(u32) -@test u8 == utf32(u32.data[1:end-1]) == utf32(copy!(Vector{UInt8}(20), 1, reinterpret(UInt8, u32.data), 1, 20)) -@test u8 == utf32(pointer(u32)) == utf32(convert(Ptr{Int32}, pointer(u32))) -@test_throws UnicodeError utf32(UInt8[1,2,3]) +let u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" + u32 = utf32(u8) + @test sizeof(u32) == 20 + @test length(u32.data) == 6 && u32.data[end] == 0 + @test length(u32) == 5 + @test utf8(u32) == u8 + @test collect(u8) == collect(u32) + @test u8 == utf32(u32.data[1:end-1]) == utf32(copyto!(Vector{UInt8}(undef, 20), 1, reinterpret(UInt8, u32.data), 1, 20)) + @test u8 == utf32(pointer(u32)) == utf32(convert(Ptr{Int32}, pointer(u32))) + @test_throws UnicodeError utf32(UInt8[1,2,3]) +end # issue #11551 (#11004,#10959) function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String) @@ -423,10 +423,11 @@ for T in (UTF8String, UTF16String, UTF32String) end # Wstring -u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" -w = wstring(u8) -@test length(w) == 5 && utf8(w) == u8 && collect(u8) == collect(w) -@test u8 == WString(w.data) +let u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" + w = wstring(u8) + @test length(w) == 5 && utf8(w) == u8 && collect(u8) == collect(w) + @test u8 == WString(w.data) +end # 12268 for (fun, S, T) in ((utf16, UInt16, UTF16String), (utf32, UInt32, UTF32String)) @@ -448,8 +449,8 @@ for (fun, S, T) in ((utf16, UInt16, UTF16String), (utf32, UInt32, UTF32String)) @test Base.containsnul(x) @test Base.containsnul(tst) # map - @test_throws UnicodeError map(islower, x) - @test_throws ArgumentError map(islower, tst) + @test_throws UnicodeError map(islowercase, x) + @test_throws ArgumentError map(islowercase, tst) # SubArray conversion subarr = view(cmp, 1:6) @test convert(T, subarr) == str[4:end] @@ -537,18 +538,22 @@ let srep = RepString("Σβ",2) s="Σβ" - ss=SubString(s,1,endof(s)) + ss=SubString(s,1,lastindex(s)) @test ss^2 == "ΣβΣβ" @test RepString(ss,2) == "ΣβΣβ" - @test endof(srep) == 7 + @test lastindex(srep) == 7 @test next(srep, 3) == ('β',5) @test next(srep, 7) == ('β',9) @test srep[7] == 'β' - @test_throws BoundsError srep[8] + @static if VERSION < v"0.7.0-DEV.2924" + @test_throws BoundsError srep[8] + else + @test_throws StringIndexError srep[8] + end end @@ -572,7 +577,7 @@ let rs = RevString(s) r = reverse(s) @test r == rs - ri = search(r, c) + ri = something(findfirst(isequal(c), r), 0) @test c == s[reverseind(s, ri)] == r[ri] end end