From b9504db896bb7ee4af988d106004cc5e704d7d44 Mon Sep 17 00:00:00 2001 From: Jeff Bezanson Date: Thu, 21 Dec 2017 19:54:09 -0500 Subject: [PATCH] deprecate Vector<->String conversion in favor of something safer fixes #24388 --- base/c.jl | 14 +++++++------- base/deprecated.jl | 15 +++++++++++++-- base/exports.jl | 1 + base/io.jl | 2 +- base/iobuffer.jl | 2 +- base/loading.jl | 4 ++-- base/repl/LineEdit.jl | 2 +- base/replutil.jl | 2 +- base/strings/basic.jl | 4 ++-- base/strings/io.jl | 9 ++++++--- base/strings/search.jl | 4 ++-- base/strings/string.jl | 30 +++++++++++++++++++++++++++++- test/char.jl | 2 +- test/core.jl | 2 +- test/misc.jl | 2 +- test/random.jl | 2 +- test/read.jl | 14 +++++++------- test/strings/basic.jl | 4 ++-- 18 files changed, 79 insertions(+), 36 deletions(-) diff --git a/base/c.jl b/base/c.jl index 61163a93fcbc4..3ed671b70c744 100644 --- a/base/c.jl +++ b/base/c.jl @@ -127,7 +127,7 @@ cconvert(::Type{Cstring}, s::AbstractString) = cconvert(Cstring, String(s)::String) function cconvert(::Type{Cwstring}, s::AbstractString) - v = transcode(Cwchar_t, Vector{UInt8}(String(s))) + v = transcode(Cwchar_t, String(s)) !isempty(v) && v[end] == 0 || push!(v, 0) return v end @@ -140,7 +140,7 @@ containsnul(p::Ptr, len) = containsnul(s::String) = containsnul(unsafe_convert(Ptr{Cchar}, s), sizeof(s)) containsnul(s::AbstractString) = '\0' in s -function unsafe_convert(::Type{Cstring}, s::Union{String,Vector{UInt8}}) +function unsafe_convert(::Type{Cstring}, s::Union{String,AbstractVector{UInt8}}) p = unsafe_convert(Ptr{Cchar}, s) containsnul(p, sizeof(s)) && throw(ArgumentError("embedded NULs are not allowed in C strings: $(repr(s))")) @@ -174,7 +174,7 @@ same argument. This is only available on Windows. """ function cwstring(s::AbstractString) - bytes = Vector{UInt8}(String(s)) + bytes = StringBytes(String(s)) 0 in bytes && throw(ArgumentError("embedded NULs are not allowed in C strings: $(repr(s))")) return push!(transcode(UInt16, bytes), 0) end @@ -202,19 +202,19 @@ Only conversion to/from UTF-8 is currently supported. """ function transcode end -transcode(::Type{T}, src::Vector{T}) where {T<:Union{UInt8,UInt16,UInt32,Int32}} = src +transcode(::Type{T}, src::AbstractVector{T}) where {T<:Union{UInt8,UInt16,UInt32,Int32}} = src transcode(::Type{T}, src::String) where {T<:Union{Int32,UInt32}} = T[T(c) for c in src] -transcode(::Type{T}, src::Vector{UInt8}) where {T<:Union{Int32,UInt32}} = transcode(T, String(src)) +transcode(::Type{T}, src::Union{Vector{UInt8},StringBytes}) where {T<:Union{Int32,UInt32}} = transcode(T, String(src)) function transcode(::Type{UInt8}, src::Vector{<:Union{Int32,UInt32}}) buf = IOBuffer() for c in src; print(buf, Char(c)); end take!(buf) end transcode(::Type{String}, src::String) = src -transcode(T, src::String) = transcode(T, Vector{UInt8}(src)) +transcode(T, src::String) = transcode(T, StringBytes(src)) transcode(::Type{String}, src) = String(transcode(UInt8, src)) -function transcode(::Type{UInt16}, src::Vector{UInt8}) +function transcode(::Type{UInt16}, src::Union{Vector{UInt8},StringBytes}) dst = UInt16[] i, n = 1, length(src) n > 0 || return dst diff --git a/base/deprecated.jl b/base/deprecated.jl index 950e27afd2f26..77a6a7d6d7f8e 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -1555,8 +1555,19 @@ export hex2num @deprecate ctranspose adjoint @deprecate ctranspose! adjoint! -@deprecate convert(::Type{Vector{UInt8}}, s::AbstractString) Vector{UInt8}(s) -@deprecate convert(::Type{Array{UInt8}}, s::AbstractString) Vector{UInt8}(s) +function convert(::Union{Type{Vector{UInt8}}, Type{Array{UInt8}}}, s::AbstractString) + depwarn("Strings can no longer be `convert`ed to byte arrays. Use `unsafe_wrap` or `StringBytes` instead.", :Type) + unsafe_wrap(Vector{UInt8}, String(s)) +end +function (::Type{Vector{UInt8}})(s::String) + depwarn("Vector{UInt8}(s::String) will copy data in the future. To avoid copying, use `unsafe_wrap` or `StringBytes` instead.", :Type) + unsafe_wrap(Vector{UInt8}, s) +end +function (::Type{Array{UInt8}})(s::String) + depwarn("Array{UInt8}(s::String) will copy data in the future. To avoid copying, use `unsafe_wrap` or `StringBytes` instead.", :Type) + unsafe_wrap(Vector{UInt8}, s) +end + @deprecate convert(::Type{Vector{Char}}, s::AbstractString) Vector{Char}(s) @deprecate convert(::Type{Symbol}, s::AbstractString) Symbol(s) @deprecate convert(::Type{String}, s::Symbol) String(s) diff --git a/base/exports.jl b/base/exports.jl index 2da4595fc35ca..dabdb067266c4 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -109,6 +109,7 @@ export StridedMatrix, StridedVecOrMat, StridedVector, + StringBytes, SubArray, SubString, Symmetric, diff --git a/base/io.jl b/base/io.jl index ae71210cb1336..248e37eba766e 100644 --- a/base/io.jl +++ b/base/io.jl @@ -706,7 +706,7 @@ function readuntil(io::IO, target::AbstractString) # decide how we can index target if target isa String # convert String to a utf8-byte-iterator - target = Vector{UInt8}(target) + target = StringBytes(target) #elseif applicable(codeunit, target) # TODO: a more general version of above optimization # would be to permit accessing any string via codeunit diff --git a/base/iobuffer.jl b/base/iobuffer.jl index 00635942ff561..b94b1f045e895 100644 --- a/base/iobuffer.jl +++ b/base/iobuffer.jl @@ -27,7 +27,7 @@ function GenericIOBuffer(data::T, readable::Bool, writable::Bool, seekable::Bool end # allocate Vector{UInt8}s for IOBuffer storage that can efficiently become Strings -StringVector(n::Integer) = Vector{UInt8}(_string_n(n)) +StringVector(n::Integer) = unsafe_wrap(Vector{UInt8}, _string_n(n)) # IOBuffers behave like Files. They are typically readable and writable. They are seekable. (They can be appendable). diff --git a/base/loading.jl b/base/loading.jl index cf1472437d9d2..be54febbe3ad3 100644 --- a/base/loading.jl +++ b/base/loading.jl @@ -64,12 +64,12 @@ elseif Sys.isapple() break end # Hack to compensate for inability to create a string from a subarray with no allocations. - Vector{UInt8}(path_basename) == casepreserved_basename && return true + StringBytes(path_basename) == casepreserved_basename && return true # If there is no match, it's possible that the file does exist but HFS+ # performed unicode normalization. See https://developer.apple.com/library/mac/qa/qa1235/_index.html. isascii(path_basename) && return false - Vector{UInt8}(Unicode.normalize(path_basename, :NFD)) == casepreserved_basename + StringBytes(Unicode.normalize(path_basename, :NFD)) == casepreserved_basename end else # Generic fallback that performs a slow directory listing. diff --git a/base/repl/LineEdit.jl b/base/repl/LineEdit.jl index fa3688c6f59f0..0bad5cc996ef8 100644 --- a/base/repl/LineEdit.jl +++ b/base/repl/LineEdit.jl @@ -657,7 +657,7 @@ function edit_splice!(s, r::Region=region(s), ins::AbstractString = ""; rigid_ma elseif buf.mark >= B buf.mark += sizeof(ins) - B + A end - ret = splice!(buf.data, A+1:B, Vector{UInt8}(ins)) # position(), etc, are 0-indexed + ret = splice!(buf.data, A+1:B, Base.StringBytes(String(ins))) # position(), etc, are 0-indexed buf.size = buf.size + sizeof(ins) - B + A adjust_pos && seek(buf, position(buf) + sizeof(ins)) String(ret) diff --git a/base/replutil.jl b/base/replutil.jl index 8ad88cef24d0e..cfac1979c8f23 100644 --- a/base/replutil.jl +++ b/base/replutil.jl @@ -283,7 +283,7 @@ function showerror(io::IO, ex::ErrorException) print(io, ex.msg) if ex.msg == "type String has no field data" println(io) - print(io, "Use `Vector{UInt8}(str)` instead.") + print(io, "Use `StringBytes(str)` instead.") end end showerror(io::IO, ex::KeyError) = print(io, "KeyError: key $(repr(ex.key)) not found") diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 703e455a6306b..5fba0015a994f 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -185,8 +185,8 @@ checkbounds(s::AbstractString, I::Union{Integer,AbstractArray}) = string() = "" string(s::AbstractString) = s -(::Type{Vector{UInt8}})(s::AbstractString) = Vector{UInt8}(String(s)) -(::Type{Array{UInt8}})(s::AbstractString) = Vector{UInt8}(s) +(::Type{Vector{UInt8}})(s::AbstractString) = unsafe_wrap(Vector{UInt8}, String(s)) +(::Type{Array{UInt8}})(s::AbstractString) = unsafe_wrap(Vector{UInt8}, String(s)) (::Type{Vector{Char}})(s::AbstractString) = collect(s) Symbol(s::AbstractString) = Symbol(String(s)) diff --git a/base/strings/io.jl b/base/strings/io.jl index 538d7a55242e6..b41a418557cd7 100644 --- a/base/strings/io.jl +++ b/base/strings/io.jl @@ -188,8 +188,8 @@ julia> String(take!(io)) "Haho" ``` """ -IOBuffer(str::String) = IOBuffer(Vector{UInt8}(str)) -IOBuffer(s::SubString{String}) = IOBuffer(view(Vector{UInt8}(s.string), s.offset + 1 : s.offset + sizeof(s))) +IOBuffer(str::String) = IOBuffer(StringBytes(str)) +IOBuffer(s::SubString{String}) = IOBuffer(view(StringBytes(s.string), s.offset + 1 : s.offset + sizeof(s))) # join is implemented using IO @@ -373,7 +373,10 @@ function unescape_string(io, s::AbstractString) end end -macro b_str(s); :(Vector{UInt8}($(unescape_string(s)))); end +macro b_str(s) + v = unsafe_wrap(Vector{UInt8}, unescape_string(s)) + QuoteNode(v) +end """ @raw_str -> String diff --git a/base/strings/search.jl b/base/strings/search.jl index 9f895da99e066..f920b77520252 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -31,7 +31,7 @@ function search(a::ByteArray, b::Char, i::Integer = 1) if isascii(b) search(a,UInt8(b),i) else - search(a,Vector{UInt8}(string(b)),i).start + search(a,unsafe_wrap(Vector{UInt8},string(b)),i).start end end @@ -62,7 +62,7 @@ function rsearch(a::ByteArray, b::Char, i::Integer = length(a)) if isascii(b) rsearch(a,UInt8(b),i) else - rsearch(a,Vector{UInt8}(string(b)),i).start + rsearch(a,unsafe_wrap(Vector{UInt8},string(b)),i).start end end diff --git a/base/strings/string.jl b/base/strings/string.jl index 888cda75c6046..85e5c17468a63 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -60,7 +60,35 @@ This representation is often appropriate for passing strings to C. String(s::AbstractString) = print_to_string(s) String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s)) -(::Type{Vector{UInt8}})(s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s) +unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s) + +""" + StringBytes(s::String) + +Wrap a `String` (without copying) in an immutable vector-like object that accesses the bytes +of the string's representation. +""" +struct StringBytes <: AbstractVector{UInt8} + s::String +end + +sizeof(s::StringBytes) = sizeof(s.s) +length(s::StringBytes) = sizeof(s.s) +size(s::StringBytes) = (length(s),) +@propagate_inbounds getindex(s::StringBytes, i::Int) = codeunit(s.s, i) +IndexStyle(::Type{StringBytes}) = IndexLinear() +start(s::StringBytes) = 1 +next(s::StringBytes, i) = (@_propagate_inbounds_meta; (s[i], i+1)) +done(s::StringBytes, i) = (@_inline_meta; i == length(s)+1) + +(::Type{Vector{UInt8}})(s::StringBytes) = copyto!(Vector{UInt8}(uninitialized, length(s)), s) + +unsafe_convert(::Type{Ptr{UInt8}}, s::StringBytes) = unsafe_convert(Ptr{UInt8}, s.s) +unsafe_convert(::Type{Ptr{Int8}}, s::StringBytes) = unsafe_convert(Ptr{Int8}, s.s) + +write(io::IO, s::StringBytes) = write(io, s.s) + +String(s::StringBytes) = s.s ## low-level functions ## diff --git a/test/char.jl b/test/char.jl index 9163122c95659..1684d6339917e 100644 --- a/test/char.jl +++ b/test/char.jl @@ -201,7 +201,7 @@ end @testset "read incomplete character at end of stream or file" begin local file = tempname() local iob = IOBuffer([0xf0]) - local bytes(c::Char) = Vector{UInt8}(string(c)) + local bytes(c::Char) = Base.StringBytes(string(c)) @test bytes(read(iob, Char)) == [0xf0] @test eof(iob) try diff --git a/test/core.jl b/test/core.jl index 407d6c137257f..68b00969a094f 100644 --- a/test/core.jl +++ b/test/core.jl @@ -4114,7 +4114,7 @@ b = "aaa" c = [0x2, 0x1, 0x3] @test check_nul(a) -@test check_nul(Vector{UInt8}(b)) +@test check_nul(unsafe_wrap(Vector{UInt8},b)) @test check_nul(c) d = [0x2, 0x1, 0x3] @test check_nul(d) diff --git a/test/misc.jl b/test/misc.jl index 12099f2aac4bf..440e032c3eb0f 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -394,7 +394,7 @@ end let s = "abcα🐨\0x\0" for T in (UInt8, UInt16, UInt32, Int32) - @test transcode(T, s) == transcode(T, Vector{UInt8}(s)) + @test transcode(T, s) == transcode(T, Base.StringBytes(s)) @test transcode(String, transcode(T, s)) == s end end diff --git a/test/random.jl b/test/random.jl index 9c25bb00648a7..f03382ab9f244 100644 --- a/test/random.jl +++ b/test/random.jl @@ -605,7 +605,7 @@ let b = ['0':'9';'A':'Z';'a':'z'] @test length(randstring(rng...)) == 8 @test length(randstring(rng..., 20)) == 20 @test issubset(randstring(rng...), b) - for c = ['a':'z', "qwèrtï", Set(Vector{UInt8}("gcat"))], + for c = ['a':'z', "qwèrtï", Set(Base.StringBytes("gcat"))], len = [8, 20] s = len == 8 ? randstring(rng..., c) : randstring(rng..., c, len) @test length(s) == len diff --git a/test/read.jl b/test/read.jl index c72ac33deb9b4..960176108a861 100644 --- a/test/read.jl +++ b/test/read.jl @@ -165,7 +165,7 @@ for (name, f) in l @test readuntil(io(t), s) == m @test readuntil(io(t), SubString(s, start(s), endof(s))) == m @test readuntil(io(t), GenericString(s)) == m - @test readuntil(io(t), Vector{UInt8}(s)) == Vector{UInt8}(m) + @test readuntil(io(t), unsafe_wrap(Vector{UInt8},s)) == unsafe_wrap(Vector{UInt8},m) @test readuntil(io(t), collect(s)::Vector{Char}) == Vector{Char}(m) end cleanup() @@ -223,7 +223,7 @@ for (name, f) in l verbose && println("$name read...") - @test read(io()) == Vector{UInt8}(text) + @test read(io()) == unsafe_wrap(Vector{UInt8},text) @test read(io()) == read(filename) @@ -331,7 +331,7 @@ for (name, f) in l @test read("$filename.to", String) == text verbose && println("$name write(::IOBuffer, ...)") - to = IOBuffer(copy(Vector{UInt8}(text)), false, true) + to = IOBuffer(copy(Base.StringBytes(text)), false, true) write(to, io()) @test String(take!(to)) == text @@ -400,14 +400,14 @@ test_read_nbyte() let s = "qwerty" - @test read(IOBuffer(s)) == Vector{UInt8}(s) - @test read(IOBuffer(s), 10) == Vector{UInt8}(s) - @test read(IOBuffer(s), 1) == Vector{UInt8}(s)[1:1] + @test read(IOBuffer(s)) == Base.StringBytes(s) + @test read(IOBuffer(s), 10) == Base.StringBytes(s) + @test read(IOBuffer(s), 1) == Base.StringBytes(s)[1:1] # Test growing output array x = UInt8[] n = readbytes!(IOBuffer(s), x, 10) - @test x == Vector{UInt8}(s) + @test x == Base.StringBytes(s) @test n == length(x) end diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 092828c6c4c76..385d8986d046d 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -206,7 +206,7 @@ struct tstStringType <: AbstractString data::Array{UInt8,1} end @testset "AbstractString functions" begin - tstr = tstStringType(Vector{UInt8}("12")) + tstr = tstStringType(unsafe_wrap(Vector{UInt8},"12")) @test_throws MethodError ncodeunits(tstr) @test_throws MethodError codeunit(tstr) @test_throws MethodError codeunit(tstr, 1) @@ -697,7 +697,7 @@ end end end -@test Vector{UInt8}("\xcc\xdd\xee\xff\x80") == [0xcc,0xdd,0xee,0xff,0x80] +@test unsafe_wrap(Vector{UInt8},"\xcc\xdd\xee\xff\x80") == [0xcc,0xdd,0xee,0xff,0x80] @test next("a", 1)[2] == 2 @test nextind("a", 1) == 2