diff --git a/NEWS.md b/NEWS.md index 792016be2b39c..d9acd5337cb42 100644 --- a/NEWS.md +++ b/NEWS.md @@ -640,6 +640,9 @@ Breaking changes This section lists changes that do not have deprecation warnings. + * Behavior of functions `search` and `searchindex` is now more consistent in corner + cases which may lead to a different return value or an error thrown ([#?????]). + * The constructor of `SubString` now checks if the requsted view range is defined by valid indices in the parent `AbstractString` ([#22511]). diff --git a/base/strings/search.jl b/base/strings/search.jl index 23f813ea28b26..08cc838488a42 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -3,18 +3,39 @@ const Chars = Union{Char,Tuple{Vararg{Char}},AbstractVector{Char},Set{Char}} """ - search(string::AbstractString, chars::Chars, [start::Integer]) + search(string, chars, [start::Integer]) Search for the first occurrence of the given characters within the given string. The second argument may be a single character, a vector or a set of characters, a string, or a regular expression (though regular expressions are only allowed on contiguous strings, such as ASCII -or UTF-8 strings). The third argument optionally specifies a starting index. The return -value is a range of indexes where the matching sequence is found, such that `s[search(s,x)] == x`: +or UTF-8 strings). +Alternatively the first and the second arguments may be arrays of `Int8` or `UInt8`. +The third argument optionally specifies a starting index. +If it is not a valid index into the first argument then error is thrown. +The return value when the second argument is a string or a vector of bytes is the samllest +range of indexes where the matching sequence is found, such that `s[search(s,x)] == x`, +or `0:-1` if no such range of indexes exits: -`search(string, "substring")` = `start:end` such that `string[start:end] == "substring"`, or -`0:-1` if unmatched. +`search(string, "substring", i)` = `start:end` such that `string[start:end] == "substring"` +and `start >= i`. -`search(string, 'c')` = `index` such that `string[index] == 'c'`, or `0` if unmatched. +In particular a search for a zero length second argument returns `i:(i-1)`. + +If the first argument is empty and third argument is not given `0:-1` is returned. + +The return value when the second argument is a signle character, a vector or a set of +characters is the smallest index for which the match is found, i.e. `s[search(s,x)] in x`, +or `0` if no such index exists. + +`search(string, ['a', 'b'], i)` = `index` such that `string[index] in ['a', 'b']`. + +In particular a search for `Char[]` or `Set{Char}()` as a second argument returns `0`. + +If the first argument is empty and third argument is not given `0` is returned. + +Additionally the second argument can be `Int8` or `UInt8` in this case if the first argument +is `String` then it is searched as if it were an array of bytes and a returned match might +point to an invalid index for a given string. # Examples ```jldoctest @@ -23,18 +44,25 @@ julia> search("Hello to the world", "z") julia> search("JuliaLang","Julia") 1:5 + +julia> search(Int8[1,2,3], Int8[2,3]) +2:3 + +julia> search("Hello to the world", 'z') +0 + +julia> search("Hello to the world", ['e', 'l']) +2 + +julia> search("", ['e', 'l']) +0 ``` """ function search(s::AbstractString, c::Chars, i::Integer) - if isempty(c) - return 1 <= i <= nextind(s,endof(s)) ? i : - throw(BoundsError(s, i)) - end - if i < 1 || i > nextind(s,endof(s)) - throw(BoundsError(s, i)) - end - while !done(s,i) - d, j = next(s,i) + isvalid(s, i) || throw(ArgumentError("index $i is invalid")) + isempty(c) && return 0 + while !done(s, i) + d, j = next(s, i) if d in c return i end @@ -42,84 +70,75 @@ function search(s::AbstractString, c::Chars, i::Integer) end return 0 end -search(s::AbstractString, c::Chars) = search(s,c,start(s)) -in(c::Char, s::AbstractString) = (search(s,c)!=0) +search(s::AbstractString, c::Chars) = s == "" ? 0 : search(s, c, start(s)) + +in(c::Char, s::AbstractString) = search(s,c) != 0 function _searchindex(s, t, i) - if isempty(t) - return 1 <= i <= nextind(s,endof(s)) ? i : - throw(BoundsError(s, i)) - end + # assumes that caller has checked if i is a valid index + isempty(t) && return i + t1, j2 = next(t,start(t)) while true - i = search(s,t1,i) - if i == 0 return 0 end - c, ii = next(s,i) + i = search(s, t1, i) + i == 0 && return 0 + c, ii = next(s, i) j = j2; k = ii matched = true - while !done(t,j) - if done(s,k) + while !done(t, j) + if done(s, k) matched = false break end - c, k = next(s,k) - d, j = next(t,j) + c, k = next(s, k) + d, j = next(t, j) if c != d matched = false break end end - if matched - return i - end + matched && return i i = ii end end -_searchindex(s, t::Char, i) = search(s, t, i) - function _search_bloom_mask(c) UInt64(1) << (c & 63) end -_nthbyte(s::String, i) = codeunit(s, i) -_nthbyte(a::ByteArray, i) = a[i] +_nthbyte(s::String, i) = @inbounds codeunit(s, i) +_nthbyte(a::ByteArray, i) = @inbounds a[i] function _searchindex(s::Union{String,ByteArray}, t::Union{String,ByteArray}, i) - n = sizeof(t) - m = sizeof(s) + # assumes that caller has checked if i is a valid index + isempty(t) && return i - if n == 0 - return 1 <= i <= m+1 ? max(1, i) : 0 - elseif m == 0 - return 0 - elseif n == 1 - return search(s, _nthbyte(t,1), i) - end + m = sizeof(s) + n = sizeof(t) + n == 0 && return i + n == 1 && return search(s, _nthbyte(t, 1), i) w = m - n - if w < 0 || i - 1 > w - return 0 - end + w < i - 1 && return 0 bloom_mask = UInt64(0) skip = n - 1 - tlast = _nthbyte(t,n) + tlast = _nthbyte(t, n) for j in 1:n - bloom_mask |= _search_bloom_mask(_nthbyte(t,j)) - if _nthbyte(t,j) == tlast && j < n + bloom_mask |= _search_bloom_mask(_nthbyte(t, j)) + if _nthbyte(t, j) == tlast && j < n skip = n - j - 1 end end i -= 1 while i <= w - if _nthbyte(s,i+n) == tlast + if _nthbyte(s, i + n) == tlast # check candidate j = 0 while j < n - 1 - if _nthbyte(s,i+j+1) != _nthbyte(t,j+1) + if _nthbyte(s, i + j + 1) != _nthbyte(t, j + 1) break end j += 1 @@ -127,63 +146,77 @@ function _searchindex(s::Union{String,ByteArray}, t::Union{String,ByteArray}, i) # match found if j == n - 1 - return i+1 + return i + 1 end # no match, try to rule out the next character - if i < w && bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0 + if i < w && bloom_mask & _search_bloom_mask(_nthbyte(s, i + n + 1)) == 0 i += n else i += skip end elseif i < w - if bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0 + if bloom_mask & _search_bloom_mask(_nthbyte(s, i + n + 1)) == 0 i += n end end i += 1 end - 0 end -searchindex(s::ByteArray, t::ByteArray, i) = _searchindex(s,t,i) - """ - searchindex(s::AbstractString, substring, [start::Integer]) + searchindex(s, t, [start::Integer]) -Similar to [`search`](@ref), but return only the start index at which -the substring is found, or `0` if it is not. +Similar to [`search`](@ref), but return only the start index at which the second argument +is found, or `0` if it is not. # Examples ```jldoctest julia> searchindex("Hello to the world", "z") 0 -julia> searchindex("JuliaLang","Julia") +julia> searchindex("JuliaLang", "Julia") 1 -julia> searchindex("JuliaLang","Lang") +julia> searchindex("JuliaLang", "Lang") 6 + +julia> searchindex(Int8[1, 2, 3], Int8[3], 2) +3 + +julia> searchindex("abc", 'b', 3) +0 + +julia> searchindex("", 'b', 3) +0 ``` """ -searchindex(s::AbstractString, t::AbstractString, i::Integer) = _searchindex(s,t,i) -searchindex(s::AbstractString, t::AbstractString) = searchindex(s,t,start(s)) -searchindex(s::AbstractString, c::Char, i::Integer) = _searchindex(s,c,i) -searchindex(s::AbstractString, c::Char) = searchindex(s,c,start(s)) +searchindex(s::AbstractString, c::Chars, i::Integer) = search(s, c, i) +searchindex(s::AbstractString, c::Chars) = s == "" ? 0 : search(s, c, start(s)) +function searchindex(s::ByteArray, t::ByteArray, i::Integer) + if !(1 <= i <= length(s)) + throw(BoundsError(s, i)) + end + _searchindex(s, t, i) +end +searchindex(s::ByteArray, t::ByteArray) = length(s) == 0 ? 0 : searchindex(s, t, 1) -function searchindex(s::String, t::String, i::Integer=1) - # Check for fast case of a single byte - # (for multi-byte UTF-8 sequences, use searchindex on byte arrays instead) +function searchindex(s::AbstractString, t::AbstractString, i::Integer) + # Check for fast case of a single character + # (for multi-character sequences, use searchindex on byte arrays instead) if endof(t) == 1 search(s, t[1], i) else + isvalid(s, i) || throw(ArgumentError("index $i is invalid")) _searchindex(s, t, i) end end +searchindex(s::AbstractString, t::AbstractString) = s == "" ? 0 : searchindex(s, t, start(s)) + function _search(s, t, i::Integer) - idx = searchindex(s,t,i) + idx = searchindex(s, t, i) if isempty(t) idx:idx-1 else @@ -191,8 +224,10 @@ function _search(s, t, i::Integer) end end -search(s::AbstractString, t::AbstractString, i::Integer=start(s)) = _search(s, t, i) -search(s::ByteArray, t::ByteArray, i::Integer=start(s)) = _search(s, t, i) +search(s::AbstractString, t::AbstractString, i::Integer) = _search(s, t, i) +search(s::AbstractString, t::AbstractString) = s == "" ? (0:-1) : _search(s, t, start(s)) +search(s::ByteArray, t::ByteArray, i::Integer) = _search(s, t, i) +search(s::ByteArray, t::ByteArray) = length(s)==0 ? (0:-1) : _search(s, t, 1) function rsearch(s::AbstractString, c::Chars) j = search(RevString(s), c) diff --git a/base/strings/string.jl b/base/strings/string.jl index 45d1a05b352ca..504e42c95c744 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -306,14 +306,8 @@ function getindex(s::String, r::UnitRange{Int}) unsafe_string(pointer(s,i), j-i) end -function search(s::String, c::Char, i::Integer = 1) - if i < 1 || i > sizeof(s) - i == sizeof(s) + 1 && return 0 - throw(BoundsError(s, i)) - end - @inbounds if is_valid_continuation(codeunit(s,i)) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s,i))) - end +function search(s::String, c::Char, i::Integer) + isvalid(s, i) || throw(ArgumentError("index $i is invalid")) c < Char(0x80) && return search(s, c%UInt8, i) while true i = search(s, first_utf8_byte(c), i) @@ -321,27 +315,28 @@ function search(s::String, c::Char, i::Integer = 1) i = next(s,i)[2] end end +search(s::String, c::Char) = s == "" ? 0 : search(s, c, 1) -function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1) - if i < 1 - throw(BoundsError(a, i)) - end +function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer) n = sizeof(a) - if i > n - return i == n+1 ? 0 : throw(BoundsError(a, i)) + if !(1 <= i <= n) + throw(BoundsError(s, i)) end p = pointer(a) - q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1) - q == C_NULL ? 0 : Int(q-p+1) + q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p + i - 1, b, n - i + 1) + q == C_NULL ? 0 : Int(q - p + 1) end +search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}) = + sizeof(a) == 0 ? 0 : search(a, b, 1) -function search(a::ByteArray, b::Char, i::Integer = 1) +function search(a::ByteArray, b::Char, i::Integer) if isascii(b) - search(a,UInt8(b),i) + search(a, UInt8(b), i) else - search(a,Vector{UInt8}(string(b)),i).start + search(a, Vector{UInt8}(string(b)), i).start end end +search(a::ByteArray, b::Char) = length(a) == 0 ? 0 : search(a, b, 1) function rsearch(s::String, c::Char, i::Integer = sizeof(s)) c < Char(0x80) && return rsearch(s, c%UInt8, i)