JuliaLang · bkamins · Oct 13, 2017 · Oct 13, 2017 · Oct 13, 2017 · nalimilan
diff --git a/NEWS.md b/NEWS.md
@@ -640,6 +640,9 @@ Breaking changes
 
 This section lists changes that do not have deprecation warnings.
 
+  * Behavior of functions `search` and `searchindex` is now more consistent in corner
+    cases which may lead to a different return value or an error thrown ([#?????]).
+
   * The constructor of `SubString` now checks if the requsted view range
     is defined by valid indices in the parent `AbstractString` ([#22511]).
 

diff --git a/base/strings/search.jl b/base/strings/search.jl
@@ -3,18 +3,39 @@
 const Chars = Union{Char,Tuple{Vararg{Char}},AbstractVector{Char},Set{Char}}
 
 """
-    search(string::AbstractString, chars::Chars, [start::Integer])
+    search(string, chars, [start::Integer])
 
 Search for the first occurrence of the given characters within the given string. The second
 argument may be a single character, a vector or a set of characters, a string, or a regular
 expression (though regular expressions are only allowed on contiguous strings, such as ASCII
-or UTF-8 strings). The third argument optionally specifies a starting index. The return
-value is a range of indexes where the matching sequence is found, such that `s[search(s,x)] == x`:
+or UTF-8 strings).
+Alternatively the first and the second arguments may be arrays of `Int8` or `UInt8`.
+The third argument optionally specifies a starting index.
+If it is not a valid index into the first argument then error is thrown.
+The return value when the second argument is a string or a vector of bytes is the samllest
+range of indexes where the matching sequence is found, such that `s[search(s,x)] == x`,
+or `0:-1` if no such range of indexes exits:
 
-`search(string, "substring")` = `start:end` such that `string[start:end] == "substring"`, or
-`0:-1` if unmatched.
+`search(string, "substring", i)` = `start:end` such that `string[start:end] == "substring"`
+and `start >= i`.
 
-`search(string, 'c')` = `index` such that `string[index] == 'c'`, or `0` if unmatched.
+In particular a search for a zero length second argument returns `i:(i-1)`.
+
+If the first argument is empty and third argument is not given `0:-1` is returned.
+
+The return value when the second argument is a signle character, a vector or a set of
+characters is the smallest index for which the match is found, i.e. `s[search(s,x)] in x`,
+or `0` if no such index exists.
+
+`search(string, ['a', 'b'], i)` = `index` such that `string[index] in ['a', 'b']`.
+
+In particular a search for `Char[]` or `Set{Char}()` as a second argument returns `0`.
+
+If the first argument is empty and third argument is not given `0` is returned.
+
+Additionally the second argument can be `Int8` or `UInt8` in this case if the first argument
+is `String` then it is searched as if it were an array of bytes and a returned match might
+point to an invalid index for a given string.
 
 # Examples
 ```jldoctest
@@ -23,176 +44,190 @@ julia> search("Hello to the world", "z")
 
 julia> search("JuliaLang","Julia")
 1:5
+
+julia> search(Int8[1,2,3], Int8[2,3])
+2:3
+
+julia> search("Hello to the world", 'z')
+0
+
+julia> search("Hello to the world", ['e', 'l'])
+2
+
+julia> search("", ['e', 'l'])
+0
 ```
 """
 function search(s::AbstractString, c::Chars, i::Integer)
-    if isempty(c)
-        return 1 <= i <= nextind(s,endof(s)) ? i :
-               throw(BoundsError(s, i))
-    end
-    if i < 1 || i > nextind(s,endof(s))
-        throw(BoundsError(s, i))
-    end
-    while !done(s,i)
-        d, j = next(s,i)
+    isvalid(s, i) || throw(ArgumentError("index $i is invalid"))
+    isempty(c) && return 0
+    while !done(s, i)
+        d, j = next(s, i)
         if d in c
             return i
         end
         i = j
     end
     return 0
 end
-search(s::AbstractString, c::Chars) = search(s,c,start(s))
 
-in(c::Char, s::AbstractString) = (search(s,c)!=0)
+search(s::AbstractString, c::Chars) = s == "" ? 0 : search(s, c, start(s))
+
+in(c::Char, s::AbstractString) = search(s,c) != 0
 
 function _searchindex(s, t, i)
-    if isempty(t)
-        return 1 <= i <= nextind(s,endof(s)) ? i :
-               throw(BoundsError(s, i))
-    end
+    # assumes that caller has checked if i is a valid index
+    isempty(t) && return i
+
     t1, j2 = next(t,start(t))
     while true
-        i = search(s,t1,i)
-        if i == 0 return 0 end
-        c, ii = next(s,i)
+        i = search(s, t1, i)
+        i == 0 && return 0
+        c, ii = next(s, i)
         j = j2; k = ii
         matched = true
-        while !done(t,j)
-            if done(s,k)
+        while !done(t, j)
+            if done(s, k)
                 matched = false
                 break
             end
-            c, k = next(s,k)
-            d, j = next(t,j)
+            c, k = next(s, k)
+            d, j = next(t, j)
             if c != d
                 matched = false
                 break
             end
         end
-        if matched
-            return i
-        end
+        matched && return i
         i = ii
     end
 end
 
-_searchindex(s, t::Char, i) = search(s, t, i)
-
 function _search_bloom_mask(c)
     UInt64(1) << (c & 63)
 end
 
-_nthbyte(s::String, i) = codeunit(s, i)
-_nthbyte(a::ByteArray, i) = a[i]
+_nthbyte(s::String, i) = @inbounds codeunit(s, i)
+_nthbyte(a::ByteArray, i) = @inbounds a[i]
 
 function _searchindex(s::Union{String,ByteArray}, t::Union{String,ByteArray}, i)
-    n = sizeof(t)
-    m = sizeof(s)
+    # assumes that caller has checked if i is a valid index
+    isempty(t) && return i
 
-    if n == 0
-        return 1 <= i <= m+1 ? max(1, i) : 0
-    elseif m == 0
-        return 0
-    elseif n == 1
-        return search(s, _nthbyte(t,1), i)
-    end
+    m = sizeof(s)
+    n = sizeof(t)
 
+    n == 0 && return i
+    n == 1 && return search(s, _nthbyte(t, 1), i)
     w = m - n
-    if w < 0 || i - 1 > w
-        return 0
-    end
+    w < i - 1 && return 0
 
     bloom_mask = UInt64(0)
     skip = n - 1
-    tlast = _nthbyte(t,n)
+    tlast = _nthbyte(t, n)
     for j in 1:n
-        bloom_mask |= _search_bloom_mask(_nthbyte(t,j))
-        if _nthbyte(t,j) == tlast && j < n
+        bloom_mask |= _search_bloom_mask(_nthbyte(t, j))
+        if _nthbyte(t, j) == tlast && j < n
             skip = n - j - 1
         end
     end
 
     i -= 1
     while i <= w
-        if _nthbyte(s,i+n) == tlast
+        if _nthbyte(s, i + n) == tlast
             # check candidate
             j = 0
             while j < n - 1
-                if _nthbyte(s,i+j+1) != _nthbyte(t,j+1)
+                if _nthbyte(s, i + j + 1) != _nthbyte(t, j + 1)
                     break
                 end
                 j += 1
             end
 
             # match found
             if j == n - 1
-                return i+1
+                return i + 1
             end
 
             # no match, try to rule out the next character
-            if i < w && bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0
+            if i < w && bloom_mask & _search_bloom_mask(_nthbyte(s, i + n + 1)) == 0
                 i += n
             else
                 i += skip
             end
         elseif i < w
-            if bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0
+            if bloom_mask & _search_bloom_mask(_nthbyte(s, i + n + 1)) == 0
                 i += n
             end
         end
         i += 1
     end
-
     0
 end
 
-searchindex(s::ByteArray, t::ByteArray, i) = _searchindex(s,t,i)
-
 """
-    searchindex(s::AbstractString, substring, [start::Integer])
+    searchindex(s, t, [start::Integer])
 
-Similar to [`search`](@ref), but return only the start index at which
-the substring is found, or `0` if it is not.
+Similar to [`search`](@ref), but return only the start index at which the second argument
+is found, or `0` if it is not.
 
 # Examples
 ```jldoctest
 julia> searchindex("Hello to the world", "z")
 0
 
-julia> searchindex("JuliaLang","Julia")
+julia> searchindex("JuliaLang", "Julia")
 1
 
-julia> searchindex("JuliaLang","Lang")
+julia> searchindex("JuliaLang", "Lang")
 6
+
+julia> searchindex(Int8[1, 2, 3], Int8[3], 2)
+3
+
+julia> searchindex("abc", 'b', 3)
+0
+
+julia> searchindex("", 'b', 3)
+0
 ```
 """
-searchindex(s::AbstractString, t::AbstractString, i::Integer) = _searchindex(s,t,i)
-searchindex(s::AbstractString, t::AbstractString) = searchindex(s,t,start(s))
-searchindex(s::AbstractString, c::Char, i::Integer) = _searchindex(s,c,i)
-searchindex(s::AbstractString, c::Char) = searchindex(s,c,start(s))
+searchindex(s::AbstractString, c::Chars, i::Integer) = search(s, c, i)
+searchindex(s::AbstractString, c::Chars) = s == "" ? 0 : search(s, c, start(s))
+function searchindex(s::ByteArray, t::ByteArray, i::Integer)
+    if !(1 <= i <= length(s))
+        throw(BoundsError(s, i))
+    end
+    _searchindex(s, t, i)
+end
+searchindex(s::ByteArray, t::ByteArray) = length(s) == 0 ? 0 : searchindex(s, t, 1)
 
-function searchindex(s::String, t::String, i::Integer=1)
-    # Check for fast case of a single byte
-    # (for multi-byte UTF-8 sequences, use searchindex on byte arrays instead)
+function searchindex(s::AbstractString, t::AbstractString, i::Integer)
+    # Check for fast case of a single character
+    # (for multi-character sequences, use searchindex on byte arrays instead)
     if endof(t) == 1
         search(s, t[1], i)
     else
+        isvalid(s, i) || throw(ArgumentError("index $i is invalid"))
         _searchindex(s, t, i)
     end
 end
 
+searchindex(s::AbstractString, t::AbstractString) = s == "" ? 0 : searchindex(s, t, start(s))
+
 function _search(s, t, i::Integer)
-    idx = searchindex(s,t,i)
+    idx = searchindex(s, t, i)
     if isempty(t)
         idx:idx-1
     else
         idx:(idx > 0 ? idx + endof(t) - 1 : -1)
     end
 end
 
-search(s::AbstractString, t::AbstractString, i::Integer=start(s)) = _search(s, t, i)
-search(s::ByteArray, t::ByteArray, i::Integer=start(s)) = _search(s, t, i)
+search(s::AbstractString, t::AbstractString, i::Integer) = _search(s, t, i)
+search(s::AbstractString, t::AbstractString) = s == "" ? (0:-1) : _search(s, t, start(s))
+search(s::ByteArray, t::ByteArray, i::Integer) = _search(s, t, i)
+search(s::ByteArray, t::ByteArray) = length(s)==0 ? (0:-1) : _search(s, t, 1)
 
 function rsearch(s::AbstractString, c::Chars)
     j = search(RevString(s), c)

diff --git a/base/strings/string.jl b/base/strings/string.jl
@@ -306,42 +306,37 @@ function getindex(s::String, r::UnitRange{Int})
     unsafe_string(pointer(s,i), j-i)
 end
 
-function search(s::String, c::Char, i::Integer = 1)
-    if i < 1 || i > sizeof(s)
-        i == sizeof(s) + 1 && return 0
-        throw(BoundsError(s, i))
-    end
-    @inbounds if is_valid_continuation(codeunit(s,i))
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s,i)))
-    end
+function search(s::String, c::Char, i::Integer)
+    isvalid(s, i) || throw(ArgumentError("index $i is invalid"))
     c < Char(0x80) && return search(s, c%UInt8, i)
     while true
         i = search(s, first_utf8_byte(c), i)
         (i==0 || s[i] == c) && return i
         i = next(s,i)[2]
     end
 end
+search(s::String, c::Char) = s == "" ? 0 : search(s, c, 1)
 
-function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1)
-    if i < 1
-        throw(BoundsError(a, i))
-    end
+function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer)
     n = sizeof(a)
-    if i > n
-        return i == n+1 ? 0 : throw(BoundsError(a, i))
+    if !(1 <= i <= n)
+        throw(BoundsError(s, i))
     end
     p = pointer(a)
-    q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1)
-    q == C_NULL ? 0 : Int(q-p+1)
+    q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p + i - 1, b, n - i + 1)
+    q == C_NULL ? 0 : Int(q - p + 1)
 end
+search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}) =
+    sizeof(a) == 0 ? 0 : search(a, b, 1)
 
-function search(a::ByteArray, b::Char, i::Integer = 1)
+function search(a::ByteArray, b::Char, i::Integer)
     if isascii(b)
-        search(a,UInt8(b),i)
+        search(a, UInt8(b), i)
     else
-        search(a,Vector{UInt8}(string(b)),i).start
+        search(a, Vector{UInt8}(string(b)), i).start
     end
 end
+search(a::ByteArray, b::Char) = length(a) == 0 ? 0 : search(a, b, 1)
 
 function rsearch(s::String, c::Char, i::Integer = sizeof(s))
     c < Char(0x80) && return rsearch(s, c%UInt8, i)