-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Make search more consistent #24121
Make search more consistent #24121
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,18 +3,39 @@ | |
const Chars = Union{Char,Tuple{Vararg{Char}},AbstractVector{Char},Set{Char}} | ||
|
||
""" | ||
search(string::AbstractString, chars::Chars, [start::Integer]) | ||
search(string, chars, [start::Integer]) | ||
|
||
Search for the first occurrence of the given characters within the given string. The second | ||
argument may be a single character, a vector or a set of characters, a string, or a regular | ||
expression (though regular expressions are only allowed on contiguous strings, such as ASCII | ||
or UTF-8 strings). The third argument optionally specifies a starting index. The return | ||
value is a range of indexes where the matching sequence is found, such that `s[search(s,x)] == x`: | ||
or UTF-8 strings). | ||
Alternatively the first and the second arguments may be arrays of `Int8` or `UInt8`. | ||
The third argument optionally specifies a starting index. | ||
If it is not a valid index into the first argument then error is thrown. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This goes without saying, I don't think it's worth making the description longer than it already is to mention this. |
||
The return value when the second argument is a string or a vector of bytes is the samllest | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "samllest". "exits". |
||
range of indexes where the matching sequence is found, such that `s[search(s,x)] == x`, | ||
or `0:-1` if no such range of indexes exits: | ||
|
||
`search(string, "substring")` = `start:end` such that `string[start:end] == "substring"`, or | ||
`0:-1` if unmatched. | ||
`search(string, "substring", i)` = `start:end` such that `string[start:end] == "substring"` | ||
and `start >= i`. | ||
|
||
`search(string, 'c')` = `index` such that `string[index] == 'c'`, or `0` if unmatched. | ||
In particular a search for a zero length second argument returns `i:(i-1)`. | ||
|
||
If the first argument is empty and third argument is not given `0:-1` is returned. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better give the general rule and use |
||
|
||
The return value when the second argument is a signle character, a vector or a set of | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "signle" |
||
characters is the smallest index for which the match is found, i.e. `s[search(s,x)] in x`, | ||
or `0` if no such index exists. | ||
|
||
`search(string, ['a', 'b'], i)` = `index` such that `string[index] in ['a', 'b']`. | ||
|
||
In particular a search for `Char[]` or `Set{Char}()` as a second argument returns `0`. | ||
|
||
If the first argument is empty and third argument is not given `0` is returned. | ||
|
||
Additionally the second argument can be `Int8` or `UInt8` in this case if the first argument | ||
is `String` then it is searched as if it were an array of bytes and a returned match might | ||
point to an invalid index for a given string. | ||
|
||
# Examples | ||
```jldoctest | ||
|
@@ -23,176 +44,190 @@ julia> search("Hello to the world", "z") | |
|
||
julia> search("JuliaLang","Julia") | ||
1:5 | ||
|
||
julia> search(Int8[1,2,3], Int8[2,3]) | ||
2:3 | ||
|
||
julia> search("Hello to the world", 'z') | ||
0 | ||
|
||
julia> search("Hello to the world", ['e', 'l']) | ||
2 | ||
|
||
julia> search("", ['e', 'l']) | ||
0 | ||
``` | ||
""" | ||
function search(s::AbstractString, c::Chars, i::Integer) | ||
if isempty(c) | ||
return 1 <= i <= nextind(s,endof(s)) ? i : | ||
throw(BoundsError(s, i)) | ||
end | ||
if i < 1 || i > nextind(s,endof(s)) | ||
throw(BoundsError(s, i)) | ||
end | ||
while !done(s,i) | ||
d, j = next(s,i) | ||
isvalid(s, i) || throw(ArgumentError("index $i is invalid")) | ||
isempty(c) && return 0 | ||
while !done(s, i) | ||
d, j = next(s, i) | ||
if d in c | ||
return i | ||
end | ||
i = j | ||
end | ||
return 0 | ||
end | ||
search(s::AbstractString, c::Chars) = search(s,c,start(s)) | ||
|
||
in(c::Char, s::AbstractString) = (search(s,c)!=0) | ||
search(s::AbstractString, c::Chars) = s == "" ? 0 : search(s, c, start(s)) | ||
|
||
in(c::Char, s::AbstractString) = search(s,c) != 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Plase don't change unrelated lines. Same below. This makes it really painful to review. |
||
|
||
function _searchindex(s, t, i) | ||
if isempty(t) | ||
return 1 <= i <= nextind(s,endof(s)) ? i : | ||
throw(BoundsError(s, i)) | ||
end | ||
# assumes that caller has checked if i is a valid index | ||
isempty(t) && return i | ||
|
||
t1, j2 = next(t,start(t)) | ||
while true | ||
i = search(s,t1,i) | ||
if i == 0 return 0 end | ||
c, ii = next(s,i) | ||
i = search(s, t1, i) | ||
i == 0 && return 0 | ||
c, ii = next(s, i) | ||
j = j2; k = ii | ||
matched = true | ||
while !done(t,j) | ||
if done(s,k) | ||
while !done(t, j) | ||
if done(s, k) | ||
matched = false | ||
break | ||
end | ||
c, k = next(s,k) | ||
d, j = next(t,j) | ||
c, k = next(s, k) | ||
d, j = next(t, j) | ||
if c != d | ||
matched = false | ||
break | ||
end | ||
end | ||
if matched | ||
return i | ||
end | ||
matched && return i | ||
i = ii | ||
end | ||
end | ||
|
||
_searchindex(s, t::Char, i) = search(s, t, i) | ||
|
||
function _search_bloom_mask(c) | ||
UInt64(1) << (c & 63) | ||
end | ||
|
||
_nthbyte(s::String, i) = codeunit(s, i) | ||
_nthbyte(a::ByteArray, i) = a[i] | ||
_nthbyte(s::String, i) = @inbounds codeunit(s, i) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better add |
||
_nthbyte(a::ByteArray, i) = @inbounds a[i] | ||
|
||
function _searchindex(s::Union{String,ByteArray}, t::Union{String,ByteArray}, i) | ||
n = sizeof(t) | ||
m = sizeof(s) | ||
# assumes that caller has checked if i is a valid index | ||
isempty(t) && return i | ||
|
||
if n == 0 | ||
return 1 <= i <= m+1 ? max(1, i) : 0 | ||
elseif m == 0 | ||
return 0 | ||
elseif n == 1 | ||
return search(s, _nthbyte(t,1), i) | ||
end | ||
m = sizeof(s) | ||
n = sizeof(t) | ||
|
||
n == 0 && return i | ||
n == 1 && return search(s, _nthbyte(t, 1), i) | ||
w = m - n | ||
if w < 0 || i - 1 > w | ||
return 0 | ||
end | ||
w < i - 1 && return 0 | ||
|
||
bloom_mask = UInt64(0) | ||
skip = n - 1 | ||
tlast = _nthbyte(t,n) | ||
tlast = _nthbyte(t, n) | ||
for j in 1:n | ||
bloom_mask |= _search_bloom_mask(_nthbyte(t,j)) | ||
if _nthbyte(t,j) == tlast && j < n | ||
bloom_mask |= _search_bloom_mask(_nthbyte(t, j)) | ||
if _nthbyte(t, j) == tlast && j < n | ||
skip = n - j - 1 | ||
end | ||
end | ||
|
||
i -= 1 | ||
while i <= w | ||
if _nthbyte(s,i+n) == tlast | ||
if _nthbyte(s, i + n) == tlast | ||
# check candidate | ||
j = 0 | ||
while j < n - 1 | ||
if _nthbyte(s,i+j+1) != _nthbyte(t,j+1) | ||
if _nthbyte(s, i + j + 1) != _nthbyte(t, j + 1) | ||
break | ||
end | ||
j += 1 | ||
end | ||
|
||
# match found | ||
if j == n - 1 | ||
return i+1 | ||
return i + 1 | ||
end | ||
|
||
# no match, try to rule out the next character | ||
if i < w && bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0 | ||
if i < w && bloom_mask & _search_bloom_mask(_nthbyte(s, i + n + 1)) == 0 | ||
i += n | ||
else | ||
i += skip | ||
end | ||
elseif i < w | ||
if bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0 | ||
if bloom_mask & _search_bloom_mask(_nthbyte(s, i + n + 1)) == 0 | ||
i += n | ||
end | ||
end | ||
i += 1 | ||
end | ||
|
||
0 | ||
end | ||
|
||
searchindex(s::ByteArray, t::ByteArray, i) = _searchindex(s,t,i) | ||
|
||
""" | ||
searchindex(s::AbstractString, substring, [start::Integer]) | ||
searchindex(s, t, [start::Integer]) | ||
|
||
Similar to [`search`](@ref), but return only the start index at which | ||
the substring is found, or `0` if it is not. | ||
Similar to [`search`](@ref), but return only the start index at which the second argument | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "substring" was clearer, let's not make the docs worse because of that |
||
is found, or `0` if it is not. | ||
|
||
# Examples | ||
```jldoctest | ||
julia> searchindex("Hello to the world", "z") | ||
0 | ||
|
||
julia> searchindex("JuliaLang","Julia") | ||
julia> searchindex("JuliaLang", "Julia") | ||
1 | ||
|
||
julia> searchindex("JuliaLang","Lang") | ||
julia> searchindex("JuliaLang", "Lang") | ||
6 | ||
|
||
julia> searchindex(Int8[1, 2, 3], Int8[3], 2) | ||
3 | ||
|
||
julia> searchindex("abc", 'b', 3) | ||
0 | ||
|
||
julia> searchindex("", 'b', 3) | ||
0 | ||
``` | ||
""" | ||
searchindex(s::AbstractString, t::AbstractString, i::Integer) = _searchindex(s,t,i) | ||
searchindex(s::AbstractString, t::AbstractString) = searchindex(s,t,start(s)) | ||
searchindex(s::AbstractString, c::Char, i::Integer) = _searchindex(s,c,i) | ||
searchindex(s::AbstractString, c::Char) = searchindex(s,c,start(s)) | ||
searchindex(s::AbstractString, c::Chars, i::Integer) = search(s, c, i) | ||
searchindex(s::AbstractString, c::Chars) = s == "" ? 0 : search(s, c, start(s)) | ||
function searchindex(s::ByteArray, t::ByteArray, i::Integer) | ||
if !(1 <= i <= length(s)) | ||
throw(BoundsError(s, i)) | ||
end | ||
_searchindex(s, t, i) | ||
end | ||
searchindex(s::ByteArray, t::ByteArray) = length(s) == 0 ? 0 : searchindex(s, t, 1) | ||
|
||
function searchindex(s::String, t::String, i::Integer=1) | ||
# Check for fast case of a single byte | ||
# (for multi-byte UTF-8 sequences, use searchindex on byte arrays instead) | ||
function searchindex(s::AbstractString, t::AbstractString, i::Integer) | ||
# Check for fast case of a single character | ||
# (for multi-character sequences, use searchindex on byte arrays instead) | ||
if endof(t) == 1 | ||
search(s, t[1], i) | ||
else | ||
isvalid(s, i) || throw(ArgumentError("index $i is invalid")) | ||
_searchindex(s, t, i) | ||
end | ||
end | ||
|
||
searchindex(s::AbstractString, t::AbstractString) = s == "" ? 0 : searchindex(s, t, start(s)) | ||
|
||
function _search(s, t, i::Integer) | ||
idx = searchindex(s,t,i) | ||
idx = searchindex(s, t, i) | ||
if isempty(t) | ||
idx:idx-1 | ||
else | ||
idx:(idx > 0 ? idx + endof(t) - 1 : -1) | ||
end | ||
end | ||
|
||
search(s::AbstractString, t::AbstractString, i::Integer=start(s)) = _search(s, t, i) | ||
search(s::ByteArray, t::ByteArray, i::Integer=start(s)) = _search(s, t, i) | ||
search(s::AbstractString, t::AbstractString, i::Integer) = _search(s, t, i) | ||
search(s::AbstractString, t::AbstractString) = s == "" ? (0:-1) : _search(s, t, start(s)) | ||
search(s::ByteArray, t::ByteArray, i::Integer) = _search(s, t, i) | ||
search(s::ByteArray, t::ByteArray) = length(s)==0 ? (0:-1) : _search(s, t, 1) | ||
|
||
function rsearch(s::AbstractString, c::Chars) | ||
j = search(RevString(s), c) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd rather not document this, it really should be an internal function which shouldn't have been exposed. I was preparing a PR to merge
search
withfindfirst
which would also handle this.