Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove RevString; efficient generic reverseind #24708

Merged
merged 4 commits into from
Dec 4, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,12 @@ This section lists changes that do not have deprecation warnings.
`AbstractArray` types that specialized broadcasting using the old internal API will
need to switch to the new API. ([#20740])

* The `RevString` type has been removed from the language; `reverse(::String)` returns
a `String` with code points (or fragments thereof) in reverse order. In general,
`reverse(s)` should return a string of the same type and encoding as `s` with code
points in reverse order; any string type overrides `reverse` to return a different
type of string must also override `reverseind` to compute reversed indices correctly.

Library improvements
--------------------

Expand Down Expand Up @@ -409,6 +415,15 @@ Library improvements
* The `keys` of an `Associative` are now an `AbstractSet`. `Base.KeyIterator{<:Associative}`
has been changed to `KeySet{K, <:Associative{K}} <: AbstractSet{K}` ([#24580]).

* New function `ncodeunits(s::AbstractString)` gives the number of code units in a string.
The generic definition is constant time but calls `endof(s)` which may be inefficient.
Therefore custom string types may want to define direct `ncodeunits` methods.

* `reverseind(s::AbstractString, i::Integer)` now has an efficient generic fallback, so
custom string types do not need to provide their own efficient defintions. The generic
definition relies on `ncodeunits` however, so for optimal performance you may need to
define a custom method for that function.

Compiler/Runtime improvements
-----------------------------

Expand Down
2 changes: 1 addition & 1 deletion base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ export
Rational,
Regex,
RegexMatch,
RevString,
RoundFromZero,
RoundDown,
RoundingMode,
Expand Down Expand Up @@ -756,6 +755,7 @@ export
lstrip,
match,
matchall,
ncodeunits,
ndigits,
nextind,
normalize_string,
Expand Down
3 changes: 0 additions & 3 deletions base/precompile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -578,9 +578,6 @@ precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.LineEdit.PromptState,
precompile(Tuple{typeof(Base.LineEdit.input_string_newlines_aftercursor), Base.LineEdit.PromptState})
precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.REPL.REPLCompletionProvider, Base.LineEdit.PromptState})
precompile(Tuple{getfield(Base, Symbol("#kw##parse")), Array{Any, 1}, typeof(Base.parse), String})
precompile(Tuple{typeof(Base.isvalid), Base.RevString{String}, Int64})
precompile(Tuple{typeof(Base.nextind), Base.RevString{String}, Int64})
precompile(Tuple{typeof(Base.search), Base.RevString{String}, Array{Char, 1}, Int64})
precompile(Tuple{typeof(Base.rsearch), String, Array{Char, 1}, Int64})
precompile(Tuple{getfield(Base.REPLCompletions, Symbol("#kw##find_start_brace")), Array{Any, 1}, typeof(Base.REPLCompletions.find_start_brace), String})
precompile(Tuple{typeof(Core.Inference.isbits), Tuple{Void, Void, Void}})
Expand Down
13 changes: 8 additions & 5 deletions base/repl/REPLCompletions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ end
# closed start brace from the end of the string.
function find_start_brace(s::AbstractString; c_start='(', c_end=')')
braces = 0
r = RevString(s)
r = reverse(s)
i = start(r)
in_single_quotes = false
in_double_quotes = false
Expand All @@ -245,18 +245,21 @@ function find_start_brace(s::AbstractString; c_start='(', c_end=')')
in_back_ticks = true
end
else
if !in_back_ticks && !in_double_quotes && c == '\'' && !done(r, i) && next(r, i)[1]!='\\'
if !in_back_ticks && !in_double_quotes &&
c == '\'' && !done(r, i) && next(r, i)[1] != '\\'
in_single_quotes = !in_single_quotes
elseif !in_back_ticks && !in_single_quotes && c == '"' && !done(r, i) && next(r, i)[1]!='\\'
elseif !in_back_ticks && !in_single_quotes &&
c == '"' && !done(r, i) && next(r, i)[1] != '\\'
in_double_quotes = !in_double_quotes
elseif !in_single_quotes && !in_double_quotes && c == '`' && !done(r, i) && next(r, i)[1]!='\\'
elseif !in_single_quotes && !in_double_quotes &&
c == '`' && !done(r, i) && next(r, i)[1] != '\\'
in_back_ticks = !in_back_ticks
end
end
braces == 1 && break
end
braces != 1 && return 0:-1, -1
method_name_end = reverseind(r, i)
method_name_end = reverseind(s, i)
startind = nextind(s, rsearch(s, non_identifier_chars, method_name_end))
return (startind:endof(s), method_name_end)
end
Expand Down
2 changes: 1 addition & 1 deletion base/shell.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ function shell_parse(str::AbstractString, interpolate::Bool=true;
special::AbstractString="")
s = lstrip(str)
# strips the end but respects the space when the string ends with "\\ "
r = RevString(s)
r = reverse(s)
i = start(r)
c_old = nothing
while !done(r,i)
Expand Down
24 changes: 17 additions & 7 deletions base/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ julia> 'j' * "ulia"

one(::Union{T,Type{T}}) where {T<:AbstractString} = convert(T, "")

# generic number of code units; implementations generally know how long a string
# is though and should override this with a more efficient method
ncodeunits(s::AbstractString) = nextind(s, endof(s)) - 1

"""
length(s::AbstractString)

Expand Down Expand Up @@ -233,11 +237,11 @@ end
## Generic indexing functions ##

"""
thisind(str::AbstractString, i::Integer)
thisind(s::AbstractString, i::Integer)

Get the largest valid string index at or before `i`.
Returns `0` if there is no valid string index at or before `i`.
Returns `endof(str)` if `i≥endof(str)`.
If `i` is the index into a character in `s` then `thisind` returns the index of the
start of that character. If `i < start(s)` then it returns `start(s) - 1`.
If `i > ncodeunits(s)` then it returns `ncodeunits(s) + 1`.

# Examples
```jldoctest
Expand All @@ -253,15 +257,21 @@ julia> thisind("αβγdef", 3)
julia> thisind("αβγdef", 4)
3

julia> thisind("αβγdef", 20)
julia> thisind("αβγdef", 9)
9

julia> thisind("αβγdef", 10)
10

julia> thisind("αβγdef", 20)
10
"""
function thisind(s::AbstractString, i::Integer)
j = Int(i)
isvalid(s, j) && return j
j < start(s) && return 0
e = endof(s)
j >= endof(s) && return e
n = ncodeunits(s)
j > n && return n + 1
prevind(s, j)
end

Expand Down
52 changes: 26 additions & 26 deletions base/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,6 @@ end
search(s::AbstractString, t::AbstractString, i::Integer=start(s)) = _search(s, t, i)
search(s::ByteArray, t::ByteArray, i::Integer=start(s)) = _search(s, t, i)

function rsearch(s::AbstractString, c::Chars)
j = search(RevString(s), c)
j == 0 && return 0
endof(s)-j+1
end

"""
rsearch(s::AbstractString, chars::Chars, [start::Integer])

Expand All @@ -212,44 +206,50 @@ julia> rsearch("aaabbb","b")
6:6
```
"""
function rsearch(s::AbstractString, c::Chars, i::Integer)
e = endof(s)
j = search(RevString(s), c, e-i+1)
j == 0 && return 0
e-j+1
function rsearch(s::AbstractString, c::Chars, i::Integer=start(s))
if i < 1
return i == 0 ? 0 : throw(BoundsError(s, i))
end
n = ncodeunits(s)
if i > n
return i == n+1 ? 0 : throw(BoundsError(s, i))
end
# r[reverseind(r,i)] == reverse(r)[i] == s[i]
# s[reverseind(s,j)] == reverse(s)[j] == r[j]
r = reverse(s)
j = search(r, c, reverseind(r, i))
j == 0 ? 0 : reverseind(s, j)
end

function _rsearchindex(s, t, i)
if isempty(t)
return 1 <= i <= nextind(s,endof(s)) ? i :
return 1 <= i <= nextind(s, endof(s)) ? i :
throw(BoundsError(s, i))
end
t = RevString(t)
rs = RevString(s)
t = reverse(t)
rs = reverse(s)
l = endof(s)
t1, j2 = next(t,start(t))
t1, j2 = next(t, start(t))
while true
i = rsearch(s,t1,i)
if i == 0 return 0 end
c, ii = next(rs,l-i+1)
i = rsearch(s, t1, i)
i == 0 && return 0
c, ii = next(rs, reverseind(rs, i))
j = j2; k = ii
matched = true
while !done(t,j)
if done(rs,k)
while !done(t, j)
if done(rs, k)
matched = false
break
end
c, k = next(rs,k)
d, j = next(t,j)
c, k = next(rs, k)
d, j = next(t, j)
if c != d
matched = false
break
end
end
if matched
return nextind(s,l-k+1)
end
i = l-ii+1
matched && return nextind(s, reverseind(s, k))
i = reverseind(s, ii)
end
end

Expand Down
58 changes: 16 additions & 42 deletions base/strings/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,20 @@ codeunit(s::AbstractString, i::Integer)
@gc_preserve s unsafe_load(pointer(s, i))
end

"""
ncodeunits(s::AbstractString)

The number of code units in a string. For example, for UTF-8-like data such as
the default `String` type, the number of code units is the number of bytes in
the string, a.k.a. `sizeof(s)`. For a UTF-16 encoded string type, however, the
code unit is `UInt16` so the number of code units is the number of `UInt16`
words in the representation of the string. The expression `codeunit(s, i)` is
valid and safe for precisely the range of `i` values `1:ncodeunits(s)`.

See also: [`codeunit`](@ref).
"""
ncodeunits(s::String) = sizeof(s)

write(io::IO, s::String) =
@gc_preserve s unsafe_write(io, pointer(s), reinterpret(UInt, sizeof(s)))

Expand All @@ -109,8 +123,8 @@ end
function thisind(s::String, i::Integer)
j = Int(i)
j < 1 && return 0
e = endof(s)
j >= e && return e
n = ncodeunits(s)
j > n && return n + 1
@inbounds while j > 0 && is_valid_continuation(codeunit(s,j))
j -= 1
end
Expand Down Expand Up @@ -281,14 +295,6 @@ function first_utf8_byte(ch::Char)
return b
end

function reverseind(s::String, i::Integer)
j = sizeof(s) + 1 - i
@inbounds while is_valid_continuation(codeunit(s, j))
j -= 1
end
return j
end

## overload methods for efficiency ##

isvalid(s::String, i::Integer) =
Expand Down Expand Up @@ -463,38 +469,6 @@ function string(a::Union{String,Char}...)
return out
end

function reverse(s::String)
dat = Vector{UInt8}(s)
n = length(dat)
n <= 1 && return s
buf = StringVector(n)
out = n
pos = 1
@inbounds while out > 0
ch = dat[pos]
if ch > 0xdf
if ch < 0xf0
(out -= 3) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
buf[out + 1], buf[out + 2], buf[out + 3] = ch, dat[pos + 1], dat[pos + 2]
pos += 3
else
(out -= 4) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
buf[out+1], buf[out+2], buf[out+3], buf[out+4] = ch, dat[pos+1], dat[pos+2], dat[pos+3]
pos += 4
end
elseif ch > 0x7f
(out -= 2) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
buf[out + 1], buf[out + 2] = ch, dat[pos + 1]
pos += 2
else
buf[out] = ch
out -= 1
pos += 1
end
end
String(buf)
end

function repeat(s::String, r::Integer)
r < 0 && throw(ArgumentError("can't repeat a string $r times"))
n = sizeof(s)
Expand Down
2 changes: 1 addition & 1 deletion base/strings/strings.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license

include("strings/errors.jl")
include("strings/types.jl")
include("strings/substring.jl")
include("strings/basic.jl")
include("strings/search.jl")
include("strings/util.jl")
Expand Down
Loading