From f1ea3aac8769d99fa09165b89b90910a286e7911 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Wed, 22 Nov 2017 15:39:06 +0000 Subject: [PATCH] remove `RevString`; efficient generic `reverseind` These seem unrelated, but they're actually linked: * If you reverse generic strings by wrapping them in `RevString` then then this generic `reverseind` is incorrect. * In order to have a correct generic `reverseind` one needs to assume that `reverse(s)` returns a string of the same type and encoding as `s` with code points in reverse order; one also needs to assume that the code units encoding each character remain the same when reversed. This is a valid assumption for UTF-8, UTF-16 and (trivially) UTF-32. Reverse string search functions are pretty messed up by this and I've fixed them well enough to work but they may be quite inefficient for long strings now. I'm not going to spend too much time on this since there's other work going on to generalize and unify searching APIs. Close #22611 Close #24613 See also: #10593 #23612 #24103 --- NEWS.md | 11 +++++ base/exports.jl | 1 - base/precompile.jl | 3 -- base/repl/REPLCompletions.jl | 4 +- base/shell.jl | 2 +- base/strings/search.jl | 52 +++++++++++------------ base/strings/string.jl | 40 ------------------ base/strings/strings.jl | 2 +- base/strings/{types.jl => substring.jl} | 55 ++++++++++--------------- base/strings/util.jl | 15 ++++--- stdlib/Test/src/Test.jl | 3 ++ test/replcompletions.jl | 2 +- test/strings/types.jl | 34 +++++++-------- test/unicode/utf8.jl | 18 ++++---- 14 files changed, 96 insertions(+), 146 deletions(-) rename base/strings/{types.jl => substring.jl} (80%) diff --git a/NEWS.md b/NEWS.md index c879f70f2f38f..5c1ea9855a52f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -295,6 +295,12 @@ This section lists changes that do not have deprecation warnings. `AbstractArray` types that specialized broadcasting using the old internal API will need to switch to the new API. ([#20740]) + * The `RevString` type has been removed from the language; `reverse(::String)` returns + a `String` with code points (or fragments thereof) in reverse order. In general, + `reverse(s)` should return a string of the same type and encoding as `s` with code + points in reverse order; any string type overrides `reverse` to return a different + type of string must also override `reverseind` to compute reversed indices correctly. + Library improvements -------------------- @@ -397,6 +403,11 @@ Library improvements The generic definition is constant time but calls `endof(s)` which may be inefficient. Therefore custom string types may want to define direct `ncodeunits` methods. + * `reverseind(s::AbstractString, i::Integer)` now has an efficient generic fallback, so + custom string types do not need to provide their own efficient defintions. The generic + definition relies on `ncodeunits` however, so for optimal performance you may need to + define a custom method for that function. + Compiler/Runtime improvements ----------------------------- diff --git a/base/exports.jl b/base/exports.jl index 3c8186d0faf20..1c79dba92ce8c 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -88,7 +88,6 @@ export Rational, Regex, RegexMatch, - RevString, RoundFromZero, RoundDown, RoundingMode, diff --git a/base/precompile.jl b/base/precompile.jl index 08b1b636f60c2..3f86f1a52f3fd 100644 --- a/base/precompile.jl +++ b/base/precompile.jl @@ -578,9 +578,6 @@ precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.LineEdit.PromptState, precompile(Tuple{typeof(Base.LineEdit.input_string_newlines_aftercursor), Base.LineEdit.PromptState}) precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.REPL.REPLCompletionProvider, Base.LineEdit.PromptState}) precompile(Tuple{getfield(Base, Symbol("#kw##parse")), Array{Any, 1}, typeof(Base.parse), String}) -precompile(Tuple{typeof(Base.isvalid), Base.RevString{String}, Int64}) -precompile(Tuple{typeof(Base.nextind), Base.RevString{String}, Int64}) -precompile(Tuple{typeof(Base.search), Base.RevString{String}, Array{Char, 1}, Int64}) precompile(Tuple{typeof(Base.rsearch), String, Array{Char, 1}, Int64}) precompile(Tuple{getfield(Base.REPLCompletions, Symbol("#kw##find_start_brace")), Array{Any, 1}, typeof(Base.REPLCompletions.find_start_brace), String}) precompile(Tuple{typeof(Core.Inference.isbits), Tuple{Void, Void, Void}}) diff --git a/base/repl/REPLCompletions.jl b/base/repl/REPLCompletions.jl index 2040476af8947..3e5056d613f26 100644 --- a/base/repl/REPLCompletions.jl +++ b/base/repl/REPLCompletions.jl @@ -225,7 +225,7 @@ end # closed start brace from the end of the string. function find_start_brace(s::AbstractString; c_start='(', c_end=')') braces = 0 - r = RevString(s) + r = reverse(s) i = start(r) in_single_quotes = false in_double_quotes = false @@ -259,7 +259,7 @@ function find_start_brace(s::AbstractString; c_start='(', c_end=')') braces == 1 && break end braces != 1 && return 0:-1, -1 - method_name_end = reverseind(r, i) + method_name_end = reverseind(s, i) startind = nextind(s, rsearch(s, non_identifier_chars, method_name_end)) return (startind:endof(s), method_name_end) end diff --git a/base/shell.jl b/base/shell.jl index b8923ced43d04..72ffd23d9a944 100644 --- a/base/shell.jl +++ b/base/shell.jl @@ -14,7 +14,7 @@ function shell_parse(str::AbstractString, interpolate::Bool=true; special::AbstractString="") s = lstrip(str) # strips the end but respects the space when the string ends with "\\ " - r = RevString(s) + r = reverse(s) i = start(r) c_old = nothing while !done(r,i) diff --git a/base/strings/search.jl b/base/strings/search.jl index 23f813ea28b26..43e880a26b9e5 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -194,12 +194,6 @@ end search(s::AbstractString, t::AbstractString, i::Integer=start(s)) = _search(s, t, i) search(s::ByteArray, t::ByteArray, i::Integer=start(s)) = _search(s, t, i) -function rsearch(s::AbstractString, c::Chars) - j = search(RevString(s), c) - j == 0 && return 0 - endof(s)-j+1 -end - """ rsearch(s::AbstractString, chars::Chars, [start::Integer]) @@ -212,44 +206,50 @@ julia> rsearch("aaabbb","b") 6:6 ``` """ -function rsearch(s::AbstractString, c::Chars, i::Integer) - e = endof(s) - j = search(RevString(s), c, e-i+1) - j == 0 && return 0 - e-j+1 +function rsearch(s::AbstractString, c::Chars, i::Integer=start(s)) + if i < 1 + return i == 0 ? 0 : throw(BoundsError(s, i)) + end + n = ncodeunits(s) + if i > n + return i == n+1 ? 0 : throw(BoundsError(s, i)) + end + # r[reverseind(r,i)] == reverse(r)[i] == s[i] + # s[reverseind(s,j)] == reverse(s)[j] == r[j] + r = reverse(s) + j = search(r, c, reverseind(r, i)) + j == 0 ? 0 : reverseind(s, j) end function _rsearchindex(s, t, i) if isempty(t) - return 1 <= i <= nextind(s,endof(s)) ? i : + return 1 <= i <= nextind(s, endof(s)) ? i : throw(BoundsError(s, i)) end - t = RevString(t) - rs = RevString(s) + t = reverse(t) + rs = reverse(s) l = endof(s) - t1, j2 = next(t,start(t)) + t1, j2 = next(t, start(t)) while true - i = rsearch(s,t1,i) - if i == 0 return 0 end - c, ii = next(rs,l-i+1) + i = rsearch(s, t1, i) + i == 0 && return 0 + c, ii = next(rs, reverseind(rs, i)) j = j2; k = ii matched = true - while !done(t,j) - if done(rs,k) + while !done(t, j) + if done(rs, k) matched = false break end - c, k = next(rs,k) - d, j = next(t,j) + c, k = next(rs, k) + d, j = next(t, j) if c != d matched = false break end end - if matched - return nextind(s,l-k+1) - end - i = l-ii+1 + matched && return nextind(s, reverseind(s, k)) + i = reverseind(s, ii) end end diff --git a/base/strings/string.jl b/base/strings/string.jl index 1c8cae4501b18..e66f876a5f77d 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -295,14 +295,6 @@ function first_utf8_byte(ch::Char) return b end -function reverseind(s::String, i::Integer) - j = sizeof(s) + 1 - i - @inbounds while is_valid_continuation(codeunit(s, j)) - j -= 1 - end - return j -end - ## overload methods for efficiency ## isvalid(s::String, i::Integer) = @@ -477,38 +469,6 @@ function string(a::Union{String,Char}...) return out end -function reverse(s::String) - dat = Vector{UInt8}(s) - n = length(dat) - n <= 1 && return s - buf = StringVector(n) - out = n - pos = 1 - @inbounds while out > 0 - ch = dat[pos] - if ch > 0xdf - if ch < 0xf0 - (out -= 3) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) - buf[out + 1], buf[out + 2], buf[out + 3] = ch, dat[pos + 1], dat[pos + 2] - pos += 3 - else - (out -= 4) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) - buf[out+1], buf[out+2], buf[out+3], buf[out+4] = ch, dat[pos+1], dat[pos+2], dat[pos+3] - pos += 4 - end - elseif ch > 0x7f - (out -= 2) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) - buf[out + 1], buf[out + 2] = ch, dat[pos + 1] - pos += 2 - else - buf[out] = ch - out -= 1 - pos += 1 - end - end - String(buf) -end - function repeat(s::String, r::Integer) r < 0 && throw(ArgumentError("can't repeat a string $r times")) n = sizeof(s) diff --git a/base/strings/strings.jl b/base/strings/strings.jl index 0acad00f79ca7..1e442a09e406f 100644 --- a/base/strings/strings.jl +++ b/base/strings/strings.jl @@ -1,7 +1,7 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license include("strings/errors.jl") -include("strings/types.jl") +include("strings/substring.jl") include("strings/basic.jl") include("strings/search.jl") include("strings/util.jl") diff --git a/base/strings/types.jl b/base/strings/substring.jl similarity index 80% rename from base/strings/types.jl rename to base/strings/substring.jl index e4474e5cb7edc..d1bf33e4123fb 100644 --- a/base/strings/types.jl +++ b/base/strings/substring.jl @@ -1,9 +1,5 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -# SubString and RevString types - -## substrings reference original strings ## - """ SubString(s::AbstractString, i::Integer, j::Integer=endof(s)) SubString(s::AbstractString, r::UnitRange{<:Integer}) @@ -51,6 +47,9 @@ end SubString(s::AbstractString) = SubString(s, 1, endof(s)) SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, endof(s)) +convert(::Type{SubString{S}}, s::AbstractString) where {S<:AbstractString} = + SubString(convert(S, s)) + String(p::SubString{String}) = unsafe_string(pointer(p.string, p.offset+1), nextind(p, p.endof)-1) @@ -123,32 +122,18 @@ function unsafe_convert(::Type{Ptr{R}}, s::SubString{String}) where R<:Union{Int convert(Ptr{R}, pointer(s.string)) + s.offset end -## reversed strings without data movement ## - -struct RevString{T<:AbstractString} <: AbstractString - string::T -end - -endof(s::RevString) = endof(s.string) -length(s::RevString) = length(s.string) -sizeof(s::RevString) = sizeof(s.string) - -function next(s::RevString, i::Int) - n = endof(s); j = n-i+1 - (s.string[j], n-prevind(s.string,j)+1) -end - """ reverse(s::AbstractString) -> AbstractString -Reverses a string. - -Technically, this function reverses the codepoints in a string, and its +Reverses a string. Technically, this function reverses the codepoints in a string and its main utility is for reversed-order string processing, especially for reversed -regular-expression searches. See also [`reverseind`](@ref) to convert indices -in `s` to indices in `reverse(s)` and vice-versa, and [`graphemes`](@ref) -to operate on user-visible "characters" (graphemes) rather than codepoints. -See also [`Iterators.reverse`](@ref) for reverse-order iteration without making a copy. +regular-expression searches. See also [`reverseind`](@ref) to convert indices in `s` to +indices in `reverse(s)` and vice-versa, and [`graphemes`](@ref) to operate on user-visible +"characters" (graphemes) rather than codepoints. See also [`Iterators.reverse`](@ref) for +reverse-order iteration without making a copy. Custom string types must implement the +`reverse` function themselves and should typically return a string with the same type +and encoding. If they return a string with a different encoding, they must also override +`reverseind` for that string type to satisfy `s[reverseind(s,i)] == reverse(s)[i]`. # Examples ```jldoctest @@ -162,10 +147,15 @@ julia> join(reverse(collect(graphemes("ax̂e")))) # reverses graphemes "ex̂a" ``` """ -reverse(s::AbstractString) = RevString(s) -reverse(s::RevString) = s.string - -## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)] +function reverse(s::Union{String,SubString{String}})::String + sprint() do io + i, j = start(s), endof(s) + while i ≤ j + c, j = s[j], prevind(s, j) + write(io, c) + end + end +end """ reverseind(v, i) @@ -185,10 +175,7 @@ julia> for i in 1:length(r) Julia ``` """ -reverseind(s::AbstractString, i) = chr2ind(s, length(s) + 1 - ind2chr(reverse(s), i)) -reverseind(s::RevString, i::Integer) = endof(s) - i + 1 -reverseind(s::SubString{String}, i::Integer) = - reverseind(s.string, nextind(s.string, endof(s.string))-s.offset-s.endof+i-1) - s.offset +reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1) """ repeat(s::AbstractString, r::Integer) diff --git a/base/strings/util.jl b/base/strings/util.jl index c066c188b7e27..6ccba02aed662 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -190,16 +190,15 @@ julia> rstrip(a) ``` """ function rstrip(s::AbstractString, chars::Chars=_default_delims) - r = RevString(s) - i = start(r) - while !done(r,i) - c, j = next(r,i) - if !(c in chars) - return SubString(s, 1, endof(s)-i+1) - end + a = start(s) + i = endof(s) + while a ≤ i + c = s[i] + j = prevind(s, i) + c in chars || return SubString(s, 1:i) i = j end - SubString(s, 1, 0) + SubString(s, a, a-1) end """ diff --git a/stdlib/Test/src/Test.jl b/stdlib/Test/src/Test.jl index 461fabab56545..61c316110ca08 100644 --- a/stdlib/Test/src/Test.jl +++ b/stdlib/Test/src/Test.jl @@ -1384,6 +1384,9 @@ struct GenericString <: AbstractString end Base.endof(s::GenericString) = endof(s.string) Base.next(s::GenericString, i::Int) = next(s.string, i) +Base.reverse(s::GenericString) = GenericString(reverse(s.string)) +Base.reverse(s::SubString{GenericString}) = + GenericString(typeof(s.string)(reverse(String(s)))) """ The `GenericSet` can be used to test generic set APIs that program to diff --git a/test/replcompletions.jl b/test/replcompletions.jl index a45fb05c7d28e..9698b1622fdf9 100644 --- a/test/replcompletions.jl +++ b/test/replcompletions.jl @@ -3,7 +3,7 @@ using Base.REPLCompletions let ex = quote - module CompletionFoo + module CompletionFoo mutable struct Test_y yy end diff --git a/test/strings/types.jl b/test/strings/types.jl index a9ce276c02961..12dd75a1bd421 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ -1,6 +1,6 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -## SubString, RevString and Cstring tests ## +## SubString and Cstring tests ## ## SubString tests ## u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" @@ -207,31 +207,13 @@ let s = "|η(α)-ϕ(κ)| < ε" @test length(SubString(s,4,11))==length(s[4:11]) end -## Reverse strings ## - -let rs = RevString("foobar") - @test length(rs) == 6 - @test sizeof(rs) == 6 - @test isascii(rs) -end - -# issue #4586 -@test rsplit(RevString("ailuj"),'l') == ["ju","ia"] -@test parse(Float64,RevString("64")) === 46.0 - -# reverseind -for T in (String, GenericString) +@testset "reverseind" for T in (String, SubString, GenericString) for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1") for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4") for c in ('X', 'δ', '\U0001d6a5') s = convert(T, string(prefix, c, suffix)) r = reverse(s) ri = search(r, c) - @test r == RevString(s) - @test c == s[reverseind(s, ri)] == r[ri] - s = RevString(s) - r = reverse(s) - ri = search(r, c) @test c == s[reverseind(s, ri)] == r[ri] s = convert(T, string(prefix, prefix, c, suffix, suffix)) pre = convert(T, prefix) @@ -244,6 +226,18 @@ for T in (String, GenericString) end end +@testset "reverseind of empty strings" begin + for s in ("", + SubString("", 1, 0), + SubString("ab", 1, 0), + SubString("ab", 2, 1), + SubString("ab", 3, 2), + GenericString("")) + @test reverseind(s, 0) == 1 + @test reverseind(s, 1) == 0 + end +end + ## Cstring tests ## # issue #13974: comparison against pointers diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl index bdb9664fc26fb..a9db6316d2fa9 100644 --- a/test/unicode/utf8.jl +++ b/test/unicode/utf8.jl @@ -27,16 +27,16 @@ end @test reverse("a") == "a" @test reverse("abc") == "cba" @test reverse("xyz\uff\u800\uffff\U10ffff") == "\U10ffff\uffff\u800\uffzyx" - for str in [ - b"xyz\xc1", - b"xyz\xd0", - b"xyz\xe0", - b"xyz\xed\x80", - b"xyz\xf0", - b"xyz\xf0\x80", - b"xyz\xf0\x80\x80" + for (s, r) in [ + b"xyz\xc1" => b"\xc1zyx", + b"xyz\xd0" => b"\xd0zyx", + b"xyz\xe0" => b"\xe0zyx", + b"xyz\xed\x80" => b"\xed\x80zyx", + b"xyz\xf0" => b"\xf0zyx", + b"xyz\xf0\x80" => b"\xf0\x80zyx", + b"xyz\xf0\x80\x80" => b"\xf0\x80\x80zyx", ] - @test_throws UnicodeError reverse(String(str)) + @test_broken reverse(String(s)) == String(r) end end