SubString{UTF8String} and UTF8String have different slicing behaviour #7811

ivarne · 2014-08-01T19:58:50Z

julia> a = "æøå"
"æøå"

julia> a[2:end]
"øå"

julia> b = SubString(a,1)
"æøå"

julia> b[2:end]
ERROR: invalid SubString indexes
 in getindex at ./string.jl:657

I'm not sure what the correct behaviour should be, but having them different seems wrong. Some relevant discussion in #5446

The text was updated successfully, but these errors were encountered:

stevengj · 2014-08-02T21:14:35Z

a[2:end] is wrong, because 2 is not the index of the beginning of a UTF8 character; you really want a[3:end] == a[chr2ind(a,2):end].

ivarne · 2014-08-02T21:50:30Z

I know the index 2 is invalid. The problem is that the behavior is different for the erroneous index, and I can't see why they should be.

JeffBezanson · 2014-08-02T21:51:02Z

Range indexing for UTF8String adjusts the start index to the next valid index, while SubString just throws an error in this case. The "adjusting" behavior is probably not necessary.

ivarne · 2014-08-02T22:02:14Z

Changing range indexing for UTF8String (much better word than slicing, by the way) is a breaking change, that will cause problems. Added appropriate labels. Seems like this should wait for 0.4

stevengj · 2014-08-02T22:36:14Z

I suspect that most usages that rely on the auto-adjustment are actually bugs (people thinking that character index == string index). So I'm fine with UTF8String throwing an error instead.

catawbasam · 2014-08-03T13:08:45Z

+1 for throwing the error. The auto-adjustment makes it harder to figure out how UTF8String actually works.

vtjnash · 2015-04-17T04:56:52Z

I've been working on the following patch. any thoughts on concerns on this change? I've been focusing on validation of the first index only. it may be worth validating the second index too, however.

commit 215e3e02023e6400e5a8ab2a62cd0c8cdb359f09
Author: Jameson Nash <[email protected]>
Date:   Thu Apr 16 21:15:12 2015 -0400

    throw ArgumentError when accessing an invalid UTF-8 indice, rather than attempting to adjust it. fix #7811

diff --git a/base/ascii.jl b/base/ascii.jl
index a9c905f..3622e3c 100644
--- a/base/ascii.jl
+++ b/base/ascii.jl
@@ -17,7 +17,11 @@ sizeof(s::ASCIIString) = sizeof(s.data)
 getindex(s::ASCIIString, r::Vector) = ASCIIString(getindex(s.data,r))
 getindex(s::ASCIIString, r::UnitRange{Int}) = ASCIIString(getindex(s.data,r))
 getindex(s::ASCIIString, indx::AbstractVector{Int}) = ASCIIString(s.data[indx])
-search(s::ASCIIString, c::Char, i::Integer) = c < Char(0x80) ? search(s.data,c%UInt8,i) : 0
+function search(s::ASCIIString, c::Char, i::Integer)
+    i == sizeof(s) + 1 && return 0
+    (i < 1 || i > sizeof(s)) && throw(BoundsError(s, i))
+    return c < Char(0x80) ? search(s.data,c%UInt8,i) : 0
+end
 rsearch(s::ASCIIString, c::Char, i::Integer) = c < Char(0x80) ? rsearch(s.data,c%UInt8,i) : 0

 function string(c::ASCIIString...)
diff --git a/base/string.jl b/base/string.jl
index 61b6567..7909821 100644
--- a/base/string.jl
+++ b/base/string.jl
@@ -185,10 +185,9 @@ function search(s::AbstractString, c::Chars, i::Integer)
         return 1 <= i <= nextind(s,endof(s)) ? i :
                throw(BoundsError(s, i))
     end
-    if i < 1
+    if i < 1 || i > nextind(s,endof(s))
         throw(BoundsError(s, i))
     end
-    i = nextind(s,i-1)
     while !done(s,i)
         d, j = next(s,i)
         if d in c
@@ -632,7 +631,7 @@ prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset

 convert{T<:AbstractString}(::Type{SubString{T}}, s::T) = SubString(s, 1, endof(s))

-bytestring{T <: ByteString}(p::SubString{T}) = bytestring(pointer(p.string.data)+p.offset, nextind(p, p.endof)-1)
+bytestring{T <: ByteString}(p::SubString{T}) = bytestring(p.string.data[1+p.offset:p.offset+nextind(p, p.endof)-1])

 function serialize{T}(s, ss::SubString{T})
     # avoid saving a copy of the parent string, keeping the type of ss
diff --git a/base/utf8.jl b/base/utf8.jl
index c254a50..8eea581 100644
--- a/base/utf8.jl
+++ b/base/utf8.jl
@@ -54,16 +54,7 @@ function next(s::UTF8String, i::Int)
     d = s.data
     b = d[i]
     if !is_utf8_start(b)
-        j = i-1
-        while 0 < j && !is_utf8_start(d[j])
-            j -= 1
-        end
-        if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
-            # b is a continuation byte of a valid UTF-8 character
-            throw(ArgumentError("invalid UTF-8 character index"))
-        end
-        # move past 1 byte in case the data is actually Latin-1
-        return '\ufffd', i+1
+        throw(ArgumentError("invalid UTF-8 character index"))
     end
     trailing = utf8_trailing[b+1]
     if length(d) < i + trailing
@@ -111,8 +102,11 @@ function getindex(s::UTF8String, r::UnitRange{Int})
     isempty(r) && return empty_utf8
     i, j = first(r), last(r)
     d = s.data
+    if i < 1 || i > length(s.data)
+        throw(BoundsError(s, i))
+    end
     if !is_utf8_start(d[i])
-        i = nextind(s,i)
+        throw(ArgumentError("invalid UTF-8 character index"))
     end
     if j > length(d)
         throw(BoundsError())
@@ -122,6 +116,13 @@ function getindex(s::UTF8String, r::UnitRange{Int})
 end

 function search(s::UTF8String, c::Char, i::Integer)
+    if i < 1 || i > sizeof(s)
+        i == sizeof(s) + 1 && return 0
+        throw(BoundsError(s, i))
+    end
+    if !is_utf8_start(s.data[i])
+        throw(ArgumentError("invalid UTF-8 character index"))
+    end
     c < Char(0x80) && return search(s.data, c%UInt8, i)
     while true
         i = search(s.data, first_utf8_byte(c), i)
diff --git a/test/strings.jl b/test/strings.jl
index 3734a2e..f768def 100644
--- a/test/strings.jl
+++ b/test/strings.jl
@@ -256,6 +256,8 @@ u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε"

 # ascii search
 for str in [astr, Base.GenericString(astr)]
+    @test_throws BoundsError search(str, 'z', 0)
+    @test_throws BoundsError search(str, '∀', 0)
     @test search(str, 'x') == 0
     @test search(str, '\0') == 0
     @test search(str, '\u80') == 0
@@ -269,6 +271,8 @@ for str in [astr, Base.GenericString(astr)]
     @test search(str, ',', 7) == 0
     @test search(str, '\n') == 14
     @test search(str, '\n', 15) == 0
+    @test_throws BoundsError search(str, 'ε', nextind(str,endof(str))+1)
+    @test_throws BoundsError search(str, 'a', nextind(str,endof(str))+1)
 end

 # ascii rsearch
@@ -291,23 +295,32 @@ end

 # utf-8 search
 for str in (u8str, Base.GenericString(u8str))
+    @test_throws BoundsError search(str, 'z', 0)
+    @test_throws BoundsError search(str, '∀', 0)
     @test search(str, 'z') == 0
     @test search(str, '\0') == 0
     @test search(str, '\u80') == 0
     @test search(str, '∄') == 0
     @test search(str, '∀') == 1
-    @test search(str, '∀', 2) == 0
+    @test_throws ArgumentError search(str, '∀', 2)
+    @test search(str, '∀', 4) == 0
     @test search(str, '∃') == 13
-    @test search(str, '∃', 14) == 0
+    @test_throws ArgumentError search(str, '∃', 15)
+    @test search(str, '∃', 16) == 0
     @test search(str, 'x') == 26
     @test search(str, 'x', 27) == 43
     @test search(str, 'x', 44) == 0
     @test search(str, 'δ') == 17
-    @test search(str, 'δ', 18) == 33
-    @test search(str, 'δ', 34) == 0
+    @test_throws ArgumentError search(str, 'δ', 18)
+    @test search(str, 'δ', nextind(str,17)) == 33
+    @test search(str, 'δ', nextind(str,33)) == 0
     @test search(str, 'ε') == 5
-    @test search(str, 'ε', 6) == 54
-    @test search(str, 'ε', 55) == 0
+    @test search(str, 'ε', nextind(str,5)) == 54
+    @test search(str, 'ε', nextind(str,54)) == 0
+    @test search(str, 'ε', nextind(str,endof(str))) == 0
+    @test search(str, 'a', nextind(str,endof(str))) == 0
+    @test_throws BoundsError search(str, 'ε', nextind(str,endof(str))+1)
+    @test_throws BoundsError search(str, 'a', nextind(str,endof(str))+1)
 end

 # utf-8 rsearch

there were a number of other inconsistencies I noted and attempted fixed in this patch, for example, with search:

julia> search("asdf",'a',0)
ERROR: BoundsError: attempt to access 4-element Array{UInt8,1}:
 0x61
 0x73
 0x64
 0x66
  at index [0]
 in search at ./string.jl:1654
 in search at no file


julia> search("asdf",'α',0)
0

I tried to fix many of these also, making this patch much larger than I originally expected, so it may be time to turn this into a PR...

pao · 2015-04-17T12:46:38Z

cf #9297 cc @nalimilan?

…an attempting to adjust it. fix #7811

vtjnash · 2015-04-26T06:11:29Z

i turned this into a branch. any concerns before I go ahead and merge this?

ScottPJones · 2015-05-22T14:03:43Z

For what it's worth, I think this should definitely go into 0.4

…an attempting to adjust it (in getindex, next, search). fix #7811

…n attempting to adjust it (in getindex, next, search). fix #7811

ivarne added the bug label Aug 2, 2014

JeffBezanson changed the title ~~SubString{UTF8String} and UTF8String has different slicing behaviour~~ SubString{UTF8String} and UTF8String have different slicing behaviour Aug 2, 2014

ivarne added breaking labels Aug 2, 2014

ivarne mentioned this issue Dec 1, 2014

Path with space completion fix for REPL #8838

Merged

JeffBezanson added this to the 0.4 milestone Dec 4, 2014

ivarne mentioned this issue Dec 7, 2014

add graphemes(s) function to iterate over string graphemes #9261

Merged

ivarne mentioned this issue Dec 15, 2014

fix repl rsearch with one-char query #9352

Merged

JeffBezanson modified the milestones: 0.4, 0.4.1 Feb 25, 2015

vtjnash added a commit that referenced this issue Apr 26, 2015

throw ArgumentError when accessing an invalid UTF-8 indice, rather th…

de8f898

…an attempting to adjust it. fix #7811

tkelman mentioned this issue May 24, 2015

throw ArgumentError when accessing an invalid UTF-8 indice, rather th… #11423

Merged

vtjnash added a commit that referenced this issue Jun 6, 2015

throw ArgumentError when accessing an invalid UTF-8 indice, rather th…

6bd2aeb

…an attempting to adjust it (in getindex, next, search). fix #7811

vtjnash added a commit that referenced this issue Jun 6, 2015

throw UnicodeError when accessing an invalid UTF-8 indice, rather tha…

3e53553

…n attempting to adjust it (in getindex, next, search). fix #7811

vtjnash closed this as completed in #11423 Jun 10, 2015

vtjnash mentioned this issue Mar 26, 2016

SubString reports out-of-bounds error for empty ranges #15624

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

SubString{UTF8String} and UTF8String have different slicing behaviour #7811

SubString{UTF8String} and UTF8String have different slicing behaviour #7811

ivarne commented Aug 1, 2014

stevengj commented Aug 2, 2014

ivarne commented Aug 2, 2014

JeffBezanson commented Aug 2, 2014

ivarne commented Aug 2, 2014

stevengj commented Aug 2, 2014

catawbasam commented Aug 3, 2014

vtjnash commented Apr 17, 2015

pao commented Apr 17, 2015

vtjnash commented Apr 26, 2015

ScottPJones commented May 22, 2015

SubString{UTF8String} and UTF8String have different slicing behaviour #7811

SubString{UTF8String} and UTF8String have different slicing behaviour #7811

Comments

ivarne commented Aug 1, 2014

stevengj commented Aug 2, 2014

ivarne commented Aug 2, 2014

JeffBezanson commented Aug 2, 2014

ivarne commented Aug 2, 2014

stevengj commented Aug 2, 2014

catawbasam commented Aug 3, 2014

vtjnash commented Apr 17, 2015

pao commented Apr 17, 2015

vtjnash commented Apr 26, 2015

ScottPJones commented May 22, 2015