From 4025c5267cc12de01575dae3a3f170c63c8c0bd9 Mon Sep 17 00:00:00 2001 From: Jan Zajic Date: Sat, 9 Nov 2019 21:08:36 +0100 Subject: [PATCH 01/10] add missing documentation to String methods --- src/string.cr | 222 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 204 insertions(+), 18 deletions(-) diff --git a/src/string.cr b/src/string.cr index 673bd0cbe3dc..dac1b98bf53f 100644 --- a/src/string.cr +++ b/src/string.cr @@ -725,10 +725,12 @@ class String end end - # Returns the `Char` at the given *index*, or raises `IndexError` if out of bounds. + # Returns the `Char` at the given *index*. # # Negative indices can be used to start counting from the end of the string. # + # Raises `IndexError` if the *index* is out of range. + # # ``` # "hello"[0] # => 'h' # "hello"[1] # => 'e' @@ -849,11 +851,32 @@ class String self[regex, group]?.not_nil! end - def char_at(index : Int) + # Returns the `Char` at the given *index*. + # + # Negative indices can be used to start counting from the end of the string. + # + # Raises `IndexError` if the *index* is out of range. + # + # ``` + # "hello".char_at(0) # => 'h' + # "hello".char_at(1) # => 'e' + # "hello".char_at(-1) # => 'o' + # "hello".char_at(-2) # => 'l' + # "hello".char_at(5) # raises IndexError + # ``` + def char_at(index : Int) : Char char_at(index) { raise IndexError.new } end - def char_at(index : Int) + # Returns the `Char` at the given *index*, or yields if out of bounds. + # + # Negative indices can be used to start counting from the end of the string. + # + # ``` + # "hello".char_at(4) { 'x' } # => 'o' + # "hello".char_at(5) { 'x' } # => 'x' + # ``` + def char_at(index : Int, &) if ascii_only? byte = byte_at?(index) if byte @@ -874,11 +897,42 @@ class String end end - def byte_slice(start : Int, count : Int) + # Returns a new string consisted of *count* bytes starting at *start* byte. + # + # The *start* argument can be negative to start counting + # from the end of the string. 
+ # If `count` is bigger than number of bytes from *start* to `bytelen`, + # only remaining bytes are returned. + # + # Be careful when working with multibyte characters - they can be splitted, + # which may lead to invalid UTF-8 values. These, + # when asked as chars, will use the unicode replacement �. + # + # Raises `IndexError` if the *start* index is out of range. + # + # Raises `ArgumentError` if *count* is negative. + # + # ``` + # "hello".byte_slice(0, 2) # => "he" + # "hello".byte_slice(0, 100) # => "hello" + # "hello".byte_slice(-2, 3) # => "he" + # "hello".byte_slice(-2, 5) # => "he" + # "hello".byte_slice(-2, 5) # => "he" + # "¥hello".byte_slice(0, 2) # => "¥" + # "¥hello".byte_slice(2, 2) # => "he" + # "¥hello".byte_slice(0, 1) # => "�" + # "¥hello".byte_slice(1, 1) # => "�" + # "¥hello".byte_slice(1, 2) # => "�h" + # "hello".byte_slice(6, 2) # raises IndexError + # "hello".byte_slice(-6, 2) # raises IndexError + # "hello".byte_slice(0, -2) # raises ArgumentError + # ``` + def byte_slice(start : Int, count : Int) : String byte_slice?(start, count) || raise IndexError.new end - def byte_slice?(start : Int, count : Int) + # Like `byte_slice(Int, Int)` but returns `Nil` if the *start* index is out of range. + def byte_slice?(start : Int, count : Int) : String | Nil raise ArgumentError.new "Negative count" if count < 0 start += bytesize if start < 0 @@ -903,19 +957,77 @@ class String byte_slice start, bytesize - start end - def codepoint_at(index) + # Returns a substring starting from the *start* byte. + # + # The *start* argument can be negative to start counting + # from the end of the string. + # + # Be careful when working with multibyte characters - they can be splitted + # which may lead to unexpected result. + # + # Raises `IndexError` if *start* index is out of range. 
+ # + # ``` + # "hello".byte_slice(0) # => "hello" + # "hello".byte_slice(2) # => "llo" + # "hello".byte_slice(-2) # => "lo" + # "¥hello".byte_slice(2) # => "hello" + # "¥hello".byte_slice(1) # => "�hello" + # "hello".byte_slice(6) # raises IndexError + # "hello".byte_slice(-6) # raises IndexError + # ``` + # Returns the codepoint of `Char` at the given *index*. + # + # Raises `IndexError` if the *index* is out of range. + # + # See also: `Char#ord`. + # + # ``` + # "hello".codepoint_at(0) # => 104 + # "hello".codepoint_at(-1) # => 111 + # "hello".codepoint_at(5) # raises IndexError + # ``` + def codepoint_at(index) : Int32 char_at(index).ord end - def byte_at(index) + # Returns the byte at the given *index*. + # + # Raises `IndexError` if the *index* is out of range. + # + # ``` + # "¥hello".byte_at(0) # => 194 + # "¥hello".byte_at(1) # => 165 + # "¥hello".byte_at(2) # => 104 + # "¥hello".byte_at(-1) # => 111 + # "¥hello".byte_at(6) # => 111 + # "¥hello".byte_at(7) # raises IndexError + # ``` + def byte_at(index) : UInt8 byte_at(index) { raise IndexError.new } end - def byte_at?(index) + # Returns the byte at the given *index*, or nil if out of bounds. + # + # ``` + # "¥hello".byte_at(0) # => 194 + # "¥hello".byte_at(1) # => 165 + # "¥hello".byte_at(2) # => 104 + # "¥hello".byte_at(-1) # => 111 + # "¥hello".byte_at(6) # => 111 + # "¥hello".byte_at(7) # => nil + # ``` + def byte_at?(index) : UInt8 | Nil byte_at(index) { nil } end - def byte_at(index) + # Returns the byte at the given *index*, or yield if out of bounds. + # + # ``` + # "¥hello".byte_at(6) { 0 } # => 111 + # "¥hello".byte_at(7) { 0 } # => 0 + # ``` + def byte_at(index, &) index += bytesize if index < 0 if 0 <= index < bytesize to_unsafe[index] @@ -2418,7 +2530,10 @@ class String self if !blank? end - def ==(other : self) + # Returns `true` if this string is the same as other. + # Comparison is done byte-per-byte: if a byte is less then the other corresponding + # byte, `false` is returned and so on. 
+ def ==(other : self) : Bool return true if same?(other) return false unless bytesize == other.bytesize to_unsafe.memcmp(other.to_unsafe, bytesize) == 0 @@ -2935,6 +3050,15 @@ class String {pre, mid, post} end + # Returns the index of *byte* in the string, or `nil` if the byte is not present. + # If *offset* is present, it defines the position to start the search. + # + # ``` + # "Hello, World".byte_index(0x6f) # => 4 + # "Hello, World".byte_index(0x5a) # => nil + # "Hello, World".byte_index(0x6f, 5) # => 8 + # "💣".byte_index(0xA3) # => 3 + # ``` def byte_index(byte : Int, offset = 0) offset.upto(bytesize - 1) do |i| if to_unsafe[i] == byte @@ -2944,6 +3068,12 @@ class String nil end + # Returns the byte index of *search* in the string, or `nil` if the string is not present. + # If *offset* is present, it defines the position to start the search. + # + # ``` + # "¥hello".byte_index("hello") # => 2 + # ``` def byte_index(search : String, offset = 0) offset += bytesize if offset < 0 return if offset < 0 @@ -4219,12 +4349,25 @@ class String io << '}' if char.ord > 0xFFFF end - def starts_with?(str : String) + # Returns true if this string starts with the given *str*, otherwise `false`. + # + # ``` + # "hello".starts_with?("h") # => true + # "hello".starts_with?("he") # => true + # "hello".starts_with?("hu") # => false + # ``` + def starts_with?(str : String) : Bool return false if str.bytesize > bytesize to_unsafe.memcmp(str.to_unsafe, str.bytesize) == 0 end - def starts_with?(char : Char) + # Returns `true` if this string starts with the given *char*, otherwise `false`. + # + # ``` + # "hello".starts_with?('h') # => true + # "hello".starts_with?('e') # => false + # ``` + def starts_with?(char : Char) : Bool each_char do |c| return c == char end @@ -4232,16 +4375,39 @@ class String false end - def starts_with?(re : Regex) + # Returns true if this string starts with the given *re* regular expression, otherwise `false`. 
+ # + # ``` + # "22hello".starts_with?(/[0-9]/) # => true + # "22hello".starts_with?(/[a-z]/) # => false + # "h22".starts_with?(/[a-z]/) # => true + # "h22".starts_with?(/[A-Z]/) # => true + # "h22".starts_with?(/[a-z]{2}/) # => false + # "hh22".starts_with?(/[a-z]{2}/) # => true + # ``` + def starts_with?(re : Regex) : Bool !!($~ = re.match_at_byte_index(self, 0, Regex::Options::ANCHORED)) end - def ends_with?(str : String) + # Returns true if this string ends with the given *str*, otherwise `false`. + # + # ``` + # "hello".ends_with?("o") # => true + # "hello".ends_with?("lo") # => true + # "hello".ends_with?("ll") # => false + # ``` + def ends_with?(str : String) : Bool return false if str.bytesize > bytesize (to_unsafe + bytesize - str.bytesize).memcmp(str.to_unsafe, str.bytesize) == 0 end - def ends_with?(char : Char) + # Returns true if this string ends with the given *char*, otherwise `false`. + # + # ``` + # "hello".ends_with?('o') # => true + # "hello".ends_with?('l') # => false + # ``` + def ends_with?(char : Char) : Bool return false unless bytesize > 0 if char.ascii? || ascii_only? @@ -4258,7 +4424,17 @@ class String true end - def ends_with?(re : Regex) + # Returns true if this string ends with the given *re* regular expression, otherwise `false`. + # + # ``` + # "22hello".ends_with?(/[0-9]/) # => false + # "22hello".ends_with?(/[a-z]/) # => true + # "22h".ends_with?(/[a-z]/) # => true + # "22h".ends_with?(/[A-Z]/) # => true + # "22h".ends_with?(/[a-z]{2}/) # => false + # "22hh".ends_with?(/[a-z]{2}/) # => true + # ``` + def ends_with?(re : Regex) : Bool !!($~ = /#{re}\z/.match(self)) end @@ -4397,18 +4573,22 @@ class String char_index end - def clone + # Returns `self` + def clone : String self end - def dup + # Returns `self` + def dup : String self end + # Returns `self` def to_s : String self end + # Appends `self` characters to the given IO object. 
def to_s(io : IO) : Nil io.write_utf8(to_slice) end @@ -4425,10 +4605,16 @@ class String pointerof(@c) end + # Returns *count* of underlying bytes of this String starting at given *byte_offset* in an **unsafe** way. + # + # The returned slice is read-only. def unsafe_byte_slice(byte_offset, count) Slice.new(to_unsafe + byte_offset, count, read_only: true) end + # Returns the underlying bytes of this String starting at given *byte_offset* in an **unsafe** way. + # + # The returned slice is read-only. def unsafe_byte_slice(byte_offset) Slice.new(to_unsafe + byte_offset, bytesize - byte_offset, read_only: true) end From 4bfd993c3039fca23176896eac6e21688ce2d8a0 Mon Sep 17 00:00:00 2001 From: Jan Zajic Date: Sat, 9 Nov 2019 21:12:23 +0100 Subject: [PATCH 02/10] change byte_slice(Int) ArgumentError->IndexError when out of range --- spec/std/string_spec.cr | 12 ++++++++++++ src/string.cr | 6 ++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr index 8ace564ab67f..ececca3c7766 100644 --- a/spec/std/string_spec.cr +++ b/spec/std/string_spec.cr @@ -229,6 +229,18 @@ describe "String" do it "gets byte_slice with negative index" do "hello".byte_slice(-2, 3).should eq("lo") end + + it "gets byte_slice(Int) with with start out of bounds" do + expect_raises(IndexError) do + "hello".byte_slice(10) + end + end + + it "gets byte_slice(Int) with with start out of bounds" do + expect_raises(IndexError) do + "hello".byte_slice(-10) + end + end end describe "to_i" do diff --git a/src/string.cr b/src/string.cr index dac1b98bf53f..7dd2e1b18c86 100644 --- a/src/string.cr +++ b/src/string.cr @@ -953,8 +953,10 @@ class String end end - def byte_slice(start : Int) - byte_slice start, bytesize - start + def byte_slice(start : Int) : String + count = bytesize - start + raise IndexError.new if start > 0 && count < 0 + byte_slice start, count end # Returns a substring starting from the *start* byte. 
From 352373be4c8d6b9abf859c2e44d0e17cf6e6c218 Mon Sep 17 00:00:00 2001 From: Jan Zajic Date: Sat, 16 Nov 2019 22:51:26 +0100 Subject: [PATCH 03/10] add missing documentation to String methods #2 --- src/string.cr | 120 +++++++++++++++++++++++++++++--------------------- 1 file changed, 71 insertions(+), 49 deletions(-) diff --git a/src/string.cr b/src/string.cr index 7dd2e1b18c86..adb6dad27d45 100644 --- a/src/string.cr +++ b/src/string.cr @@ -729,7 +729,7 @@ class String # # Negative indices can be used to start counting from the end of the string. # - # Raises `IndexError` if the *index* is out of range. + # Raises `IndexError` if the *index* is out of bounds. # # ``` # "hello"[0] # => 'h' @@ -746,7 +746,7 @@ class String # as character indices. Indices can be negative to start # counting from the end of the string. # - # Raises `IndexError` if the range's start is out of range. + # Raises `IndexError` if the range's start is out of bounds. # # ``` # "hello"[0..2] # => "hel" @@ -759,7 +759,7 @@ class String self[*Indexable.range_to_index_and_count(range, size)] end - # Like `#[Range]`, but returns `nil` if the range's start is out of range. + # Like `#[Range]`, but returns `nil` if the range's start is out of bounds. # # ``` # "hello"[6..7]? # => nil @@ -771,17 +771,17 @@ class String # Returns a substring starting from the *start* character of size *count*. # - # The *start* argument can be negative to start counting + # *start* can can be negative to start counting # from the end of the string. # - # Raises `IndexError` if the *start* index is out of range. + # Raises `IndexError` if the *start* index is out of bounds. # # Raises `ArgumentError` if *count* is negative. def [](start : Int, count : Int) self[start, count]? || raise IndexError.new end - # Like `#[Int, Int]` but returns `nil` if the *start* index is out of range. + # Like `#[Int, Int]` but returns `nil` if the *start* index is out of bounds. 
def []?(start : Int, count : Int) raise ArgumentError.new "Negative count: #{count}" if count < 0 return byte_slice?(start, count) if ascii_only? @@ -855,7 +855,7 @@ class String # # Negative indices can be used to start counting from the end of the string. # - # Raises `IndexError` if the *index* is out of range. + # Raises `IndexError` if the *index* is out of bounds. # # ``` # "hello".char_at(0) # => 'h' @@ -873,8 +873,11 @@ class String # Negative indices can be used to start counting from the end of the string. # # ``` - # "hello".char_at(4) { 'x' } # => 'o' - # "hello".char_at(5) { 'x' } # => 'x' + # "hello".char_at(4) { 'x' } # => 'o' + # "hello".char_at(5) { 'x' } # => 'x' + # "hello".char_at(-1) { 'x' } # => 'o' + # "hello".char_at(-5) { 'x' } # => 'h' + # "hello".char_at(-6) { 'x' } # => 'x' # ``` def char_at(index : Int, &) if ascii_only? @@ -897,18 +900,18 @@ class String end end - # Returns a new string consisted of *count* bytes starting at *start* byte. + # Returns a new string built from *count* bytes starting at *start* byte. # - # The *start* argument can be negative to start counting + # *start* can be negative to start counting # from the end of the string. - # If `count` is bigger than number of bytes from *start* to `bytelen`, + # If *count* is bigger than the number of bytes from *start* to `#bytesize`, # only remaining bytes are returned. # # Be careful when working with multibyte characters - they can be splitted, # which may lead to invalid UTF-8 values. These, # when asked as chars, will use the unicode replacement �. # - # Raises `IndexError` if the *start* index is out of range. + # Raises `IndexError` if the *start* index is out of bounds. # # Raises `ArgumentError` if *count* is negative. # @@ -931,7 +934,17 @@ class String byte_slice?(start, count) || raise IndexError.new end - # Like `byte_slice(Int, Int)` but returns `Nil` if the *start* index is out of range.
+ # Like `byte_slice(Int, Int)` but returns `Nil` if the *start* index is out of bounds. + # + # Raises `ArgumentError` if *count* is negative. + # + # ``` + # "hello".byte_slice(0, 2) # => "he" + # "hello".byte_slice(0, 100) # => "hello" + # "hello".byte_slice(6, 2) # => nil + # "hello".byte_slice(-6, 2) # => nil + # "hello".byte_slice(0, -2) # raises ArgumentError + # ``` def byte_slice?(start : Int, count : Int) : String | Nil raise ArgumentError.new "Negative count" if count < 0 @@ -953,21 +966,15 @@ class String end end - def byte_slice(start : Int) : String - count = bytesize - start - raise IndexError.new if start > 0 && count < 0 - byte_slice start, count - end - # Returns a substring starting from the *start* byte. # - # The *start* argument can be negative to start counting + # *start* can can be negative to start counting # from the end of the string. # # Be careful when working with multibyte characters - they can be splitted # which may lead to unexpected result. # - # Raises `IndexError` if *start* index is out of range. + # Raises `IndexError` if *start* index is out of bounds. # # ``` # "hello".byte_slice(0) # => "hello" @@ -978,9 +985,17 @@ class String # "hello".byte_slice(6) # raises IndexError # "hello".byte_slice(-6) # raises IndexError # ``` - # Returns the codepoint of `Char` at the given *index*. + def byte_slice(start : Int) : String + count = bytesize - start + raise IndexError.new if start > 0 && count < 0 + byte_slice start, count + end + + # Returns the codepoint of the character at the given *index*. + # + # Negative indices can be used to start counting from the end of the string. # - # Raises `IndexError` if the *index* is out of range. + # Raises `IndexError` if the *index* is out of bounds. # # See also: `Char#ord`. # @@ -995,7 +1010,7 @@ class String # Returns the byte at the given *index*. # - # Raises `IndexError` if the *index* is out of range. + # Raises `IndexError` if the *index* is out of bounds. 
# # ``` # "¥hello".byte_at(0) # => 194 @@ -1009,7 +1024,7 @@ class String byte_at(index) { raise IndexError.new } end - # Returns the byte at the given *index*, or nil if out of bounds. + # Returns the byte at the given *index*, or `nil` if out of bounds. # # ``` # "¥hello".byte_at(0) # => 194 @@ -1023,11 +1038,11 @@ class String byte_at(index) { nil } end - # Returns the byte at the given *index*, or yield if out of bounds. + # Returns the byte at the given *index*, or yields if out of bounds. # # ``` - # "¥hello".byte_at(6) { 0 } # => 111 - # "¥hello".byte_at(7) { 0 } # => 0 + # "¥hello".byte_at(6) { "OUT OF BOUNDS" } # => 111 + # "¥hello".byte_at(7) { "OUT OF BOUNDS" } # => "OUT OF BOUNDS" # ``` def byte_at(index, &) index += bytesize if index < 0 @@ -2532,8 +2547,8 @@ class String self if !blank? end - # Returns `true` if this string is the same as other. - # Comparison is done byte-per-byte: if a byte is less then the other corresponding + # Returns `true` if this string is equal to `*other*. + # Comparison is done byte-per-byte: if a byte is different from the corresponding # byte, `false` is returned and so on. def ==(other : self) : Bool return true if same?(other) @@ -3061,7 +3076,7 @@ class String # "Hello, World".byte_index(0x6f, 5) # => 8 # "💣".byte_index(0xA3) # => 3 # ``` - def byte_index(byte : Int, offset = 0) + def byte_index(byte : Int, offset = 0) : Int32? offset.upto(bytesize - 1) do |i| if to_unsafe[i] == byte return i @@ -3073,10 +3088,17 @@ class String # Returns the byte index of *search* in the string, or `nil` if the string is not present. # If *offset* is present, it defines the position to start the search. # + # # Negative *offset* can be used to start the search from the end of the string. 
+ # # ``` - # "¥hello".byte_index("hello") # => 2 + # "¥hello".byte_index("hello") # => 2 + # "hello".byte_index("world") # => nil + # "Dizzy Miss Lizzy".byte_index("izzy") # => 1 + # "Dizzy Miss Lizzy".byte_index("izzy", 2) # => 12 + # "Dizzy Miss Lizzy".byte_index("izzy", -4) # => 12 + # "Dizzy Miss Lizzy".byte_index("izzy", -4) # => nil # ``` - def byte_index(search : String, offset = 0) + def byte_index(search : String, offset = 0) : Int32? offset += bytesize if offset < 0 return if offset < 0 @@ -4351,7 +4373,7 @@ class String io << '}' if char.ord > 0xFFFF end - # Returns true if this string starts with the given *str*, otherwise `false`. + # Returns `true` if this string starts with the given *str*. # # ``` # "hello".starts_with?("h") # => true @@ -4363,7 +4385,7 @@ class String to_unsafe.memcmp(str.to_unsafe, str.bytesize) == 0 end - # Returns `true` if this string starts with the given *char*, otherwise `false`. + # Returns `true` if this string starts with the given *char*. # # ``` # "hello".starts_with?('h') # => true @@ -4377,7 +4399,7 @@ class String false end - # Returns true if this string starts with the given *re* regular expression, otherwise `false`. + # Returns `true` if the regular expression *re* matches at the start of this string. # # ``` # "22hello".starts_with?(/[0-9]/) # => true @@ -4391,7 +4413,7 @@ class String !!($~ = re.match_at_byte_index(self, 0, Regex::Options::ANCHORED)) end - # Returns true if this string ends with the given *str*, otherwise `false`. + # Returns `true` if this string ends with the given *str*. # # ``` # "hello".ends_with?("o") # => true @@ -4403,7 +4425,7 @@ class String (to_unsafe + bytesize - str.bytesize).memcmp(str.to_unsafe, str.bytesize) == 0 end - # Returns true if this string ends with the given *char*, otherwise `false`. + # Returns `true` if this string ends with the given *char*. 
# # ``` # "hello".ends_with?('o') # => true @@ -4426,7 +4448,7 @@ class String true end - # Returns true if this string ends with the given *re* regular expression, otherwise `false`. + # Returns `true` if the regular expression *re* matches at the end of this string. # # ``` # "22hello".ends_with?(/[0-9]/) # => false @@ -4575,27 +4597,27 @@ class String char_index end - # Returns `self` + # Returns `self`. def clone : String self end - # Returns `self` + # ditto def dup : String self end - # Returns `self` + # ditto def to_s : String self end - # Appends `self` characters to the given IO object. + # Appends `self` to *io*. def to_s(io : IO) : Nil io.write_utf8(to_slice) end - # Returns the underlying bytes of this String in an **unsafe** way. + # Returns the underlying bytes of this String. # # The returned slice is read-only. def to_slice : Bytes @@ -4607,17 +4629,17 @@ class String pointerof(@c) end - # Returns *count* of underlying bytes of this String starting at given *byte_offset* in an **unsafe** way. + # Returns *count* of underlying bytes of this String starting at given *byte_offset*. # # The returned slice is read-only. - def unsafe_byte_slice(byte_offset, count) + def unsafe_byte_slice(byte_offset, count) : Slice Slice.new(to_unsafe + byte_offset, count, read_only: true) end - # Returns the underlying bytes of this String starting at given *byte_offset* in an **unsafe** way. + # Returns the underlying bytes of this String starting at given *byte_offset*. # # The returned slice is read-only. 
- def unsafe_byte_slice(byte_offset) + def unsafe_byte_slice(byte_offset) : Slice Slice.new(to_unsafe + byte_offset, bytesize - byte_offset, read_only: true) end From d18213f1a09113ad5d305b32d954384095bba734 Mon Sep 17 00:00:00 2001 From: Jan Zajic Date: Sun, 17 Nov 2019 20:56:44 +0100 Subject: [PATCH 04/10] byte_index doc typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Johannes Müller --- src/string.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/string.cr b/src/string.cr index adb6dad27d45..5b7b56932714 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3088,7 +3088,7 @@ class String # Returns the byte index of *search* in the string, or `nil` if the string is not present. # If *offset* is present, it defines the position to start the search. # - # # Negative *offset* can be used to start the search from the end of the string. + # Negative *offset* can be used to start the search from the end of the string. 
# # ``` # "¥hello".byte_index("hello") # => 2 From dd46819294d78badd9082a65603778e3211d1d3e Mon Sep 17 00:00:00 2001 From: Jan Zajic Date: Sun, 17 Nov 2019 21:16:00 +0100 Subject: [PATCH 05/10] change ditto to :ditto: in string doc Signed-off-by: Jan Zajic --- src/string.cr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/string.cr b/src/string.cr index 5b7b56932714..3a4cf399af01 100644 --- a/src/string.cr +++ b/src/string.cr @@ -4602,12 +4602,12 @@ class String self end - # ditto + # :ditto: def dup : String self end - # ditto + # :ditto: def to_s : String self end From 456ba9ebb0642ad49b620f574d158f782634e3e9 Mon Sep 17 00:00:00 2001 From: Jan Zajic Date: Thu, 21 Nov 2019 16:23:08 +0100 Subject: [PATCH 06/10] validate offset in String#byte_index(Int,offset) and make negative offset possible --- spec/std/string_spec.cr | 8 ++++++++ src/string.cr | 17 +++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr index ececca3c7766..0ade01d2fee6 100644 --- a/spec/std/string_spec.cr +++ b/spec/std/string_spec.cr @@ -932,6 +932,14 @@ describe "String" do it { "foo".byte_index('o'.ord).should eq(1) } it { "foo bar booz".byte_index('o'.ord, 3).should eq(9) } it { "foo".byte_index('a'.ord).should be_nil } + it { "foo".byte_index('a'.ord).should be_nil } + it { "foo".byte_index('o'.ord, 3).should be_nil } + it { + "Dizzy Miss Lizzy".byte_index('z'.ord).should eq(2) + "Dizzy Miss Lizzy".byte_index('z'.ord, 3).should eq(3) + "Dizzy Miss Lizzy".byte_index('z'.ord, -4).should eq(13) + "Dizzy Miss Lizzy".byte_index('z'.ord, -17).should be_nil + } it "gets byte index of string" do "hello world".byte_index("he").should eq(0) diff --git a/src/string.cr b/src/string.cr index 3a4cf399af01..01acb2f44858 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3070,13 +3070,22 @@ class String # Returns the index of *byte* in the string, or `nil` if the byte is not present. 
# If *offset* is present, it defines the position to start the search. # + # Negative *offset* can be used to start the search from the end of the string. + # # ``` - # "Hello, World".byte_index(0x6f) # => 4 - # "Hello, World".byte_index(0x5a) # => nil - # "Hello, World".byte_index(0x6f, 5) # => 8 - # "💣".byte_index(0xA3) # => 3 + # "Hello, World".byte_index(0x6f) # => 4 + # "Hello, World".byte_index(0x5a) # => nil + # "Hello, World".byte_index(0x6f, 5) # => 8 + # "💣".byte_index(0xA3) # => 3 + # "Dizzy Miss Lizzy".byte_index('z'.ord) # => 2 + # "Dizzy Miss Lizzy".byte_index('z'.ord, 3) # => 3 + # "Dizzy Miss Lizzy".byte_index('z'.ord, -4) # => 13 + # "Dizzy Miss Lizzy".byte_index('z'.ord, -17) # => nil # ``` def byte_index(byte : Int, offset = 0) : Int32? + offset += bytesize if offset < 0 + return if offset < 0 + offset.upto(bytesize - 1) do |i| if to_unsafe[i] == byte return i From 01788598ec8a7e1ac68da47760193d16fc910574 Mon Sep 17 00:00:00 2001 From: Jan Zajic Date: Mon, 25 Nov 2019 22:20:41 +0100 Subject: [PATCH 07/10] Document String byte_slice risks --- src/string.cr | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/string.cr b/src/string.cr index 01acb2f44858..974e58d9ae04 100644 --- a/src/string.cr +++ b/src/string.cr @@ -907,9 +907,10 @@ class String # If *count* is bigger than the number of bytes from *start* to `#bytesize`, # only remaining bytes are returned. # - # Be careful when working with multibyte characters - they can be splitted, - # which may lead to invalid UTF-8 values. These, - # when asked as chars, will use the unicode replacement �. + # This method should be avoided, + # unless the string is proven to be ASCII-only (for example `#ascii_only?`), + # or the byte positions are known to be at character boundaries. + # Otherwise the characters are splitted, which leads to invalid UTF-8 values. # # Raises `IndexError` if the *start* index is out of bounds. 
# From 43a330b6dbba112d6255ccab6644e70f5c395248 Mon Sep 17 00:00:00 2001 From: Jan Zajic Date: Fri, 3 Jan 2020 14:30:26 +0100 Subject: [PATCH 08/10] #8447 requested changes --- spec/std/string_spec.cr | 5 +---- src/string.cr | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr index 0ade01d2fee6..a0ff535f3924 100644 --- a/spec/std/string_spec.cr +++ b/spec/std/string_spec.cr @@ -230,13 +230,10 @@ describe "String" do "hello".byte_slice(-2, 3).should eq("lo") end - it "gets byte_slice(Int) with with start out of bounds" do + it "gets byte_slice(Int) with start out of bounds" do expect_raises(IndexError) do "hello".byte_slice(10) end - end - - it "gets byte_slice(Int) with with start out of bounds" do expect_raises(IndexError) do "hello".byte_slice(-10) end diff --git a/src/string.cr b/src/string.cr index 974e58d9ae04..6e1e227522f4 100644 --- a/src/string.cr +++ b/src/string.cr @@ -868,7 +868,7 @@ class String char_at(index) { raise IndexError.new } end - # Returns the `Char` at the given *index*, or yields if out of bounds. + # Returns the `Char` at the given *index*, or result of running the given block if out of bounds. # # Negative indices can be used to start counting from the end of the string. # @@ -910,7 +910,7 @@ class String # This method should be avoided, # unless the string is proven to be ASCII-only (for example `#ascii_only?`), # or the byte positions are known to be at character boundaries. - # Otherwise the characters are splitted, which leads to invalid UTF-8 values. + # Otherwise, multi-byte characters may be split, leading to an invalid UTF-8 encoding. # # Raises `IndexError` if the *start* index is out of bounds. 
# @@ -924,9 +924,9 @@ class String # "hello".byte_slice(-2, 5) # => "he" # "¥hello".byte_slice(0, 2) # => "¥" # "¥hello".byte_slice(2, 2) # => "he" - # "¥hello".byte_slice(0, 1) # => "�" - # "¥hello".byte_slice(1, 1) # => "�" - # "¥hello".byte_slice(1, 2) # => "�h" + # "¥hello".byte_slice(0, 1) # => "�" (invalid UTF-8 character) + # "¥hello".byte_slice(1, 1) # => "�" (invalid UTF-8 character) + # "¥hello".byte_slice(1, 2) # => "�h" (invalid UTF-8 character) # "hello".byte_slice(6, 2) # raises IndexError # "hello".byte_slice(-6, 2) # raises IndexError # "hello".byte_slice(0, -2) # raises ArgumentError @@ -972,8 +972,10 @@ class String # *start* can can be negative to start counting # from the end of the string. # - # Be careful when working with multibyte characters - they can be splitted - # which may lead to unexpected result. + # This method should be avoided, + # unless the string is proven to be ASCII-only (for example `#ascii_only?`), + # or the byte positions are known to be at character boundaries. + # Otherwise, multi-byte characters may be split, leading to an invalid UTF-8 encoding. # # Raises `IndexError` if *start* index is out of bounds. # @@ -982,7 +984,7 @@ class String # "hello".byte_slice(2) # => "llo" # "hello".byte_slice(-2) # => "lo" # "¥hello".byte_slice(2) # => "hello" - # "¥hello".byte_slice(1) # => "�hello" + # "¥hello".byte_slice(1) # => "�hello" (invalid UTF-8 character) # "hello".byte_slice(6) # raises IndexError # "hello".byte_slice(-6) # raises IndexError # ``` @@ -2551,6 +2553,8 @@ class String # Returns `true` if this string is equal to `*other*. # Comparison is done byte-per-byte: if a byte is different from the corresponding # byte, `false` is returned and so on. + # + # See `#compare` for more comparison options. 
def ==(other : self) : Bool return true if same?(other) return false unless bytesize == other.bytesize From 2620493a24a7101312b17c9f96563527915bff40 Mon Sep 17 00:00:00 2001 From: Jan Zajic Date: Mon, 6 Jan 2020 17:05:39 +0100 Subject: [PATCH 09/10] typo in byte_slice?, byte_at? doc --- src/string.cr | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/string.cr b/src/string.cr index 6e1e227522f4..5300887d9a94 100644 --- a/src/string.cr +++ b/src/string.cr @@ -940,11 +940,11 @@ class String # Raises `ArgumentError` if *count* is negative. # # ``` - # "hello".byte_slice(0, 2) # => "he" - # "hello".byte_slice(0, 100) # => "hello" - # "hello".byte_slice(6, 2) # => nil - # "hello".byte_slice(-6, 2) # => nil - # "hello".byte_slice(0, -2) # raises ArgumentError + # "hello".byte_slice?(0, 2) # => "he" + # "hello".byte_slice?(0, 100) # => "hello" + # "hello".byte_slice?(6, 2) # => nil + # "hello".byte_slice?(-6, 2) # => nil + # "hello".byte_slice?(0, -2) # raises ArgumentError # ``` def byte_slice?(start : Int, count : Int) : String | Nil raise ArgumentError.new "Negative count" if count < 0 @@ -1030,12 +1030,12 @@ class String # Returns the byte at the given *index*, or `nil` if out of bounds. 
# # ``` - # "¥hello".byte_at(0) # => 194 - # "¥hello".byte_at(1) # => 165 - # "¥hello".byte_at(2) # => 104 - # "¥hello".byte_at(-1) # => 111 - # "¥hello".byte_at(6) # => 111 - # "¥hello".byte_at(7) # => nil + # "¥hello".byte_at?(0) # => 194 + # "¥hello".byte_at?(1) # => 165 + # "¥hello".byte_at?(2) # => 104 + # "¥hello".byte_at?(-1) # => 111 + # "¥hello".byte_at?(6) # => 111 + # "¥hello".byte_at?(7) # => nil # ``` def byte_at?(index) : UInt8 | Nil byte_at(index) { nil } From 1d8df36adcd57ec74af0cca0e303bfd55e046fc4 Mon Sep 17 00:00:00 2001 From: Jan Zajic Date: Thu, 9 Apr 2020 10:08:03 +0200 Subject: [PATCH 10/10] requested change of index/byte_index doc --- src/string.cr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/string.cr b/src/string.cr index 5300887d9a94..b810daa6e298 100644 --- a/src/string.cr +++ b/src/string.cr @@ -2761,7 +2761,7 @@ class String {% end %} end - # Returns the index of *search* in the string, or `nil` if the string is not present. + # Returns the index of the _first_ occurrence of *search* in the string, or `nil` if not present. # If *offset* is present, it defines the position to start the search. # # ``` @@ -3072,7 +3072,7 @@ class String {pre, mid, post} end - # Returns the index of *byte* in the string, or `nil` if the byte is not present. + # Returns the index of the _first_ occurrence of *byte* in the string, or `nil` if not present. # If *offset* is present, it defines the position to start the search. # # Negative *offset* can be used to start the search from the end of the string.