diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr index a8d047b77977..b53c7479a31e 100644 --- a/spec/std/string_spec.cr +++ b/spec/std/string_spec.cr @@ -203,9 +203,10 @@ describe "String" do end end - describe "byte_slice" do + describe "#byte_slice" do it "gets byte_slice" do "hello".byte_slice(1, 3).should eq("ell") + "hello".byte_slice(1..3).should eq("ell") end it "gets byte_slice with negative count" do @@ -224,14 +225,19 @@ describe "String" do expect_raises(IndexError) do "hello".byte_slice(10, 3) end + expect_raises(IndexError) do + "hello".byte_slice(10..13) + end end it "gets byte_slice with large count" do "hello".byte_slice(1, 10).should eq("ello") + "hello".byte_slice(1..10).should eq("ello") end it "gets byte_slice with negative index" do "hello".byte_slice(-2, 3).should eq("lo") + "hello".byte_slice(-2..-1).should eq("lo") end it "gets byte_slice(Int) with start out of bounds" do @@ -244,6 +250,45 @@ describe "String" do end end + describe "#byte_slice?" do + it "gets byte_slice" do + "hello".byte_slice?(1, 3).should eq("ell") + "hello".byte_slice?(1..3).should eq("ell") + end + + it "gets byte_slice with negative count" do + expect_raises(ArgumentError) do + "hello".byte_slice?(1, -10) + end + end + + it "gets byte_slice with negative count at last" do + expect_raises(ArgumentError) do + "hello".byte_slice?(5, -1) + end + end + + it "gets byte_slice with start out of bounds" do + "hello".byte_slice?(10, 3).should be_nil + "hello".byte_slice?(10..13).should be_nil + end + + it "gets byte_slice with large count" do + "hello".byte_slice?(1, 10).should eq("ello") + "hello".byte_slice?(1..11).should eq("ello") + end + + it "gets byte_slice with negative index" do + "hello".byte_slice?(-2, 3).should eq("lo") + "hello".byte_slice?(-2..-1).should eq("lo") + end + + it "gets byte_slice(Int) with start out of bounds" do + "hello".byte_slice?(10).should be_nil + "hello".byte_slice?(-10).should be_nil + end + end + describe "to_i" do it { 
"1234".to_i.should eq(1234) } it { "-128".to_i8.should eq(-128) } diff --git a/src/string.cr b/src/string.cr index dd3918252002..0ff7c3ec3c2c 100644 --- a/src/string.cr +++ b/src/string.cr @@ -1173,9 +1173,9 @@ class String # "hello".byte_slice(-2, 5) # => "he" # "¥hello".byte_slice(0, 2) # => "¥" # "¥hello".byte_slice(2, 2) # => "he" - # "¥hello".byte_slice(0, 1) # => "�" (invalid UTF-8 character) - # "¥hello".byte_slice(1, 1) # => "�" (invalid UTF-8 character) - # "¥hello".byte_slice(1, 2) # => "�h" (invalid UTF-8 character) + # "¥hello".byte_slice(0, 1) # => "\xC2" (invalid UTF-8 character) + # "¥hello".byte_slice(1, 1) # => "\xA5" (invalid UTF-8 character) + # "¥hello".byte_slice(1, 2) # => "\xA5h" (invalid UTF-8 character) # "hello".byte_slice(6, 2) # raises IndexError # "hello".byte_slice(-6, 2) # raises IndexError # "hello".byte_slice(0, -2) # raises ArgumentError @@ -1184,6 +1184,35 @@ class String byte_slice?(start, count) || raise IndexError.new end + # Returns a new string built from the bytes in *range*. + # + # Byte indices can be negative to start counting from the end of the string. + # If the end index is bigger than `#bytesize`, only remaining bytes are returned. + # + # This method should be avoided, + # unless the string is proven to be ASCII-only (for example `#ascii_only?`), + # or the byte positions are known to be at character boundaries. + # Otherwise, multi-byte characters may be split, leading to an invalid UTF-8 encoding. + # + # Raises `IndexError` if the *range* begin is out of bounds. 
+ # + # ``` + # "hello".byte_slice(0..2) # => "hel" + # "hello".byte_slice(0..100) # => "hello" + # "hello".byte_slice(-2..3) # => "l" + # "hello".byte_slice(-2..5) # => "lo" + # "¥hello".byte_slice(0...2) # => "¥" + # "¥hello".byte_slice(2...4) # => "he" + # "¥hello".byte_slice(0..0) # => "\xC2" (invalid UTF-8 character) + # "¥hello".byte_slice(1..1) # => "\xA5" (invalid UTF-8 character) + # "¥hello".byte_slice(1..2) # => "\xA5h" (invalid UTF-8 character) + # "hello".byte_slice(6..2) # raises IndexError + # "hello".byte_slice(-6..2) # raises IndexError + # ``` + def byte_slice(range : Range) : String + byte_slice(*Indexable.range_to_index_and_count(range, bytesize) || raise IndexError.new) + end + # Like `byte_slice(Int, Int)` but returns `Nil` if the *start* index is out of bounds. # # Raises `ArgumentError` if *count* is negative. @@ -1209,6 +1238,18 @@ class String end end + # Like `byte_slice(Range)` but returns `Nil` if the *range* begin is out of bounds. + # + # ``` + # "hello".byte_slice?(0..2) # => "hel" + # "hello".byte_slice?(0..100) # => "hello" + # "hello".byte_slice?(6..8) # => nil + # "hello".byte_slice?(-6..2) # => nil + # ``` + def byte_slice?(range : Range) : String? + byte_slice?(*Indexable.range_to_index_and_count(range, bytesize) || return nil) + end + # Returns a substring starting from the *start* byte. # # *start* can be negative to start counting @@ -1226,7 +1267,7 @@ class String # "hello".byte_slice(2) # => "llo" # "hello".byte_slice(-2) # => "lo" # "¥hello".byte_slice(2) # => "hello" - # "¥hello".byte_slice(1) # => "�hello" (invalid UTF-8 character) + # "¥hello".byte_slice(1) # => "\xA5hello" (invalid UTF-8 character) # "hello".byte_slice(6) # raises IndexError # "hello".byte_slice(-6) # raises IndexError # ``` @@ -1236,6 +1277,33 @@ class String byte_slice start, count end + # Returns a substring starting from the *start* byte. + # + # *start* can be negative to start counting + # from the end of the string. 
+ # + # This method should be avoided, + # unless the string is proven to be ASCII-only (for example `#ascii_only?`), + # or the byte positions are known to be at character boundaries. + # Otherwise, multi-byte characters may be split, leading to an invalid UTF-8 encoding. + # + # Returns `nil` if the *start* index is out of bounds. + # + # ``` + # "hello".byte_slice?(0) # => "hello" + # "hello".byte_slice?(2) # => "llo" + # "hello".byte_slice?(-2) # => "lo" + # "¥hello".byte_slice?(2) # => "hello" + # "¥hello".byte_slice?(1) # => "\xA5hello" (invalid UTF-8 character) + # "hello".byte_slice?(6) # => nil + # "hello".byte_slice?(-6) # => nil + # ``` + def byte_slice?(start : Int) : String? + count = bytesize - start + return nil if start > 0 && count < 0 + byte_slice? start, count + end + # Returns the codepoint of the character at the given *index*. # # Negative indices can be used to start counting from the end of the string.