Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion spec/std/string_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -203,9 +203,10 @@ describe "String" do
end
end

describe "byte_slice" do
describe "#byte_slice" do
it "gets byte_slice" do
"hello".byte_slice(1, 3).should eq("ell")
"hello".byte_slice(1..3).should eq("ell")
end

it "gets byte_slice with negative count" do
Expand All @@ -224,14 +225,19 @@ describe "String" do
expect_raises(IndexError) do
"hello".byte_slice(10, 3)
end
expect_raises(IndexError) do
"hello".byte_slice(10..13)
end
end

it "gets byte_slice with large count" do
"hello".byte_slice(1, 10).should eq("ello")
"hello".byte_slice(1..10).should eq("ello")
end

it "gets byte_slice with negative index" do
"hello".byte_slice(-2, 3).should eq("lo")
"hello".byte_slice(-2..-1).should eq("lo")
end

it "gets byte_slice(Int) with start out of bounds" do
Expand All @@ -244,6 +250,45 @@ describe "String" do
end
end

describe "#byte_slice?" do
it "gets byte_slice" do
"hello".byte_slice?(1, 3).should eq("ell")
"hello".byte_slice?(1..3).should eq("ell")
end

it "gets byte_slice with negative count" do
expect_raises(ArgumentError) do
"hello".byte_slice?(1, -10)
end
end

it "gets byte_slice with negative count at last" do
expect_raises(ArgumentError) do
"hello".byte_slice?(5, -1)
end
end

it "gets byte_slice with start out of bounds" do
"hello".byte_slice?(10, 3).should be_nil
"hello".byte_slice?(10..13).should be_nil
end

it "gets byte_slice with large count" do
"hello".byte_slice?(1, 10).should eq("ello")
"hello".byte_slice?(1..11).should eq("ello")
end

it "gets byte_slice with negative index" do
"hello".byte_slice?(-2, 3).should eq("lo")
"hello".byte_slice?(-2..-1).should eq("lo")
end

it "gets byte_slice(Int) with start out of bounds" do
"hello".byte_slice?(10).should be_nil
"hello".byte_slice?(-10).should be_nil
end
end

describe "to_i" do
it { "1234".to_i.should eq(1234) }
it { "-128".to_i8.should eq(-128) }
Expand Down
76 changes: 72 additions & 4 deletions src/string.cr
Original file line number Diff line number Diff line change
Expand Up @@ -1173,9 +1173,9 @@ class String
# "hello".byte_slice(-2, 5) # => "he"
Copy link
Contributor

@HertzDevil HertzDevil Dec 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This and the two calls above actually return "lo". I don't know why these aren't caught by the routine "fix examples in the docs" PRs

(those lines are due to #8447 so they are fairly old)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@maiha Any idea?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The basic premise is that my checkers cannot handle all EXAMPLES.

EXAMPLES that cannot be handled are managed as an "exclusion list" and excluded from testing.

In this case, the entire code block is excluded because the incorrect comment syntax would interfere with the automatic generation of tests.

"¥hello".byte_slice(0, 1)  # => "\C2" (invalid UTF-8 character)
                                      ^^^^^^^^^^^^^^^^^^^^^^^^^

The following code is generated and fails.

$ cd tmp/string/working && LC_ALL=C /v/crystal/bin/crystal run -D preview_overflow ../all_spec.cr 2>&1
Using compiled compiler at /v/crystal/.build/crystal
In /v/tmp/string/all_spec.cr:509:46

 509 | �[0m�[1m( "¥hello".byte_slice(0, 1) ).should eq( "�" (invalid UTF-8 character) )
                                                    ^
Error: expecting token ')', not '('

I understand. So how many examples of such exclusions do you have?

1979 test
1341 success 43 pending 3 general 147 pseudo 71 macro 9 wrong 44 random 321 failure

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps we could fix a format for such comments on the output value.

My initial idea for this would be to just append another comment:

"¥hello".byte_slice(0, 1)  # => "\C2" # invalid UTF-8 character

This should be fairly readable, unambiguous and easy to parse.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's good. I think it is the best choice so far. 💯

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@straight-shoota
I would like to inspect the excluded EXAMPLES like this one once again.

There are probably about 300 cases, and it takes about a week.
I smell a 1.7 release, will it be ready in time?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be time. 1.7.0 is scheduled for January 6.

# "¥hello".byte_slice(0, 2) # => "¥"
# "¥hello".byte_slice(2, 2) # => "he"
# "¥hello".byte_slice(0, 1) # => "" (invalid UTF-8 character)
# "¥hello".byte_slice(1, 1) # => "" (invalid UTF-8 character)
# "¥hello".byte_slice(1, 2) # => "�h" (invalid UTF-8 character)
# "¥hello".byte_slice(0, 1) # => "\xC2" (invalid UTF-8 character)
# "¥hello".byte_slice(1, 1) # => "\xA5" (invalid UTF-8 character)
# "¥hello".byte_slice(1, 2) # => "\xA5h" (invalid UTF-8 character)
# "hello".byte_slice(6, 2) # raises IndexError
# "hello".byte_slice(-6, 2) # raises IndexError
# "hello".byte_slice(0, -2) # raises ArgumentError
Expand All @@ -1184,6 +1184,35 @@ class String
byte_slice?(start, count) || raise IndexError.new
end

# Returns a new string built from byte in *range*.
#
# Byte indices can be negative to start counting from the end of the string.
# If the end index is bigger than `#bytesize`, only remaining bytes are returned.
#
# This method should be avoided,
# unless the string is proven to be ASCII-only (for example `#ascii_only?`),
# or the byte positions are known to be at character boundaries.
# Otherwise, multi-byte characters may be split, leading to an invalid UTF-8 encoding.
#
# Raises `IndexError` if the *range* begin is out of bounds.
#
# ```
# "hello".byte_slice(0..2) # => "hel"
# "hello".byte_slice(0..100) # => "hello"
# "hello".byte_slice(-2..3) # => "l"
# "hello".byte_slice(-2..5) # => "lo"
# "¥hello".byte_slice(0...2) # => "¥"
# "¥hello".byte_slice(2...4) # => "he"
# "¥hello".byte_slice(0..0) # => "\xC2" (invalid UTF-8 character)
# "¥hello".byte_slice(1..1) # => "\xA5" (invalid UTF-8 character)
# "¥hello".byte_slice(1..2) # => "\xA5h" (invalid UTF-8 character)
# "hello".byte_slice(6..2) # raises IndexError
# "hello".byte_slice(-6..2) # raises IndexError
# ```
def byte_slice(range : Range) : String
byte_slice(*Indexable.range_to_index_and_count(range, bytesize) || raise IndexError.new)
end

# Like `byte_slice(Int, Int)` but returns `Nil` if the *start* index is out of bounds.
#
# Raises `ArgumentError` if *count* is negative.
Expand All @@ -1209,6 +1238,18 @@ class String
end
end

# Like `byte_slice(Range)` but returns `Nil` if *range* begin is out of bounds.
#
# ```
# "hello".byte_slice?(0..2) # => "hel"
# "hello".byte_slice?(0..100) # => "hello"
# "hello".byte_slice?(6..8) # => nil
# "hello".byte_slice?(-6..2) # => nil
# ```
def byte_slice?(range : Range) : String?
byte_slice?(*Indexable.range_to_index_and_count(range, bytesize) || return nil)
end

# Returns a substring starting from the *start* byte.
#
# *start* can be negative to start counting
Expand All @@ -1226,7 +1267,7 @@ class String
# "hello".byte_slice(2) # => "llo"
# "hello".byte_slice(-2) # => "lo"
# "¥hello".byte_slice(2) # => "hello"
# "¥hello".byte_slice(1) # => "�hello" (invalid UTF-8 character)
# "¥hello".byte_slice(1) # => "\xA5hello" (invalid UTF-8 character)
# "hello".byte_slice(6) # raises IndexError
# "hello".byte_slice(-6) # raises IndexError
# ```
Expand All @@ -1236,6 +1277,33 @@ class String
byte_slice start, count
end

# Returns a substring starting from the *start* byte.
#
# *start* can be negative to start counting
# from the end of the string.
#
# This method should be avoided,
# unless the string is proven to be ASCII-only (for example `#ascii_only?`),
# or the byte positions are known to be at character boundaries.
# Otherwise, multi-byte characters may be split, leading to an invalid UTF-8 encoding.
#
# Returns `nil` if *start* index is out of bounds.
#
# ```
# "hello".byte_slice?(0) # => "hello"
# "hello".byte_slice?(2) # => "llo"
# "hello".byte_slice?(-2) # => "lo"
# "¥hello".byte_slice?(2) # => "hello"
# "¥hello".byte_slice?(1) # => "\xA5hello" (invalid UTF-8 character)
# "hello".byte_slice?(6) # => nil
# "hello".byte_slice?(-6) # => nil
# ```
def byte_slice?(start : Int) : String?
count = bytesize - start
return nil if start > 0 && count < 0
byte_slice? start, count
end

# Returns the codepoint of the character at the given *index*.
#
# Negative indices can be used to start counting from the end of the string.
Expand Down