Skip to content

Refactor integer parsing, add string to 128bit integer methods #11092

Closed
BlobCodes wants to merge 1 commit into crystal-lang:master from
BlobCodes:int-parsing-refactor
Closed

Refactor integer parsing, add string to 128bit integer methods #11092
BlobCodes wants to merge 1 commit into crystal-lang:master from
BlobCodes:int-parsing-refactor

Conversation

@BlobCodes
Copy link
Contributor

This PR adds the methods required to convert a string into Int128s and UInt128s.
Because of the internal changes to the integer parsing done in this PR, non-UInt64 numbers actually see a performance benefit when parsing as well.

Benchmark Code

# Reopens `String` to add `new_`-prefixed variants of the integer parsing
# methods (`new_to_i8` through `new_to_u128`). Every public method ends up in
# the `new_gen_to_` macro, which calls the private `to_unsigned_info` parser
# and then range-checks and converts the parsed magnitude.
class String
  # Returns the result of interpreting leading characters in this string as an
  # integer base *base* (between 2 and 36, or 62).
  #
  # If there is not a valid number at the start of this string,
  # or if the resulting integer doesn't fit an `Int32`, an `ArgumentError` is raised.
  #
  # Options:
  # * **whitespace**: if `true`, leading and trailing whitespaces are allowed
  # * **underscore**: if `true`, underscores in numbers are allowed
  # * **prefix**: if `true`, the prefixes `"0x"`, `"0o"` and `"0b"` override the base
  # * **strict**: if `true`, extraneous characters past the end of the number are disallowed
  # * **leading_zero_is_octal**: if `true`, then a number prefixed with `"0"` will be treated as an octal
  #
  # ```
  # "12345".to_i             # => 12345
  # "0a".to_i                # raises ArgumentError
  # "hello".to_i             # raises ArgumentError
  # "0a".to_i(16)            # => 10
  # "1100101".to_i(2)        # => 101
  # "1100101".to_i(8)        # => 294977
  # "1100101".to_i(10)       # => 1100101
  # "1100101".to_i(base: 16) # => 17826049
  #
  # "12_345".to_i                   # raises ArgumentError
  # "12_345".to_i(underscore: true) # => 12345
  #
  # "  12345  ".to_i                    # => 12345
  # "  12345  ".to_i(whitespace: false) # raises ArgumentError
  #
  # "0x123abc".to_i               # raises ArgumentError
  # "0x123abc".to_i(prefix: true) # => 1194684
  #
  # "99 red balloons".to_i                # raises ArgumentError
  # "99 red balloons".to_i(strict: false) # => 99
  #
  # "0755".to_i                              # => 755
  # "0755".to_i(leading_zero_is_octal: true) # => 493
  # ```
  def new_to_i(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false)
    new_to_i32(base, whitespace, underscore, prefix, strict, leading_zero_is_octal)
  end

  # Same as `#to_i`, but returns `nil` if there is not a valid number at the start
  # of this string, or if the resulting integer doesn't fit an `Int32`.
  #
  # ```
  # "12345".to_i?             # => 12345
  # "99 red balloons".to_i?   # => nil
  # "0a".to_i?(strict: false) # => 0
  # "hello".to_i?             # => nil
  # ```
  def new_to_i?(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false)
    new_to_i32?(base, whitespace, underscore, prefix, strict, leading_zero_is_octal)
  end

  # Same as `#to_i`, but returns the block's value if there is not a valid number at the start
  # of this string, or if the resulting integer doesn't fit an `Int32`.
  #
  # ```
  # "12345".to_i { 0 } # => 12345
  # "hello".to_i { 0 } # => 0
  # ```
  def new_to_i(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false, &block)
    new_to_i32(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { yield }
  end

  # Same as `#to_i` but returns an `Int8`.
  #
  # Raises `ArgumentError` ("Invalid Int8: ...") when parsing fails.
  def new_to_i8(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : Int8
    new_to_i8(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { raise ArgumentError.new("Invalid Int8: #{self}") }
  end

  # Same as `#to_i` but returns an `Int8` or `nil`.
  def new_to_i8?(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : Int8?
    new_to_i8(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { nil }
  end

  # Same as `#to_i` but returns an `Int8` or the block's value.
  #
  # The magnitude is parsed as a `UInt8`; 127 and 128 are the largest
  # magnitudes accepted for non-negative and negative input respectively.
  def new_to_i8(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false, &block)
    new_gen_to_ Int8, UInt8, 127, 128
  end

  # Same as `#to_i` but returns an `UInt8`.
  #
  # Raises `ArgumentError` ("Invalid UInt8: ...") when parsing fails.
  def new_to_u8(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : UInt8
    new_to_u8(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { raise ArgumentError.new("Invalid UInt8: #{self}") }
  end

  # Same as `#to_i` but returns an `UInt8` or `nil`.
  def new_to_u8?(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : UInt8?
    new_to_u8(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { nil }
  end

  # Same as `#to_i` but returns an `UInt8` or the block's value.
  def new_to_u8(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false, &block)
    new_gen_to_ UInt8, UInt8
  end

  # Same as `#to_i` but returns an `Int16`.
  #
  # Raises `ArgumentError` ("Invalid Int16: ...") when parsing fails.
  def new_to_i16(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : Int16
    new_to_i16(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { raise ArgumentError.new("Invalid Int16: #{self}") }
  end

  # Same as `#to_i` but returns an `Int16` or `nil`.
  def new_to_i16?(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : Int16?
    new_to_i16(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { nil }
  end

  # Same as `#to_i` but returns an `Int16` or the block's value.
  def new_to_i16(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false, &block)
    new_gen_to_ Int16, UInt16, 32767, 32768
  end

  # Same as `#to_i` but returns an `UInt16`.
  #
  # Raises `ArgumentError` ("Invalid UInt16: ...") when parsing fails.
  def new_to_u16(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : UInt16
    new_to_u16(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { raise ArgumentError.new("Invalid UInt16: #{self}") }
  end

  # Same as `#to_i` but returns an `UInt16` or `nil`.
  def new_to_u16?(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : UInt16?
    new_to_u16(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { nil }
  end

  # Same as `#to_i` but returns an `UInt16` or the block's value.
  def new_to_u16(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false, &block)
    new_gen_to_ UInt16, UInt16
  end

  # Same as `#to_i`: returns an `Int32`.
  #
  # Raises `ArgumentError` ("Invalid Int32: ...") when parsing fails.
  def new_to_i32(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : Int32
    new_to_i32(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { raise ArgumentError.new("Invalid Int32: #{self}") }
  end

  # Same as `#to_i`, but returns an `Int32` or `nil`.
  def new_to_i32?(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : Int32?
    new_to_i32(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { nil }
  end

  # Same as `#to_i`, but returns an `Int32` or the block's value.
  def new_to_i32(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false, &block)
    new_gen_to_ Int32, UInt32, 2147483647, 2147483648
  end

  # Same as `#to_i` but returns an `UInt32`.
  #
  # Raises `ArgumentError` ("Invalid UInt32: ...") when parsing fails.
  def new_to_u32(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : UInt32
    new_to_u32(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { raise ArgumentError.new("Invalid UInt32: #{self}") }
  end

  # Same as `#to_i` but returns an `UInt32` or `nil`.
  def new_to_u32?(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : UInt32?
    new_to_u32(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { nil }
  end

  # Same as `#to_i` but returns an `UInt32` or the block's value.
  def new_to_u32(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false, &block)
    new_gen_to_ UInt32, UInt32
  end

  # Same as `#to_i` but returns an `Int64`.
  #
  # Raises `ArgumentError` ("Invalid Int64: ...") when parsing fails.
  def new_to_i64(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : Int64
    new_to_i64(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { raise ArgumentError.new("Invalid Int64: #{self}") }
  end

  # Same as `#to_i` but returns an `Int64` or `nil`.
  def new_to_i64?(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : Int64?
    new_to_i64(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { nil }
  end

  # Same as `#to_i` but returns an `Int64` or the block's value.
  def new_to_i64(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false, &block)
    new_gen_to_ Int64, UInt64, 9223372036854775807, 9223372036854775808
  end

  # Same as `#to_i` but returns an `UInt64`.
  #
  # Raises `ArgumentError` ("Invalid UInt64: ...") when parsing fails.
  def new_to_u64(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : UInt64
    new_to_u64(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { raise ArgumentError.new("Invalid UInt64: #{self}") }
  end

  # Same as `#to_i` but returns an `UInt64` or `nil`.
  def new_to_u64?(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : UInt64?
    new_to_u64(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { nil }
  end

  # Same as `#to_i` but returns an `UInt64` or the block's value.
  def new_to_u64(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false, &block)
    new_gen_to_ UInt64, UInt64
  end

  # Same as `#to_i` but returns an `Int128`.
  #
  # Raises `ArgumentError` ("Invalid Int128: ...") when parsing fails.
  def new_to_i128(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : Int128
    new_to_i128(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { raise ArgumentError.new("Invalid Int128: #{self}") }
  end

  # Same as `#to_i` but returns an `Int128` or `nil`.
  def new_to_i128?(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : Int128?
    new_to_i128(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { nil }
  end

  # Same as `#to_i` but returns an `Int128` or the block's value.
  #
  # The limits are spelled as expressions (`Int128::MAX` and
  # `UInt128.new(Int128::MAX) + 1`) rather than as integer literals, unlike
  # the narrower signed variants.
  def new_to_i128(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false, &block)
    new_gen_to_ Int128, UInt128, Int128::MAX, (UInt128.new(Int128::MAX) + 1)
  end

  # Same as `#to_i` but returns an `UInt128`.
  #
  # Raises `ArgumentError` ("Invalid UInt128: ...") when parsing fails.
  def new_to_u128(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : UInt128
    new_to_u128(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { raise ArgumentError.new("Invalid UInt128: #{self}") }
  end

  # Same as `#to_i` but returns an `UInt128` or `nil`.
  def new_to_u128?(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false) : UInt128?
    new_to_u128(base, whitespace, underscore, prefix, strict, leading_zero_is_octal) { nil }
  end

  # Same as `#to_i` but returns an `UInt128` or the block's value.
  def new_to_u128(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false, &block)
    new_gen_to_ UInt128, UInt128
  end

  # Generates the body shared by every block-taking `new_to_*` method.
  #
  # The expansion refers to the local names `base`, `whitespace`,
  # `underscore`, `prefix`, `strict` and `leading_zero_is_octal` and uses
  # `yield`, so it must be expanded inside a method that declares those
  # parameters and accepts a block.
  #
  # * *int_class*: the integer type the generated code returns
  # * *unsigned_int_class*: the type `to_unsigned_info` accumulates the
  #   magnitude in; when it equals *int_class* the target is treated as unsigned
  # * *max_positive*: largest magnitude accepted for non-negative input;
  #   when omitted no extra check is done beyond the parser's own overflow
  #   detection
  # * *max_negative*: largest magnitude accepted for negative input; when
  #   omitted every negative input yields to the block
  #
  # The block's value is returned whenever the parser reports the input as
  # invalid or the magnitude exceeds the applicable limit.
  private macro new_gen_to_(int_class, unsigned_int_class, max_positive = nil, max_negative = nil)
    {% unsigned = int_class == unsigned_int_class %}
    info = to_unsigned_info({{unsigned_int_class}}, base, whitespace, underscore, prefix, strict, leading_zero_is_octal, unsigned: {{unsigned}})

    return yield if info["invalid"]

    if info["negative"]
      {% if max_negative %}
        return yield if info["value"] > {{max_negative}}
        # Negate the magnitude (bitwise NOT, then a wrapping +1) and
        # reinterpret the resulting bits as the signed target type.
        (~info["value"] &+ 1).unsafe_as({{int_class}})
      {% else %}
        return yield
      {% end %}
    else
      {% if max_positive %}
        return yield if info["value"] > {{max_positive}}
      {% end %}
      {{int_class}}.new(info["value"])
    end
  end

  # Parses the magnitude of the number at the start of this string into
  # *int_class* and reports the outcome as a named tuple
  # `{value: ..., negative: ..., invalid: ...}`.
  #
  # * *int_class*: type used for the accumulated value; the callers in this
  #   class always pass an unsigned integer type, and the overflow checks
  #   below assume that
  # * *unsigned*: when `true`, a leading `-` makes the method return
  #   immediately with `invalid: true`
  # * the remaining parameters have the meaning documented on `#to_i`
  #
  # `invalid` is `true` when the value overflowed *int_class*, when no digit
  # was found, or when *strict* is set and characters remain after the number
  # (after skipping trailing whitespace if *whitespace* is set).
  #
  # Raises `ArgumentError` if *base* is neither in 2..36 nor equal to 62.
  private def to_unsigned_info(int_class, base, whitespace, underscore, prefix, strict, leading_zero_is_octal, unsigned)
    raise ArgumentError.new("Invalid base #{base}") unless 2 <= base <= 36 || base == 62

    ptr = to_unsafe

    # Skip leading whitespace
    # (presumably `calc_excess_left` is the count of leading whitespace
    # bytes; it is defined outside this excerpt, TODO confirm)
    if whitespace
      ptr += calc_excess_left
    end

    negative = false

    # Check + and -
    case ptr.value.unsafe_chr
    when '-'
      if unsigned
        return {value: int_class.new(0), negative: true, invalid: true}
      end
      negative = true
      ptr += 1
    when '+'
      ptr += 1
    else
      # no sign prefix
    end

    found_digit = false

    # Check leading zero
    if ptr.value.unsafe_chr == '0'
      ptr += 1

      if prefix
        case ptr.value.unsafe_chr
        when 'b'
          base = 2
          ptr += 1
        when 'x'
          base = 16
          ptr += 1
        when 'o'
          base = 8
          ptr += 1
        else
          # NOTE(review): with `prefix: true` and no recognised prefix letter,
          # the caller's *base* is replaced by 8 or 10 here; confirm that
          # discarding an explicit base is intended.
          if leading_zero_is_octal
            base = 8
          else
            base = 10
            found_digit = true
          end
        end
      elsif leading_zero_is_octal
        base = 8
      else
        # The consumed "0" counts as a digit of the number itself.
        found_digit = true
      end
    end

    value = int_class.new(0)
    # All bits set (the maximum of an unsigned type) divided by the base:
    # any value above this would overflow when multiplied by the base.
    mul_overflow = ~(int_class.new(0)) // base
    # Starts out `true` so that an underscore in first position stops the
    # loop instead of being skipped.
    last_is_underscore = true
    invalid = false

    digits = (base == 62 ? CHAR_TO_DIGIT62 : CHAR_TO_DIGIT).to_unsafe
    # Walk the bytes until a zero byte is reached (this relies on the buffer
    # being zero-terminated; presumably guaranteed by `String`, TODO confirm).
    while ptr.value != 0
      if underscore && ptr.value.unsafe_chr == '_'
        # Two underscores in a row end the number.
        break if last_is_underscore
        last_is_underscore = true
        ptr += 1
        next
      end

      last_is_underscore = false
      digit = digits[ptr.value]
      # -1 marks a byte that is not a digit; a digit >= base is not valid
      # in this base either.
      if digit == -1 || digit >= base
        break
      end

      if value > mul_overflow
        invalid = true
        break
      end

      value *= base

      # The addition wraps on overflow, so a result smaller than the
      # previous value means the digit did not fit.
      old = value
      value &+= digit
      if value < old
        invalid = true
        break
      end

      found_digit = true
      ptr += 1
    end

    if found_digit
      unless ptr.value == 0
        # (presumably `calc_excess_right` is the count of trailing whitespace
        # bytes; it is defined outside this excerpt, TODO confirm)
        if whitespace
          ptr += calc_excess_right
        end

        if strict && ptr.value != 0
          invalid = true
        end
      end
    else
      invalid = true
    end

    {value: value, negative: negative, invalid: invalid}
  end

end

require "benchmark"

# Benchmark driver comparing the stock parsers ("old") with the `new_*`
# variants defined in the `String` reopening above ("new").
#
# Every section fills `ints` with 10000 decimal strings produced from random
# values of the target type and, per iteration, parses all of them. The
# 128-bit sections only report "new".
#
# The signed sections draw their inputs with `rand(T::MAX)`, so (presumably)
# only non-negative numbers are generated and the negative-number path of the
# parsers is not exercised by this script.

# UInt128

puts "Benchmark: String#to_u128"

ints = [] of String
10000.times { ints << {rand(UInt64::MAX), rand(UInt64::MAX)}.unsafe_as(UInt128).to_s }

Benchmark.ips do |x|
  x.report("new") { ints.map { |i| i.new_to_u128 } }
end

puts "\n"

# UInt64

puts "Benchmark: String#to_u64"

ints = [] of String
10000.times { ints << rand(UInt64::MAX).to_s }

Benchmark.ips do |x|
  x.report("old") { ints.map { |i| i.to_u64 } }
  x.report("new") { ints.map { |i| i.new_to_u64 } }
end

puts "\n"

# UInt32

puts "Benchmark: String#to_u32"

ints = [] of String
10000.times { ints << rand(UInt32::MAX).to_s }

Benchmark.ips do |x|
  x.report("old") { ints.map { |i| i.to_u32 } }
  x.report("new") { ints.map { |i| i.new_to_u32 } }
end

puts "\n"

# UInt16

puts "Benchmark: String#to_u16"

ints = [] of String
10000.times { ints << rand(UInt16::MAX).to_s }

Benchmark.ips do |x|
  x.report("old") { ints.map { |i| i.to_u16 } }
  x.report("new") { ints.map { |i| i.new_to_u16 } }
end

puts "\n"

# UInt8

puts "Benchmark: String#to_u8"

ints = [] of String
10000.times { ints << rand(UInt8::MAX).to_s }

Benchmark.ips do |x|
  x.report("old") { ints.map { |i| i.to_u8 } }
  x.report("new") { ints.map { |i| i.new_to_u8 } }
end

puts "\n"

# Int128

puts "Benchmark: String#to_i128"

ints = [] of String
10000.times { ints << {rand(Int64::MAX), rand(Int64::MAX)}.unsafe_as(Int128).to_s }

Benchmark.ips do |x|
  x.report("new") { ints.map { |i| i.new_to_i128 } }
end

puts "\n"

# Int64

puts "Benchmark: String#to_i64"

ints = [] of String
10000.times { ints << rand(Int64::MAX).to_s }

Benchmark.ips do |x|
  x.report("old") { ints.map { |i| i.to_i64 } }
  x.report("new") { ints.map { |i| i.new_to_i64 } }
end

puts "\n"

# Int32
# Measures the signed parsers; the section is labelled `to_i32`, so calling
# the unsigned `to_u32` variants here would measure the wrong code path.

puts "Benchmark: String#to_i32"

ints = [] of String
10000.times { ints << rand(Int32::MAX).to_s }

Benchmark.ips do |x|
  x.report("old") { ints.map { |i| i.to_i32 } }
  x.report("new") { ints.map { |i| i.new_to_i32 } }
end

puts "\n"

# Int16
# Signed parsers, matching the `to_i16` label.

puts "Benchmark: String#to_i16"

ints = [] of String
10000.times { ints << rand(Int16::MAX).to_s }

Benchmark.ips do |x|
  x.report("old") { ints.map { |i| i.to_i16 } }
  x.report("new") { ints.map { |i| i.new_to_i16 } }
end

puts "\n"

# Int8
# Signed parsers, matching the `to_i8` label.

puts "Benchmark: String#to_i8"

ints = [] of String
10000.times { ints << rand(Int8::MAX).to_s }

Benchmark.ips do |x|
  x.report("old") { ints.map { |i| i.to_i8 } }
  x.report("new") { ints.map { |i| i.new_to_i8 } }
end

Benchmarks!

Benchmark: String#to_u128
new 839.54  (  1.19ms) (± 0.36%)  156kB/op  fastest

Benchmark: String#to_u64
old   2.23k (448.32µs) (± 0.92%)  78.1kB/op        fastest
new   2.23k (448.66µs) (± 1.26%)  78.1kB/op   1.00× slower

Benchmark: String#to_u32
old   3.58k (279.22µs) (± 0.80%)  39.1kB/op   1.50× slower
new   5.37k (186.16µs) (± 0.68%)  39.1kB/op        fastest

Benchmark: String#to_u16
old   5.46k (183.24µs) (± 0.57%)  19.5kB/op   1.28× slower
new   7.01k (142.75µs) (± 0.72%)  19.5kB/op        fastest

Benchmark: String#to_u8
old   6.75k (148.21µs) (± 0.80%)  9.8kB/op   1.29× slower
new   8.70k (114.89µs) (± 0.72%)  9.8kB/op        fastest

Benchmark: String#to_i128
new 830.74  (  1.20ms) (± 0.63%)  156kB/op  fastest

Benchmark: String#to_i64
old   2.32k (431.02µs) (± 0.68%)  78.1kB/op   1.01× slower
new   2.35k (424.74µs) (± 0.60%)  78.1kB/op        fastest

Benchmark: String#to_i32
old   3.44k (290.29µs) (± 0.92%)  39.1kB/op   1.47× slower
new   5.08k (196.94µs) (± 0.55%)  39.1kB/op        fastest

Benchmark: String#to_i16
old   5.42k (184.63µs) (± 0.91%)  19.5kB/op   1.23× slower
new   6.66k (150.12µs) (± 0.68%)  19.5kB/op        fastest

Benchmark: String#to_i8
old   7.47k (133.83µs) (± 0.55%)  9.8kB/op   1.36× slower
new  10.15k ( 98.51µs) (± 0.64%)  9.8kB/op        fastest

New specs were created for the 128-bit integer parsing (all ran fine).

Closes: #9516
Related: #8373


# Same as `#to_i` but returns an `Int128` or the block's value.
def to_i128(base : Int = 10, whitespace : Bool = true, underscore : Bool = false, prefix : Bool = false, strict : Bool = true, leading_zero_is_octal : Bool = false, &block)
gen_to_ Int128, UInt128, Int128::MAX, (UInt128.new(Int128::MAX) + 1)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Btw I couldn't use integer literals here because Crystal doesn't support 128-bit integer literals yet. Hopefully it will soon, then this can look nicer.

@BlobCodes
Copy link
Contributor Author

Note that some checks won't run successfully because of the bad 128bit support (complaining about missing __modti3, __divti3, __udivti3, __umodti3).

Comment on lines -503 to -506
record ToU64Info,
value : UInt64,
negative : Bool,
invalid : Bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can parameterize over the integer type directly:

record ToUnsignedInfo(T), value : T, # ...

There is no need to switch to a NamedTuple.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have improved this in #11093

@HertzDevil
Copy link
Contributor

HertzDevil commented Aug 14, 2021

The performance optimizations are independent from the added 128-bit methods. The former should belong in a separate PR since they should be merged even if we never get full 128-bit integer support.

@BlobCodes
Copy link
Contributor Author

BlobCodes commented Aug 20, 2021

Replaced by #11111 and #11093

@BlobCodes BlobCodes closed this Aug 20, 2021
@BlobCodes BlobCodes deleted the int-parsing-refactor branch January 29, 2022 22:38
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

can't convert string to [u|i]128: "string".to_[u|i]128

2 participants