stint/private/uint_mul.nim

# Stint
# Copyright 2018 Status Research & Development GmbH
# Licensed under either of
#
#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
#
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import  macros,
        ./conversion,
        ./initialization,
        ./datatypes,
        ./uint_comparison,
        ./uint_addsub

# ################### Multiplication ################### #

func lo(x: uint64): uint64 {.inline.} =
  const
    p: uint64 = 32
    base: uint64 = 1 shl p
    mask: uint64 = base - 1
  result = x and mask

func hi(x: uint64): uint64 {.inline.} =
  const
    p = 32
  result = x shr p

# No generic, somehow Nim is given ambiguous call with the T: UintImpl overload
func extPrecMul*(result: var UintImpl[uint8], x, y: uint8) {.inline.}=
  ## Extended precision multiplication
  result = cast[type result](x.asDoubleUint * y.asDoubleUint)

func extPrecMul*(result: var UintImpl[uint16], x, y: uint16) {.inline.}=
  ## Extended precision multiplication
  result = cast[type result](x.asDoubleUint * y.asDoubleUint)

func extPrecMul*(result: var UintImpl[uint32], x, y: uint32) {.inline.}=
  ## Extended precision multiplication
  result = cast[type result](x.asDoubleUint * y.asDoubleUint)

func extPrecAddMul[T: uint8 or uint16 or uint32](result: var UintImpl[T], x, y: T) {.inline.}=
  ## Extended precision fused in-place addition & multiplication
  result += cast[type result](x.asDoubleUint * y.asDoubleUint)

template extPrecMulImpl(result: var UintImpl[uint64], op: untyped, u, v: uint64) =
  const
    p = 64 div 2
    base: uint64 = 1 shl p

  var
    x0, x1, x2, x3: uint64

  let
    ul = lo(u)
    uh = hi(u)
    vl = lo(v)
    vh = hi(v)

  x0 = ul * vl
  x1 = ul * vh
  x2 = uh * vl
  x3 = uh * vh

  x1 += hi(x0)          # This can't carry
  x1 += x2              # but this can
  if x1 < x2:           # if carry, add it to x3
    x3 += base

  op(result.hi, x3 + hi(x1))
  op(result.lo, (x1 shl p) or lo(x0))

func extPrecMul*(result: var UintImpl[uint64], u, v: uint64) =
  ## Extended precision multiplication
  extPrecMulImpl(result, `=`, u, v)

func extPrecAddMul(result: var UintImpl[uint64], u, v: uint64) =
  ## Extended precision fused in-place addition & multiplication
  extPrecMulImpl(result, `+=`, u, v)

macro eqSym(x, y: untyped): untyped =
  let eq = $x == $y # Unfortunately eqIdent compares to string.
  result = quote do: `eq`

func extPrecAddMul[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T])
func extPrecMul*[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T])
  # Forward declaration

template extPrecMulImpl*[T](result: var UintImpl[UintImpl[T]], op: untyped, x, y: UintImpl[T]) =
  # See details at
  # https://en.wikipedia.org/wiki/Karatsuba_algorithm
  # https://locklessinc.com/articles/256bit_arithmetic/
  # https://www.miracl.com/press/missing-a-trick-karatsuba-variations-michael-scott
  #
  # We use the naive school grade multiplication instead of Karatsuba I.e.
  # z1 = x.hi * y.lo + x.lo * y.hi (Naive) = (x.lo - x.hi)(y.hi - y.lo) + z0 + z2 (Karatsuba)
  #
  # On modern architecture:
  #   - addition and multiplication have the same cost
  #   - Karatsuba would require to deal with potentially negative intermediate result
  #     and introduce branching
  #   - More total operations means more register moves

  var z1: type x

  # Low part and hi part - z0 & z2
  when eqSym(op, `+=`):
    extPrecAddMul(result.lo, x.lo, y.lo)
    extPrecAddMul(result.hi, x.hi, y.hi)
  else:
    extPrecMul(result.lo, x.lo, y.lo)
    extPrecMul(result.hi, x.hi, y.hi)

  ## TODO - fuse those parts and reduce the number of carry checks
  # Middle part - z1 - 1st mul
  extPrecMul(z1, x.hi, y.lo)
  result.lo.hi += z1.lo
  if result.lo.hi < z1.lo:
    inc result.hi

  result.hi.lo += z1.hi
  if result.hi.lo < z1.hi:
    inc result.hi.hi

  # Middle part - z1 - 2nd mul
  extPrecMul(z1, x.lo, y.hi)
  result.lo.hi += z1.lo
  if result.lo.hi < z1.lo:
    inc result.hi

  result.hi.lo += z1.hi
  if result.hi.lo < z1.hi:
    inc result.hi.hi

func extPrecAddMul[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T]) =
  ## Extended precision fused in-place addition & multiplication
  extPrecMulImpl(result, `+=`, u, v)

func extPrecMul*[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T]) =
  ## Extended precision multiplication
  extPrecMulImpl(result, `=`, u, v)

func `*`*[T](x, y: UintImpl[T]): UintImpl[T] {.inline.}=
  ## Multiplication for multi-precision unsigned uint
  #
  # For our representation, it is similar to school grade multiplication
  # Consider hi and lo as if they were digits
  #
  #     12
  # X   15
  # ------
  #     10   lo*lo -> z0
  #     5    hi*lo -> z1
  #     2    lo*hi -> z1
  #    10    hi*hi -- z2
  # ------
  #    180
  #
  # If T is a type
  # For T * T --> T we don't need to compute z2 as it always overflow
  # For T * T --> 2T (uint64 * uint64 --> uint128) we use extra precision multiplication

  extPrecMul(result, x.lo, y.lo)
  result.hi += x.lo * y.hi + x.hi * y.lo