Skip to content

Commit d39ef3a

Browse files
committed
Add random-sized benchmarking methods
1 parent c34c627 commit d39ef3a

File tree

1 file changed

+161
-16
lines changed

1 file changed

+161
-16
lines changed

benchmark/benchmark.py

+161-16
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import hashlib
44
import itertools
5+
import math
6+
import random
57
import time
68
from collections.abc import Callable
79
from typing import Final
@@ -15,19 +17,6 @@
1517
K2: Final[int] = 0b1100001010110010101011100011110100100111110101001110101101001111
1618
MASK: Final[int] = 0xFFFFFFFFFFFFFFFF
1719

18-
HASHES = {
19-
"mmh3_32": mmh3.mmh3_32_digest,
20-
"mmh3_128": mmh3.mmh3_x64_128_digest,
21-
"xxh_32": xxhash.xxh32_digest,
22-
"xxh_64": xxhash.xxh64_digest,
23-
"xxh3_64": xxhash.xxh3_64_digest,
24-
"xxh3_128": xxhash.xxh3_128_digest,
25-
"md5": lambda ba: hashlib.md5(ba).digest(),
26-
"sha1": lambda ba: hashlib.sha1(ba).digest(),
27-
"pymmh3_32": pymmh3.hash,
28-
"pymmh3_128": pymmh3.hash128,
29-
}
30-
3120

3221
def init_buffer(ba: bytearray) -> bytearray:
3322
"""Initializes a byte array with a pattern.
@@ -50,8 +39,24 @@ def init_buffer(ba: bytearray) -> bytearray:
5039
return ba
5140

5241

42+
def generate_size(size: int, p: float) -> int:
    """Generate a random size for a buffer.

    Args:
        size: The nominal size of the buffer to hash.
        p: The fraction (not percentage) by which the buffer size may
            vary, e.g. ``0.1`` permits sizes within +/-10% of ``size``.

    Returns:
        A size drawn uniformly from
        ``[ceil(size * (1 - p)), floor(size * (1 + p))]`` (inclusive).

    Raises:
        ValueError: If ``size`` is not positive, or ``p`` is outside
            ``[0, 1)``.
    """
    # Validate inputs, consistent with the size checks in the perf_*
    # functions; p >= 1 would allow zero or negative lower bounds.
    if size <= 0:
        raise ValueError("size must be greater than 0")
    if not 0.0 <= p < 1.0:
        raise ValueError("p must be in the range [0, 1)")

    lower = math.ceil(size * (1 - p))
    upper = math.floor(size * (1 + p))

    # lower <= size <= upper always holds here, so randint cannot raise.
    return random.randint(lower, upper)
56+
57+
5358
def perf_hash(loops: int, f: Callable, size: int) -> float:
54-
"""Benchmark the mmh3 hash function.
59+
"""Benchmark a hash function.
5560
5661
Args:
5762
loops: The number of outer loops to run.
@@ -63,6 +68,9 @@ def perf_hash(loops: int, f: Callable, size: int) -> float:
6368
"""
6469
# pylint: disable=too-many-locals
6570

71+
if size <= 0:
72+
raise ValueError("size must be greater than 0")
73+
6674
range_it = itertools.repeat(None, loops)
6775

6876
data = bytearray(size + 9)
@@ -77,7 +85,61 @@ def perf_hash(loops: int, f: Callable, size: int) -> float:
7785
data6 = bytes(data[6 : size + 6])
7886
data7 = bytes(data[7 : size + 7])
7987
data8 = bytes(data[8 : size + 8])
80-
data9 = bytes(data[8 : size + 9])
88+
data9 = bytes(data[9 : size + 9])
89+
90+
t0 = time.perf_counter()
91+
for _ in range_it:
92+
f(data0)
93+
f(data1)
94+
f(data2)
95+
f(data3)
96+
f(data4)
97+
f(data5)
98+
f(data6)
99+
f(data7)
100+
f(data8)
101+
f(data9)
102+
103+
return time.perf_counter() - t0
104+
105+
106+
def perf_hash_random(loops: int, f: Callable, size: int) -> float:
107+
"""Benchmark a hash function with varying data sizes.
108+
109+
Args:
110+
loops: The number of outer loops to run.
111+
f: The hash function to benchmark
112+
size: The size of the buffer to hash.
113+
114+
Returns:
115+
The time taken to hash the buffer in fractional seconds.
116+
"""
117+
# pylint: disable=too-many-locals
118+
119+
if size <= 0:
120+
raise ValueError("size must be greater than 0")
121+
122+
range_it = itertools.repeat(None, loops)
123+
random.seed(42)
124+
inner_loops = 10
125+
extra_size = 255
126+
127+
data = bytearray(size + extra_size)
128+
data = init_buffer(data)
129+
130+
pos_list = [random.randint(0, extra_size) for _ in range(inner_loops)]
131+
size_list = [generate_size(size, 0.1) for _ in range(inner_loops)]
132+
133+
data0 = bytes(data[pos_list[0] : pos_list[0] + size_list[0]])
134+
data1 = bytes(data[pos_list[1] : pos_list[1] + size_list[1]])
135+
data2 = bytes(data[pos_list[2] : pos_list[2] + size_list[2]])
136+
data3 = bytes(data[pos_list[3] : pos_list[3] + size_list[3]])
137+
data4 = bytes(data[pos_list[4] : pos_list[4] + size_list[4]])
138+
data5 = bytes(data[pos_list[5] : pos_list[5] + size_list[5]])
139+
data6 = bytes(data[pos_list[6] : pos_list[6] + size_list[6]])
140+
data7 = bytes(data[pos_list[7] : pos_list[7] + size_list[7]])
141+
data8 = bytes(data[pos_list[8] : pos_list[8] + size_list[8]])
142+
data9 = bytes(data[pos_list[9] : pos_list[9] + size_list[9]])
81143

82144
t0 = time.perf_counter()
83145
for _ in range_it:
@@ -95,6 +157,60 @@ def perf_hash(loops: int, f: Callable, size: int) -> float:
95157
return time.perf_counter() - t0
96158

97159

160+
def perf_hash_latency(loops: int, f: Callable, size: int) -> float:
    """Benchmark a hash function with overhead costs with varying data sizes.

    Based on xxHash's ``benchLatency`` function.
    https://github.com/Cyan4973/xxHash/blob/dev/tests/bench/benchHash.c

    A byte of each digest selects the offset of the next input slice, so
    successive calls form a serial dependency chain.

    Args:
        loops: The number of outer loops to run.
        f: The hash function to benchmark
        size: The size of the buffer to hash.

    Returns:
        The time taken to hash the buffer in fractional seconds.

    Raises:
        ValueError: If ``size`` is not positive.
    """
    # pylint: disable=too-many-locals

    if size <= 0:
        raise ValueError("size must be greater than 0")

    outer_it = itertools.repeat(None, loops)
    random.seed(42)

    n = 0

    # Draw all ten slice lengths (each within +/-10% of size) before the
    # timed section so no random-number work happens inside the loop.
    s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 = (
        generate_size(size, 0.1) for _ in range(10)
    )

    # Buffer is sized so the largest slice fits at any offset a digest
    # byte (0-255) can select; the memoryview keeps slicing copy-free.
    # NOTE(review): assumes f returns an indexable digest (e.g. bytes) —
    # confirm every HASHES entry satisfies this.
    buf = bytearray(math.floor(size * 1.1) + 255)
    view_to_hash = memoryview(bytes(init_buffer(buf)))

    t0 = time.perf_counter()
    for _ in outer_it:
        n = f(view_to_hash[n : n + s0])[0]
        n = f(view_to_hash[n : n + s1])[0]
        n = f(view_to_hash[n : n + s2])[0]
        n = f(view_to_hash[n : n + s3])[0]
        n = f(view_to_hash[n : n + s4])[0]
        n = f(view_to_hash[n : n + s5])[0]
        n = f(view_to_hash[n : n + s6])[0]
        n = f(view_to_hash[n : n + s7])[0]
        n = f(view_to_hash[n : n + s8])[0]
        n = f(view_to_hash[n : n + s9])[0]

    return time.perf_counter() - t0
212+
213+
98214
def add_cmdline_args(cmd: list, args) -> None:
99215
"""Add command line arguments to the runner.
100216
@@ -103,9 +219,30 @@ def add_cmdline_args(cmd: list, args) -> None:
103219
args: The parsed command line arguments.
104220
"""
105221
cmd.extend(("--test-hash", args.test_hash))
222+
cmd.extend(("--test-type", args.test_type))
106223
cmd.extend(("--test-buffer-size-max", str(args.test_buffer_size_max)))
107224

108225

226+
# Candidate hash functions, keyed by the --test-hash command-line name.
# Each callable accepts a single bytes-like argument.
HASHES = {
    "mmh3_32": mmh3.mmh3_32_digest,
    "mmh3_128": mmh3.mmh3_x64_128_digest,
    "xxh_32": xxhash.xxh32_digest,
    "xxh_64": xxhash.xxh64_digest,
    "xxh3_64": xxhash.xxh3_64_digest,
    "xxh3_128": xxhash.xxh3_128_digest,
    # hashlib objects need an explicit .digest() call, hence the lambdas.
    "md5": lambda ba: hashlib.md5(ba).digest(),
    "sha1": lambda ba: hashlib.sha1(ba).digest(),
    # Pure-Python reference implementations.
    # NOTE(review): these appear to return ints rather than bytes —
    # confirm against perf_hash_latency, which indexes the result.
    "pymmh3_32": pymmh3.hash,
    "pymmh3_128": pymmh3.hash128,
}
238+
239+
# Benchmarking strategies, keyed by the --test-type command-line name.
BENCHMARKING_TYPES = {
    "naive": perf_hash,  # fixed-size buffers
    "random": perf_hash_random,  # randomized sizes and offsets
    "latency": perf_hash_latency,  # serially dependent calls
}
244+
245+
109246
if __name__ == "__main__":
110247
runner = pyperf.Runner(add_cmdline_args=add_cmdline_args)
111248

@@ -117,6 +254,14 @@ def add_cmdline_args(cmd: list, args) -> None:
117254
choices=HASHES.keys(),
118255
)
119256

257+
runner.argparser.add_argument(
258+
"--test-type",
259+
type=str,
260+
help="Type of benchmarking to perform",
261+
choices=BENCHMARKING_TYPES.keys(),
262+
default="random",
263+
)
264+
120265
runner.argparser.add_argument(
121266
"--test-buffer-size-max",
122267
type=int,
@@ -130,7 +275,7 @@ def add_cmdline_args(cmd: list, args) -> None:
130275
while fib1 <= process_args.test_buffer_size_max:
131276
runner.bench_time_func(
132277
f"{fib1} bytes",
133-
perf_hash,
278+
BENCHMARKING_TYPES[process_args.test_type],
134279
HASHES[process_args.test_hash],
135280
fib1,
136281
inner_loops=10,

0 commit comments

Comments
 (0)