Skip to content

Commit d39ef3a

Browse files
committed
Add random-sized benchmarking methods
1 parent c34c627 commit d39ef3a

File tree

1 file changed

+161
-16
lines changed

1 file changed

+161
-16
lines changed

benchmark/benchmark.py

+161-16
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import hashlib
44
import itertools
5+
import math
6+
import random
57
import time
68
from collections.abc import Callable
79
from typing import Final
@@ -15,19 +17,6 @@
1517
K2: Final[int] = 0b1100001010110010101011100011110100100111110101001110101101001111
1618
MASK: Final[int] = 0xFFFFFFFFFFFFFFFF
1719

18-
HASHES = {
19-
"mmh3_32": mmh3.mmh3_32_digest,
20-
"mmh3_128": mmh3.mmh3_x64_128_digest,
21-
"xxh_32": xxhash.xxh32_digest,
22-
"xxh_64": xxhash.xxh64_digest,
23-
"xxh3_64": xxhash.xxh3_64_digest,
24-
"xxh3_128": xxhash.xxh3_128_digest,
25-
"md5": lambda ba: hashlib.md5(ba).digest(),
26-
"sha1": lambda ba: hashlib.sha1(ba).digest(),
27-
"pymmh3_32": pymmh3.hash,
28-
"pymmh3_128": pymmh3.hash128,
29-
}
30-
3120

3221
def init_buffer(ba: bytearray) -> bytearray:
3322
"""Initializes a byte array with a pattern.
@@ -50,8 +39,24 @@ def init_buffer(ba: bytearray) -> bytearray:
5039
return ba
5140

5241

42+
def generate_size(size: int, p: float) -> int:
    """Generate a random size for a buffer.

    Args:
        size: The nominal size of the buffer to hash.
        p: The fraction (not percentage) by which the buffer size may
            vary, e.g. ``0.1`` permits sizes within +/-10% of ``size``.

    Returns:
        A size drawn uniformly from
        ``[ceil(size * (1 - p)), floor(size * (1 + p))]`` (inclusive).

    Raises:
        ValueError: If ``size`` is not positive, or ``p`` is outside
            ``[0, 1)``.
    """
    # Validate inputs, consistent with the size checks in the perf_*
    # functions; p >= 1 would allow zero or negative lower bounds.
    if size <= 0:
        raise ValueError("size must be greater than 0")
    if not 0.0 <= p < 1.0:
        raise ValueError("p must be in the range [0, 1)")

    lower = math.ceil(size * (1 - p))
    upper = math.floor(size * (1 + p))

    # lower <= size <= upper always holds here, so randint cannot raise.
    return random.randint(lower, upper)
56+
57+
5358
def perf_hash(loops: int, f: Callable, size: int) -> float:
54-
"""Benchmark the mmh3 hash function.
59+
"""Benchmark a hash function.
5560
5661
Args:
5762
loops: The number of outer loops to run.
@@ -63,6 +68,9 @@ def perf_hash(loops: int, f: Callable, size: int) -> float:
6368
"""
6469
# pylint: disable=too-many-locals
6570

71+
if size <= 0:
72+
raise ValueError("size must be greater than 0")
73+
6674
range_it = itertools.repeat(None, loops)
6775

6876
data = bytearray(size + 9)
@@ -77,7 +85,61 @@ def perf_hash(loops: int, f: Callable, size: int) -> float:
7785
data6 = bytes(data[6 : size + 6])
7886
data7 = bytes(data[7 : size + 7])
7987
data8 = bytes(data[8 : size + 8])
80-
data9 = bytes(data[8 : size + 9])
88+
data9 = bytes(data[9 : size + 9])
89+
90+
t0 = time.perf_counter()
91+
for _ in range_it:
92+
f(data0)
93+
f(data1)
94+
f(data2)
95+
f(data3)
96+
f(data4)
97+
f(data5)
98+
f(data6)
99+
f(data7)
100+
f(data8)
101+
f(data9)
102+
103+
return time.perf_counter() - t0
104+
105+
106+
def perf_hash_random(loops: int, f: Callable, size: int) -> float:
107+
"""Benchmark a hash function with varying data sizes.
108+
109+
Args:
110+
loops: The number of outer loops to run.
111+
f: The hash function to benchmark
112+
size: The size of the buffer to hash.
113+
114+
Returns:
115+
The time taken to hash the buffer in fractional seconds.
116+
"""
117+
# pylint: disable=too-many-locals
118+
119+
if size <= 0:
120+
raise ValueError("size must be greater than 0")
121+
122+
range_it = itertools.repeat(None, loops)
123+
random.seed(42)
124+
inner_loops = 10
125+
extra_size = 255
126+
127+
data = bytearray(size + extra_size)
128+
data = init_buffer(data)
129+
130+
pos_list = [random.randint(0, extra_size) for _ in range(inner_loops)]
131+
size_list = [generate_size(size, 0.1) for _ in range(inner_loops)]
132+
133+
data0 = bytes(data[pos_list[0] : pos_list[0] + size_list[0]])
134+
data1 = bytes(data[pos_list[1] : pos_list[1] + size_list[1]])
135+
data2 = bytes(data[pos_list[2] : pos_list[2] + size_list[2]])
136+
data3 = bytes(data[pos_list[3] : pos_list[3] + size_list[3]])
137+
data4 = bytes(data[pos_list[4] : pos_list[4] + size_list[4]])
138+
data5 = bytes(data[pos_list[5] : pos_list[5] + size_list[5]])
139+
data6 = bytes(data[pos_list[6] : pos_list[6] + size_list[6]])
140+
data7 = bytes(data[pos_list[7] : pos_list[7] + size_list[7]])
141+
data8 = bytes(data[pos_list[8] : pos_list[8] + size_list[8]])
142+
data9 = bytes(data[pos_list[9] : pos_list[9] + size_list[9]])
81143

82144
t0 = time.perf_counter()
83145
for _ in range_it:
@@ -95,6 +157,60 @@ def perf_hash(loops: int, f: Callable, size: int) -> float:
95157
return time.perf_counter() - t0
96158

97159

160+
def perf_hash_latency(loops: int, f: Callable, size: int) -> float:
    """Benchmark a hash function with overhead costs with varying data sizes.

    Based on xxHash's ``benchLatency`` function.
    https://github.com/Cyan4973/xxHash/blob/dev/tests/bench/benchHash.c

    A byte of each digest selects the offset of the next input slice, so
    successive calls form a serial dependency chain.

    Args:
        loops: The number of outer loops to run.
        f: The hash function to benchmark
        size: The size of the buffer to hash.

    Returns:
        The time taken to hash the buffer in fractional seconds.

    Raises:
        ValueError: If ``size`` is not positive.
    """
    # pylint: disable=too-many-locals

    if size <= 0:
        raise ValueError("size must be greater than 0")

    outer_it = itertools.repeat(None, loops)
    random.seed(42)

    n = 0

    # Draw all ten slice lengths (each within +/-10% of size) before the
    # timed section so no random-number work happens inside the loop.
    s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 = (
        generate_size(size, 0.1) for _ in range(10)
    )

    # Buffer is sized so the largest slice fits at any offset a digest
    # byte (0-255) can select; the memoryview keeps slicing copy-free.
    # NOTE(review): assumes f returns an indexable digest (e.g. bytes) —
    # confirm every HASHES entry satisfies this.
    buf = bytearray(math.floor(size * 1.1) + 255)
    view_to_hash = memoryview(bytes(init_buffer(buf)))

    t0 = time.perf_counter()
    for _ in outer_it:
        n = f(view_to_hash[n : n + s0])[0]
        n = f(view_to_hash[n : n + s1])[0]
        n = f(view_to_hash[n : n + s2])[0]
        n = f(view_to_hash[n : n + s3])[0]
        n = f(view_to_hash[n : n + s4])[0]
        n = f(view_to_hash[n : n + s5])[0]
        n = f(view_to_hash[n : n + s6])[0]
        n = f(view_to_hash[n : n + s7])[0]
        n = f(view_to_hash[n : n + s8])[0]
        n = f(view_to_hash[n : n + s9])[0]

    return time.perf_counter() - t0
212+
213+
98214
def add_cmdline_args(cmd: list, args) -> None:
99215
"""Add command line arguments to the runner.
100216
@@ -103,9 +219,30 @@ def add_cmdline_args(cmd: list, args) -> None:
103219
args: The parsed command line arguments.
104220
"""
105221
cmd.extend(("--test-hash", args.test_hash))
222+
cmd.extend(("--test-type", args.test_type))
106223
cmd.extend(("--test-buffer-size-max", str(args.test_buffer_size_max)))
107224

108225

226+
# Candidate hash functions, keyed by the --test-hash command-line name.
# Each callable accepts a single bytes-like argument.
HASHES = {
    "mmh3_32": mmh3.mmh3_32_digest,
    "mmh3_128": mmh3.mmh3_x64_128_digest,
    "xxh_32": xxhash.xxh32_digest,
    "xxh_64": xxhash.xxh64_digest,
    "xxh3_64": xxhash.xxh3_64_digest,
    "xxh3_128": xxhash.xxh3_128_digest,
    # hashlib objects need an explicit .digest() call, hence the lambdas.
    "md5": lambda ba: hashlib.md5(ba).digest(),
    "sha1": lambda ba: hashlib.sha1(ba).digest(),
    # Pure-Python reference implementations.
    # NOTE(review): these appear to return ints rather than bytes —
    # confirm against perf_hash_latency, which indexes the result.
    "pymmh3_32": pymmh3.hash,
    "pymmh3_128": pymmh3.hash128,
}
238+
239+
# Benchmarking strategies, keyed by the --test-type command-line name.
BENCHMARKING_TYPES = {
    "naive": perf_hash,  # fixed-size buffers
    "random": perf_hash_random,  # randomized sizes and offsets
    "latency": perf_hash_latency,  # serially dependent calls
}
244+
245+
109246
if __name__ == "__main__":
110247
runner = pyperf.Runner(add_cmdline_args=add_cmdline_args)
111248

@@ -117,6 +254,14 @@ def add_cmdline_args(cmd: list, args) -> None:
117254
choices=HASHES.keys(),
118255
)
119256

257+
runner.argparser.add_argument(
258+
"--test-type",
259+
type=str,
260+
help="Type of benchmarking to perform",
261+
choices=BENCHMARKING_TYPES.keys(),
262+
default="random",
263+
)
264+
120265
runner.argparser.add_argument(
121266
"--test-buffer-size-max",
122267
type=int,
@@ -130,7 +275,7 @@ def add_cmdline_args(cmd: list, args) -> None:
130275
while fib1 <= process_args.test_buffer_size_max:
131276
runner.bench_time_func(
132277
f"{fib1} bytes",
133-
perf_hash,
278+
BENCHMARKING_TYPES[process_args.test_type],
134279
HASHES[process_args.test_hash],
135280
fib1,
136281
inner_loops=10,

0 commit comments

Comments
 (0)