Skip to content

Commit 8a74b7c

Browse files
author
Christian Convey
committed
[hexagon] 'add_hvx' test to explore HVX usage.
Add a unit test named 'add_hvx' to explore how various scheduling choices, tensor sizes, etc. impact efficient usage of Hexagon HVX units.
1 parent f745f06 commit 8a74b7c

File tree

1 file changed

+283
-0
lines changed

1 file changed

+283
-0
lines changed
Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
import os
19+
import os.path
20+
import pathlib
21+
import sys
22+
import pytest
23+
import numpy as np
24+
import logging
25+
import tempfile
26+
import csv
27+
28+
import tvm.testing
29+
from tvm import te
30+
from tvm import relay
31+
from tvm.relay.backend import Executor, Runtime
32+
from tvm.contrib import utils, ndk
33+
from tvm.contrib.hexagon.build import HexagonLauncher
34+
import tvm.contrib.hexagon as hexagon
35+
36+
from .conftest import requires_hexagon_toolchain
37+
38+
# Base TCP port for the RPC server; individual tests offset from this value
# (see the NOTE on server ports below).
RPC_SERVER_PORT = 7070

# This is a fixed detail of the v68 architecture.
HVX_VECTOR_BYTES = 128

# NOTE on server ports:
# These tests use different port numbers for the RPC server (7070 + ...).
# The reason is that an RPC session cannot be gracefully closed without
# triggering TIME_WAIT state on the server socket. This prevents another
# server from binding to the same port until the wait time elapses.
49+
50+
@requires_hexagon_toolchain
def test_elemwise_add(android_serial_number, hexagon_launcher):
    """
    Starting with an elementwise-add computation, try various schedules /
    optimizations to see the impact they have on performance.

    The main motivation for this test is to explore the relationship between
    these schedules / optimizations vs. how effectively the primfunc uses the
    Hexagon's HVX units.

    NOTE(review): `android_serial_number` appears unused here; presumably it is
    required only so pytest sets up the device fixture — confirm before removing.
    """
    # Deliberately NOT a TemporaryDirectory: the reports and binaries are kept
    # for post-run human inspection.
    host_output_dir = tempfile.mkdtemp()

    print("-" * 80)
    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
    print("-" * 80)
    print()

    # TODO: We should move this into a separate test fixture, to make it easier to write
    # additional benchmarking functions.  We'd just need to generalize the assumptions regarding
    # the particular fields being tracked as independent variables.
    class benchmark_results_collection:
        """Accumulates one row per benchmark configuration and renders them as TSV."""

        def __init__(self):
            self.row_dicts_ = []

        def num_failures(self):
            """Return how many recorded configurations ended with status FAIL."""
            num = 0
            for d in self.row_dicts_:
                if d["status"] == "FAIL":
                    num += 1
            return num

        def record_success(
            self, dtype, sched_type, mem_scope, num_vecs_per_tensor, benchmark_result
        ):
            """Record one successful run; times are converted from seconds to µsec."""
            median_usec = benchmark_result.median * 1000000
            min_usec = benchmark_result.min * 1000000
            max_usec = benchmark_result.max * 1000000

            self.row_dicts_.append(
                {
                    "dtype": dtype,
                    "sched_type": sched_type,
                    "mem_scope": mem_scope,
                    "num_vecs_per_tensor": num_vecs_per_tensor,
                    "status": "OK",
                    "median(µsec)": f"{median_usec:.3}",
                    "min(µsec)": f"{min_usec:.3}",
                    "max(µsec)": f"{max_usec:.3}",
                }
            )

        def record_failure(self, dtype, sched_type, mem_scope, num_vecs_per_tensor, error_text):
            """Record one failed run; `error_text` goes in the 'comment' column."""
            self.row_dicts_.append(
                {
                    "dtype": dtype,
                    "sched_type": sched_type,
                    "mem_scope": mem_scope,
                    "num_vecs_per_tensor": num_vecs_per_tensor,
                    "status": "FAIL",
                    "comment": error_text,
                }
            )

        def dump(self, f):
            """Write all recorded rows to file-like object `f` as tab-separated values."""
            csv.register_dialect(
                "benchmarks",
                delimiter="\t",
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL,
            )

            fieldnames = [
                "dtype",
                "sched_type",
                "mem_scope",
                "num_vecs_per_tensor",
                "status",
                "median(µsec)",
                "min(µsec)",
                "max(µsec)",
                "comment",
            ]

            # restval="" leaves columns blank that a given row doesn't define
            # (e.g. timing columns for FAIL rows, 'comment' for OK rows).
            writer = csv.DictWriter(f, fieldnames, dialect="benchmarks", restval="")

            writer.writeheader()
            for d in self.row_dicts_:
                writer.writerow(d)

    br = benchmark_results_collection()

    # Create and benchmark a single primfunc.
    # If an unexpected problem occurs, raise an exception.  Otherwise add a row of output to 'br'.
    def test_one_config(dtype, sched_type, mem_scope, num_vectors_per_tensor):
        version_name = f"dtype:{dtype}-schedtype:{sched_type}-memscope:{mem_scope}-numvecs:{num_vectors_per_tensor}"
        print(f"CONFIGURATION: {version_name}")

        dtype_bits = tvm._ffi.runtime_ctypes.DataType(dtype).bits
        assert dtype_bits % 8 == 0
        dtype_bytes = dtype_bits // 8

        elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes

        # Note!  We're providing the complete input tensor shapes now,
        # whereas the original code only reveals the exact shape when
        # about to call the kernel.
        shape = [
            num_vectors_per_tensor,
            elem_per_hvx_vector,
        ]

        A = tvm.te.placeholder(shape, dtype=dtype)
        B = tvm.te.placeholder(shape, dtype=dtype)
        C = tvm.te.compute(A.shape, lambda i, j: A[i, j] + B[i, j], name="C")

        sched = tvm.te.create_schedule(C.op)

        # sched_type 1 = default (unmodified) schedule; 2 = vectorize the
        # innermost (per-HVX-vector) axis.
        if sched_type == 1:
            pass
        elif sched_type == 2:
            sched[C].vectorize(C.op.axis[1])
        else:
            raise Exception("Unknown schedule type")

        # This module is only created so humans can inspect its IR.
        module_for_ir_dump = tvm.lower(sched, [A, B, C], "foo")

        report_path = os.path.join(host_output_dir, f"{version_name}.txt")

        with open(report_path, "w") as f:
            f.write("LOWERED IR MODULE:\n")
            f.write(str(module_for_ir_dump))
            f.write("\n")

        target_hexagon = tvm.target.hexagon("v68", link_params=True)
        func = tvm.build(
            sched,
            [A, B, C],
            tvm.target.Target(target_hexagon, host=target_hexagon),
            name="elemwise_add",
        )

        host_dso_binary_path = os.path.join(host_output_dir, f"test_binary-{version_name}.so")
        target_dso_binary_filename = "test_binary.so"

        func.save(str(host_dso_binary_path))
        print("SAVED BINARY TO HOST PATH: {}".format(str(host_dso_binary_path)))

        hexagon_launcher.upload(host_dso_binary_path, target_dso_binary_filename)
        hexagon_launcher.start_server()

        try:
            with hexagon_launcher.start_session() as sess:
                mod = hexagon_launcher.load_module(target_dso_binary_filename, sess)

                host_numpy_A_data = np.ndarray(shape, dtype=dtype)
                host_numpy_B_data = np.ndarray(shape, dtype=dtype)

                for i in range(shape[0]):
                    for j in range(shape[1]):
                        host_numpy_A_data[i, j] = i + j
                        host_numpy_B_data[i, j] = (i + 1) * (j + 1)

                host_numpy_C_data_expected = host_numpy_A_data + host_numpy_B_data
                host_numpy_C_data = np.zeros_like(host_numpy_C_data_expected)

                A_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope)
                A_data.copyfrom(host_numpy_A_data)

                B_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope)
                B_data.copyfrom(host_numpy_B_data)

                C_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope)

                # NOTE: We may want to soften these numbers, depending on future findings.
                timer = mod.time_evaluator("elemwise_add", sess.device, number=10, repeat=1)
                timing_result = timer(A_data, B_data, C_data)

                print("TIMING RESULT: {}".format(timing_result))

                # Verify that the computation actually happened, and produced the correct result.
                result = C_data.numpy()
                tvm.testing.assert_allclose(host_numpy_C_data_expected, result)

                br.record_success(
                    dtype, sched_type, mem_scope, num_vectors_per_tensor, timing_result
                )

        except Exception as err:
            # BUG FIX: the original wrote the error to `f`, but `f` was the
            # report file already closed by the earlier `with` block, so the
            # write raised ValueError and masked the real error.  Re-open the
            # report in append mode instead.
            with open(report_path, "a") as report_file:
                report_file.write("ERROR:\n")
                report_file.write("{}\n".format(err))
            br.record_failure(
                dtype, sched_type, mem_scope, num_vectors_per_tensor, f"See {report_path}"
            )
        finally:
            # Stop the per-configuration RPC server even if benchmarking raised,
            # so the next configuration can start its own server.
            hexagon_launcher.stop_server()

    # -----------------------------------------------------------------------------------------------

    # Hexagon v69 allows more dtypes, but we're sticking with v68 for now.
    for dtype in [
        "int8",
    ]:

        # These numbers are only meaningful in the context of this script.
        for sched_type in [
            1,
            2,
        ]:

            for mem_scope in ["global", "global.vtcm"]:

                # These numbers are fairly arbitrary, but they're meant to stress memory/caches to
                # various extents.
                for num_vectors_per_tensor in [1, 16, 64, 512, 2048]:

                    test_one_config(dtype, sched_type, mem_scope, num_vectors_per_tensor)

    # Report our progress.
    br.dump(sys.stdout)

    print("-" * 80)
    print(f"OUTPUT DIRECTORY: {host_output_dir}")
    print("-" * 80)
    print()

    tabular_output_filename = os.path.join(host_output_dir, "benchmark-results.csv")
    with open(tabular_output_filename, "w") as csv_file:
        br.dump(csv_file)
    print(f"BENCHMARK RESULTS FILE: {tabular_output_filename}")

    if br.num_failures() > 0:
        pytest.fail("At least one benchmark configuration failed", pytrace=False)

0 commit comments

Comments
 (0)