
Commit 0d762b3

Christian Convey authored and altanh committed
[hexagon] 'add_hvx' test to explore HVX usage. (apache#10604)
Add a unit test named 'add_hvx' to explore how various scheduling choices, tensor sizes, etc. impact efficient usage of Hexagon HVX units.
1 parent b0c2c23 commit 0d762b3

File tree

1 file changed: 335 additions, 0 deletions
@@ -0,0 +1,335 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import os
import os.path
import pathlib
import sys
import pytest
import numpy as np
import logging
import tempfile
import csv

import tvm.testing
from tvm import te
from tvm import relay
from tvm.relay.backend import Executor, Runtime
from tvm.contrib import utils, ndk
from tvm.contrib.hexagon.build import HexagonLauncher
import tvm.contrib.hexagon as hexagon

from .conftest import requires_hexagon_toolchain

RPC_SERVER_PORT = 7070

# This is a fixed detail of the v68 architecture.
HVX_VECTOR_BYTES = 128
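# (One 128-byte vector holds, e.g., 128 int8 lanes or 32 int32 lanes; the
# per-dtype lane count is computed below as 'elem_per_hvx_vector'.)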

# NOTE on server ports:
# These tests use different port numbers for the RPC server (7070 + ...).
# The reason is that an RPC session cannot be gracefully closed without
# triggering TIME_WAIT state on the server socket.  This prevents another
# server from binding to the same port until the wait time elapses.

@requires_hexagon_toolchain
def test_elemwise_add(android_serial_number, hexagon_launcher):
    """
    Starting with an elementwise-add computation, try various schedules / optimizations to
    see the impact they have on performance.

    The main motivation for this test is to explore the relationship between these
    schedules / optimizations and how effectively the primfunc uses the Hexagon's
    HVX units.
    """
    host_output_dir = tempfile.mkdtemp()

    print("-" * 80)
    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
    print("-" * 80)
    print()

    # TODO: We should move this into a separate test fixture, to make it easier to write
    # additional benchmarking functions.  We'd just need to generalize the assumptions
    # regarding the particular fields being tracked as independent variables.
    class benchmark_results_collection:
        def __init__(self):
            self.row_dicts_ = []

        def num_failures(self):
            num = 0
            for d in self.row_dicts_:
                if d["status"] == "FAIL":
                    num += 1
            return num

        def num_skips(self):
            num = 0
            for d in self.row_dicts_:
                if d["status"] == "SKIP":
                    num += 1
            return num

        def record_success(
            self, dtype, sched_type, mem_scope, num_vecs_per_tensor, benchmark_result
        ):
            median_usec = benchmark_result.median * 1000000
            min_usec = benchmark_result.min * 1000000
            max_usec = benchmark_result.max * 1000000

            self.row_dicts_.append(
                {
                    "dtype": dtype,
                    "sched_type": sched_type,
                    "mem_scope": mem_scope,
                    "num_vecs_per_tensor": num_vecs_per_tensor,
                    "status": "OK",
                    "median(µsec)": f"{median_usec:.3}",
                    "min(µsec)": f"{min_usec:.3}",
                    "max(µsec)": f"{max_usec:.3}",
                }
            )

        def record_failure(self, dtype, sched_type, mem_scope, num_vecs_per_tensor, error_text):
            self.row_dicts_.append(
                {
                    "dtype": dtype,
                    "sched_type": sched_type,
                    "mem_scope": mem_scope,
                    "num_vecs_per_tensor": num_vecs_per_tensor,
                    "status": "FAIL",
                    "comment": error_text,
                }
            )

        def record_skip(self, dtype, sched_type, mem_scope, num_vecs_per_tensor, comment_text):
            self.row_dicts_.append(
                {
                    "dtype": dtype,
                    "sched_type": sched_type,
                    "mem_scope": mem_scope,
                    "num_vecs_per_tensor": num_vecs_per_tensor,
                    "status": "SKIP",
                    "comment": comment_text,
                }
            )
131+
132+
def dump(self, f):
133+
csv.register_dialect(
134+
"benchmarks",
135+
delimiter="\t",
136+
quotechar='"',
137+
quoting=csv.QUOTE_MINIMAL,
138+
)
139+
140+
fieldnames = [
141+
"dtype",
142+
"sched_type",
143+
"mem_scope",
144+
"num_vecs_per_tensor",
145+
"status",
146+
"median(µsec)",
147+
"min(µsec)",
148+
"max(µsec)",
149+
"comment",
150+
]
151+
152+
writer = csv.DictWriter(f, fieldnames, dialect="benchmarks", restval="")
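            # restval="" leaves a blank cell for any field a row's dict doesn't
            # define (e.g. "comment" on OK rows, or the timing columns on FAIL/SKIP).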

            writer.writeheader()
            for d in self.row_dicts_:
                writer.writerow(d)

    br = benchmark_results_collection()

    # Create and benchmark a single primfunc.
    # If an unexpected problem occurs, raise an exception.  Otherwise add a row of output to 'br'.
    def test_one_config(dtype, sched_type, mem_scope, num_vectors_per_tensor):
        version_name = f"dtype:{dtype}-schedtype:{sched_type}-memscope:{mem_scope}-numvecs:{num_vectors_per_tensor}"
        print(f"CONFIGURATION: {version_name}")

        if num_vectors_per_tensor == 1 and mem_scope == "global.vtcm":
            # 2022-04-12 (cconvey): There's currently a bug in which TVM doesn't
            # recognize the mapping of 1D memory <--> 2D memory as being bijective
            # when num_vectors_per_tensor == 1.
            br.record_skip(
                dtype,
                sched_type,
                mem_scope,
                num_vectors_per_tensor,
                "Expect to hit bug where 1D-2D bijective transform not recognized.",
            )
            return

        if num_vectors_per_tensor == 2048 and mem_scope == "global.vtcm":
            br.record_skip(
                dtype,
                sched_type,
                mem_scope,
                num_vectors_per_tensor,
                "Expect to exceed VTCM budget.",
            )
            return

        dtype_bits = tvm._ffi.runtime_ctypes.DataType(dtype).bits
        assert dtype_bits % 8 == 0
        dtype_bytes = dtype_bits // 8

        elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes
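        # E.g., for "int8": dtype_bytes == 1, so elem_per_hvx_vector == 128 and
        # each row of the tensors created below fills exactly one HVX vector.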

        # Note!  We're providing the complete input tensor shapes now,
        # whereas the original code only reveals the exact shape when
        # about to call the kernel.

        shape = [
            num_vectors_per_tensor,
            elem_per_hvx_vector,
        ]

        A = tvm.te.placeholder(shape, dtype=dtype)
        B = tvm.te.placeholder(shape, dtype=dtype)
        C = tvm.te.compute(A.shape, lambda i, j: A[i, j] + B[i, j], name="C")

        sched = tvm.te.create_schedule(C.op)
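        # sched_type 1 keeps the default schedule (a scalar loop nest), while
        # sched_type 2 vectorizes the innermost axis, whose extent was chosen
        # above to match the HVX lane count for this dtype.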

        if sched_type == 1:
            pass
        elif sched_type == 2:
            sched[C].vectorize(C.op.axis[1])
        else:
            raise Exception("Unknown schedule type")
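        # With sched_type 2, the lowered IR dumped below should show the inner
        # serial loop replaced by vector (ramp) operations of that lane count.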

        # If we're using VTCM, we *must* add a transform_layout step to the schedule.
        # Otherwise the generated code will crash.
        # As of 2022-04-12 the crash does not provide a useful error message to the
        # host Python code.
        if mem_scope == "global.vtcm":
            for tensor in [A, B, C]:
                sched[tensor].transform_layout(lambda i, j: [i, te.AXIS_SEPARATOR, j])
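        # (te.AXIS_SEPARATOR splits the physical index space at that point, so
        # each buffer is realized as a 2D allocation, which the VTCM memory
        # scope requires.)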

        # This module is only created so humans can inspect its IR.
        module_for_ir_dump = tvm.lower(sched, [A, B, C], "foo")

        report_path = os.path.join(host_output_dir, f"{version_name}.txt")

        with open(report_path, "w") as f:
            f.write("LOWERED IR MODULE:\n")
            f.write(str(module_for_ir_dump))
            f.write("\n")

        target_hexagon = tvm.target.hexagon("v68", link_params=True)
        func = tvm.build(
            sched,
            [A, B, C],
            tvm.target.Target(target_hexagon, host=target_hexagon),
            name="elemwise_add",
        )
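        # Passing host=target_hexagon compiles the host-side wrapper code for
        # Hexagon too, so the resulting .so runs entirely on the device.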

        host_dso_binary_path = os.path.join(host_output_dir, f"test_binary-{version_name}.so")
        target_dso_binary_filename = "test_binary.so"

        func.save(str(host_dso_binary_path))
        print("SAVED BINARY TO HOST PATH: {}".format(str(host_dso_binary_path)))

        hexagon_launcher.upload(host_dso_binary_path, target_dso_binary_filename)

        try:
            with hexagon_launcher.start_session() as sess:
                mod = hexagon_launcher.load_module(target_dso_binary_filename, sess)

                host_numpy_A_data = np.ndarray(shape, dtype=dtype)
                host_numpy_B_data = np.ndarray(shape, dtype=dtype)

                for i in range(shape[0]):
                    for j in range(shape[1]):
                        host_numpy_A_data[i, j] = i + j
                        host_numpy_B_data[i, j] = (i + 1) * (j + 1)

                host_numpy_C_data_expected = host_numpy_A_data + host_numpy_B_data

                A_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope)
                A_data.copyfrom(host_numpy_A_data)

                B_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope)
                B_data.copyfrom(host_numpy_B_data)

                C_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope)
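                # When mem_scope is "global.vtcm", tvm.nd.empty allocates these
                # arrays directly in the device's VTCM; copyfrom stages the host
                # data into that scope over the RPC session.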

                # NOTE: We may want to soften these numbers, depending on future findings.
                timer = mod.time_evaluator("elemwise_add", sess.device, number=10, repeat=1)
                timing_result = timer(A_data, B_data, C_data)
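                # time_evaluator reports times in seconds (hence the conversion
                # to microseconds in record_success); number=10 averages 10 runs
                # per measurement, and repeat=1 takes a single measurement.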

                print("TIMING RESULT: {}".format(timing_result))

                # Verify that the computation actually happened, and produced the correct result.
                result = C_data.numpy()
                tvm.testing.assert_allclose(host_numpy_C_data_expected, result)

                br.record_success(
                    dtype, sched_type, mem_scope, num_vectors_per_tensor, timing_result
                )

        except Exception as err:
            # Reopen the per-configuration report file (the earlier 'with' block
            # closed it) so the error text lands alongside the dumped IR.
            with open(report_path, "a") as f:
                f.write("ERROR:\n")
                f.write("{}\n".format(err))
            br.record_failure(
                dtype, sched_type, mem_scope, num_vectors_per_tensor, f"See {report_path}"
            )

    # -----------------------------------------------------------------------------------------------

    # Hexagon v69 allows more dtypes, but we're sticking with v68 for now.
    for dtype in [
        "int8",
    ]:

        # These numbers are only meaningful in the context of this script.
        for sched_type in [
            1,
            2,
        ]:

            for mem_scope in ["global", "global.vtcm"]:

                # These numbers are fairly arbitrary, but they're meant to stress memory/caches to
                # various extents.
                for num_vectors_per_tensor in [
                    1,
                    16,
                    64,
                    512,
                    2048,
                ]:

                    test_one_config(dtype, sched_type, mem_scope, num_vectors_per_tensor)
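    # (1 dtype x 2 schedule types x 2 memory scopes x 5 tensor sizes = 20
    # configurations, of which the 4 VTCM cases with 1 or 2048 vectors are
    # recorded as SKIP above.)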

    # Report our progress.
    br.dump(sys.stdout)

    print("-" * 80)
    print(f"OUTPUT DIRECTORY: {host_output_dir}")
    print("-" * 80)
    print()

    tabular_output_filename = os.path.join(host_output_dir, "benchmark-results.csv")
    with open(tabular_output_filename, "w") as csv_file:
        br.dump(csv_file)
    print(f"BENCHMARK RESULTS FILE: {tabular_output_filename}")

    if br.num_failures() > 0:
        pytest.fail("At least one benchmark configuration failed", pytrace=False)
