# under the License.

import os
import os.path
import pathlib
import sys
import pytest
import numpy as np
import logging
import tempfile

import tvm.testing
from tvm import te

# triggering TIME_WAIT state on the server socket. This prevents another
# server from binding to the same port until the wait time elapses.
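#
# (An illustrative sketch of that convention, not this file's actual
# assignments: each test claims its own server port, e.g. test_add_hvx might
# use RPC_SERVER_PORT + 0 and test_add RPC_SERVER_PORT + 1, so back-to-back
# test runs don't contend for a socket stuck in TIME_WAIT.)
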
@requires_hexagon_toolchain
def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, adb_server_socket):
    """
    Starting with an elementwise-add computation, try various schedules / optimizations to
    see the impact they have on performance.

    The main motivation for this test is to explore the relationship between these
    schedules / optimizations and how effectively the resulting primfunc uses the
    Hexagon's HVX units.
    """
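
    # The test sweeps a small configuration matrix:
    #   dtype x schedule type x memory scope x tensor size,
    # recording one timing result (or failure) per combination.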

    host_output_dir = tempfile.mkdtemp()

    print("-" * 80)
    print(f"OUTPUT DIRECTORY: {host_output_dir}")
    print("-" * 80)
    print()

    class benchmark_results_collection:
        def __init__(self):
            # For simplicity, store each field of the results in its own,
            # index-aligned list.
            self.dtypes = []
            self.sched_types = []
            self.mem_scopes = []
            self.nums_vecs_per_tensor = []
            self.benchmark_results = []
            self.failure_texts = []

        def record_success(
            self, dtype, sched_type, mem_scope, num_vecs_per_tensor, benchmark_result
        ):
            self.dtypes.append(dtype)
            self.sched_types.append(sched_type)
            self.mem_scopes.append(mem_scope)
            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
            self.benchmark_results.append(benchmark_result)
            self.failure_texts.append(None)

        def record_failure(self, dtype, sched_type, mem_scope, num_vecs_per_tensor, failure_text):
            self.dtypes.append(dtype)
            self.sched_types.append(sched_type)
            self.mem_scopes.append(mem_scope)
            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
            self.benchmark_results.append(None)
            self.failure_texts.append(failure_text)

        def dump(self, f):
            # Tab-separated output.  (The file written at the end of the test
            # uses a .csv extension, but the delimiter is a tab.)
            delim = "\t"

            f.write(
                delim.join(
                    [
                        "dtype",
                        "sched_type",
                        "mem_scope",
                        "num_vecs_per_tensor",
                        "median(µsec)",
                        "min(µsec)",
                        "max(µsec)",
                        "comment",
                    ]
                )
            )
            f.write("\n")

            for i in range(len(self.dtypes)):
                row = [
                    str(self.dtypes[i]),
                    str(self.sched_types[i]),
                    str(self.mem_scopes[i]),
                    str(self.nums_vecs_per_tensor[i]),
                ]

                r = self.benchmark_results[i]
                if r is None:
                    row += ["", "", ""]
                else:
                    # BenchmarkResult statistics are in seconds; report microseconds.
                    row += [f"{r.median * 1e6:.3}", f"{r.min * 1e6:.3}", f"{r.max * 1e6:.3}"]

                ft = self.failure_texts[i]
                row.append("OK" if ft is None else f"FAILURE: {ft}")

                f.write(delim.join(row))
                f.write("\n")

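        # dump() output looks roughly like this (tab-separated; the numbers
        # below are illustrative, not real measurements):
        #
        #   dtype  sched_type  mem_scope    num_vecs_per_tensor  median(µsec)  min(µsec)  max(µsec)  comment
        #   int8   2           global.vtcm  64                   12.3          11.9       14.1       OK
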
    br = benchmark_results_collection()

    # Hexagon v68 supports additional dtypes, but we're sticking with int8 for now.
    for dtype in ["int8"]:
        for sched_type in [1, 2]:
            # "global.vtcm" places the tensors in Hexagon's Vector Tightly-Coupled
            # Memory; None uses ordinary global memory.
            for mem_scope in [None, "global.vtcm"]:

                # These numbers are fairly arbitrary, but they're meant to stress
                # memory / caches to various extents.
                for num_vectors_per_tensor in [1, 16, 64, 512, 2048]:

                    version_name = (
                        f"dtype:{dtype}-schedtype:{sched_type}"
                        f"-memscope:{mem_scope}-numvecs:{num_vectors_per_tensor}"
                    )
                    print(f"CONFIGURATION: {version_name}")

                    # This is a fixed detail of the v68 architecture.
                    HVX_VECTOR_BYTES = 128

                    dtype_bits = tvm._ffi.runtime_ctypes.DataType(dtype).bits
                    assert dtype_bits % 8 == 0
                    dtype_bytes = dtype_bits // 8

                    elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes

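                    # E.g., for "int8": 8 bits -> 1 byte per element, so one
                    # 128-byte HVX vector holds 128 // 1 = 128 elements.
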
                    # Note! We're providing the complete input-tensor shapes now,
                    # whereas the original code only reveals the exact shape when
                    # about to call the kernel.

                    shape = [num_vectors_per_tensor, elem_per_hvx_vector]

                    A = tvm.te.placeholder(shape, dtype=dtype)
                    B = tvm.te.placeholder(shape, dtype=dtype)
                    C = tvm.te.compute(A.shape, lambda i, j: A[i, j] + B[i, j], name="C")

                    sched = tvm.te.create_schedule(C.op)

                    if sched_type == 1:
                        # Baseline: leave the default (scalar) schedule as-is.
                        pass
                    elif sched_type == 2:
                        # Vectorize the innermost axis, i.e. the elements within
                        # one HVX vector.
                        sched[C].vectorize(C.op.axis[1])
                    else:
                        raise ValueError(f"Unknown schedule type: {sched_type}")
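
                    # For reference, a rough sketch of the effect (not output from
                    # this exact build): with sched_type == 2, the lowered TIR
                    # replaces the scalar inner loop over j with a single vector
                    # statement, along the lines of
                    #
                    #   C[ramp(i * 128, 1, 128)] = A[ramp(i * 128, 1, 128)]
                    #                            + B[ramp(i * 128, 1, 128)]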

                    # This module is built only so humans can inspect its IR.
                    module_for_ir_dump = tvm.lower(sched, [A, B, C], "foo")

                    report_path = os.path.join(host_output_dir, f"{version_name}.txt")

                    with open(report_path, "w") as f:
                        f.write("LOWERED IR MODULE:\n")
                        f.write(str(module_for_ir_dump))
                        f.write("\n")

                    target_hexagon = tvm.target.hexagon("v68", link_params=True)
                    func = tvm.build(
                        sched,
                        [A, B, C],
                        tvm.target.Target(target_hexagon, host=target_hexagon),
                        name="add_hvx",
                    )

                    host_dso_binary_path = os.path.join(
                        host_output_dir, f"test_binary-{version_name}.so"
                    )
                    target_dso_binary_filename = "test_binary.so"

                    func.save(host_dso_binary_path)
                    print(f"SAVED BINARY TO HOST PATH: {host_dso_binary_path}")

                    if not android_serial_number:
                        pytest.skip(msg="Skip hardware test since ANDROID_SERIAL_NUMBER is not set.")

                    rpc_info = {
                        "rpc_tracker_host": tvm_tracker_host,
                        "rpc_tracker_port": tvm_tracker_port,
                        # See the note near the beginning of this file.
                        "rpc_server_port": RPC_SERVER_PORT + 0,
                        "adb_server_socket": adb_server_socket,
                    }
                    launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
                    launcher.upload(host_dso_binary_path, target_dso_binary_filename)
                    launcher.start_server()

                    try:
                        with launcher.start_session() as sess:
                            mod = launcher.load_module(target_dso_binary_filename, sess)

                            # Allocate host-side buffers; every element is filled
                            # in below, so uninitialized contents are fine.
                            host_numpy_A_data = np.empty(shape, dtype=dtype)
                            host_numpy_B_data = np.empty(shape, dtype=dtype)
                            host_numpy_C_data_expected = np.empty(shape, dtype=dtype)

                            def intended_val_A(i, j):
                                return i + j

                            def intended_val_B(i, j):
                                return (i + 1) * (j + 1)

                            for i in range(shape[0]):
                                for j in range(shape[1]):
                                    host_numpy_A_data[i, j] = intended_val_A(i, j)
                                    host_numpy_B_data[i, j] = intended_val_B(i, j)
                                    host_numpy_C_data_expected[i, j] = intended_val_A(
                                        i, j
                                    ) + intended_val_B(i, j)

                            A_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope)
                            A_data.copyfrom(host_numpy_A_data)

                            B_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope)
                            B_data.copyfrom(host_numpy_B_data)

                            C_data = tvm.nd.empty(shape, dtype, sess.device, mem_scope)

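                            # time_evaluator runs the kernel `number` times per
                            # measurement and records `repeat` measurements; with
                            # min_repeat_ms=1000, `number` is increased as needed so
                            # that each measurement lasts at least one second. The
                            # resulting median/min/max statistics are in seconds.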
                            timer = mod.time_evaluator(
                                "add_hvx", sess.device, number=100, repeat=1, min_repeat_ms=1000
                            )
                            timing_result = timer(A_data, B_data, C_data)

                            print(f"TIMING RESULT: {timing_result}")

                            # Verify that the computation actually happened and
                            # produced the correct result.
                            result = C_data.numpy()
                            assert (result == host_numpy_C_data_expected).all()

                            br.record_success(
                                dtype, sched_type, mem_scope, num_vectors_per_tensor, timing_result
                            )
                    except Exception as err:
                        # A bare `except:` would also swallow KeyboardInterrupt and
                        # pytest's control-flow exceptions; record the error text so
                        # the report shows why this configuration failed.
                        br.record_failure(
                            dtype, sched_type, mem_scope, num_vectors_per_tensor, str(err)
                        )
                    finally:
                        launcher.stop_server()

    br.dump(sys.stdout)

    print("-" * 80)
    print(f"OUTPUT DIRECTORY: {host_output_dir}")
    print("-" * 80)
    print()

    tabular_output_filename = os.path.join(host_output_dir, "benchmark-results.csv")
    with open(tabular_output_filename, "w") as csv_file:
        br.dump(csv_file)
    print(f"BENCHMARK RESULTS FILE: {tabular_output_filename}")


@requires_hexagon_toolchain
def test_add(android_serial_number, tvm_tracker_host, tvm_tracker_port, adb_server_socket):