From 4d4250fef41849a46bee23e94379e7e6dce90b5f Mon Sep 17 00:00:00 2001 From: Paul Keller Date: Mon, 6 Jan 2025 18:46:03 +0000 Subject: [PATCH] Stress NOC mcast test --- tests/tt_metal/tt_metal/CMakeLists.txt | 1 + .../test_kernels/stress_noc_mcast.cpp | 58 +++++ .../test_kernels/sweep_stress_noc_mcast.sh | 53 ++++ .../tt_metal/test_stress_noc_mcast.cpp | 236 ++++++++++++++++++ 4 files changed, 348 insertions(+) create mode 100644 tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp create mode 100755 tests/tt_metal/tt_metal/test_kernels/sweep_stress_noc_mcast.sh create mode 100644 tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp diff --git a/tests/tt_metal/tt_metal/CMakeLists.txt b/tests/tt_metal/tt_metal/CMakeLists.txt index 0d515c70959..da019aa8393 100644 --- a/tests/tt_metal/tt_metal/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/CMakeLists.txt @@ -14,6 +14,7 @@ set(TT_METAL_TESTS_SRCS # test_eltwise_unary.cpp <- not tested in run_tt_metal.py test_matmul_single_tile_bfp8b.cpp test_matmul_single_tile_output_in_l1.cpp + test_stress_noc_mcast.cpp test_dram_loopback_single_core.cpp test_datacopy_bfp8b.cpp test_datacopy.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp b/tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp new file mode 100644 index 00000000000..0bdf6bb17de --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +constexpr bool mcaster = get_compile_time_arg_val(0); +constexpr uint32_t tlx = get_compile_time_arg_val(1); +constexpr uint32_t tly = get_compile_time_arg_val(2); +constexpr uint32_t width = get_compile_time_arg_val(3); +constexpr uint32_t height = get_compile_time_arg_val(4); +constexpr uint64_t duration = (uint64_t)get_compile_time_arg_val(5) * 1000 * 1000 * 1000; +constexpr uint32_t ucast_size = get_compile_time_arg_val(6); +constexpr uint32_t mcast_size = get_compile_time_arg_val(7); +constexpr uint32_t virtual_grid_offset = get_compile_time_arg_val(8); +constexpr uint32_t nrands = get_compile_time_arg_val(9); +constexpr bool enable_rnd_delay = get_compile_time_arg_val(10); +constexpr uint32_t ucast_l1_addr = get_compile_time_arg_val(11); +constexpr uint32_t mcast_l1_addr = get_compile_time_arg_val(12); + +inline uint32_t next_rand(tt_l1_ptr uint8_t* rnds, uint32_t& rnd_index) { + uint32_t rnd = rnds[rnd_index]; + rnd_index = (rnd_index + 1) & (nrands - 1); + return rnd; +} + +void kernel_main() { + uint64_t done_time = c_tensix_core::read_wall_clock() + duration; + tt_l1_ptr uint8_t* rnds = (tt_l1_ptr uint8_t*)(get_arg_addr(0)); + uint32_t rnd_index = 0; + + uint64_t stall_time = 0; + while (c_tensix_core::read_wall_clock() < done_time) { + for (uint32_t count = 0; count < 1000; count++) { + if (enable_rnd_delay) { + // reading time here biases us to have more ~0 cycle stalls as this + // includes the write time + while (c_tensix_core::read_wall_clock() < stall_time); + stall_time = c_tensix_core::read_wall_clock() + next_rand(rnds, rnd_index); + } + + if (mcaster) { + uint64_t dst_noc_multicast_addr = + get_noc_multicast_addr(tlx, tly, tlx + width - 1, tly + height - 1, ucast_l1_addr); + noc_async_write_multicast(mcast_l1_addr, dst_noc_multicast_addr, mcast_size, width * height, false); + } else { + uint32_t dst_x, dst_y; + uint8_t noc_addr = next_rand(rnds, rnd_index); + dst_x = (noc_addr & 0xf) + virtual_grid_offset; + dst_y = (noc_addr >> 4) + virtual_grid_offset; + uint64_t noc_write_addr = NOC_XY_ADDR(NOC_X(dst_x), NOC_Y(dst_y), ucast_l1_addr); + noc_async_write(ucast_l1_addr, noc_write_addr, ucast_size); + } + } + } + + noc_async_write_barrier(); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/sweep_stress_noc_mcast.sh b/tests/tt_metal/tt_metal/test_kernels/sweep_stress_noc_mcast.sh new file mode 100755 index 00000000000..99f870f92d9 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/sweep_stress_noc_mcast.sh @@ -0,0 +1,53 @@ +#/bin/bash + +if [ "$ARCH_NAME" = "wormhole_b0" ]; then + echo "Configured core range for wormhole_b0" + width=8 + height=8 +elif [ "$ARCH_NAME" = "blackhole" ]; then + echo "Configured core range for blackhole" + width=12 + height=10 +else + echo "Unknown arch: $ARCH_NAME" + exit 1 +fi + +function run_set() { + echo "running: $@" + TT_METAL_SLOW_DISPATCH_MODE=1 build/test/tt_metal/test_stress_noc_mcast -t 120 $@ +} + +function run_all() { + run_set $@ -u 32 -m 32 + run_set $@ -u 32 -m 256 + run_set $@ -u 32 -m 2048 + run_set $@ -u 32 -m 4096 + run_set $@ -u 32 -m 8192 + run_set $@ -u 256 -m 32 + run_set $@ -u 2048 -m 256 + run_set $@ -u 4096 -m 2048 + run_set $@ -u 8192 -m 4096 +} + +# sweep w/ randomized noc address, tensix mcast +for (( i=0; i<$width; i++ )); do + h=$((height -1)) + run_all -x 0 -y 0 -width $width -height $h -mx $i -my $h +done + +# sweep w/ randomized delay+noc address, tensix mcast +for (( i=0; i<=11; i++ )); do + h=$((height -1)) + run_all -x 0 -y 0 -width $width -height $h -mx $i -my $h -rdelay +done + +# sweep w/ randomized noc address, eth mcast +for (( i=0; i<=11; i++ )); do + run_all -e $i -width $width -height $height +done + +# sweep w/ randomized delay+noc address, eth mcast +for (( i=0; i<=11; i++ )); do + run_all -e $i -width $width -height $height -rdelay +done diff --git a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp new file mode 100644 index 00000000000..80ddbe1964b --- /dev/null +++ b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp @@ -0,0 +1,236 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This test stresses NOC mcast by: +// - using 1 mcast core (future work to add multiple) either tensix or eth +// - rapidly mcast into a grid of tensix workers +// - rapidly grid of tensix workers generates random noc traffic +// - does not verify correct transactions, just runs til termination + +#include +#include +#include +#include +#include + +#include "core_coord.hpp" +#include "logger.hpp" +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/llrt/rtoptions.hpp" +#include "tt_metal/common/metal_soc_descriptor.h" +#include "tt_metal/impl/event/event.hpp" +#include "tt_metal/impl/dispatch/command_queue.hpp" +#include "tt_metal/impl/device/device.hpp" +#include "tt_metal/common/metal_soc_descriptor.h" +#include "llrt/hal.hpp" + +using namespace tt; + +const uint32_t CB_ELEMENTS = 2048; +const uint32_t DEFAULT_SECONDS = 10; +const uint32_t DEFAULT_TARGET_WIDTH = 1; +const uint32_t DEFAULT_TARGET_HEIGHT = 1; +const uint32_t N_RANDS = 512; + +uint32_t device_num_g = 0; +uint32_t time_secs_g = DEFAULT_SECONDS; +uint32_t tlx_g = 0; +uint32_t tly_g = 0; +uint32_t width_g = DEFAULT_TARGET_WIDTH; +uint32_t height_g = DEFAULT_TARGET_HEIGHT; +uint32_t mcast_x_g = 0; +uint32_t mcast_y_g = 0; +uint32_t mcast_size_g = 16; +uint32_t ucast_size_g = 8192; +uint32_t mcast_from_n_eth_g; +bool mcast_from_eth_g; +bool rnd_delay_g = false; +bool rnd_coord_g = true; + +void init(int argc, char** argv) { + std::vector input_args(argv, argv + argc); + + if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) { + log_info(LogTest, "Usage:"); + log_info(LogTest, " -v: device number to run on (default 0) ", DEFAULT_SECONDS); + log_info(LogTest, " -t: time in seconds (default {})", DEFAULT_SECONDS); + log_info(LogTest, " -x: grid top left x"); + log_info(LogTest, " -y: grid top left y"); + log_info(LogTest, " -width: unicast grid width (default {})", DEFAULT_TARGET_WIDTH); + log_info(LogTest, "-height: unicast grid height (default {})", DEFAULT_TARGET_HEIGHT); + log_info(LogTest, " -mx: mcast core x"); + log_info(LogTest, " -my: mcast core y"); + log_info(LogTest, " -e: mcast from nth idle eth core (ignores -mx,-my)"); + log_info(LogTest, " -m: mcast packet size"); + log_info(LogTest, " -u: ucast packet size"); + log_info(LogTest, "-rdelay: insert random delay between noc transactions"); + log_info(LogTest, " -s: seed random number generator"); + exit(0); + } + + device_num_g = test_args::get_command_option_uint32(input_args, "-v", 0); + time_secs_g = test_args::get_command_option_uint32(input_args, "-t", DEFAULT_SECONDS); + tlx_g = test_args::get_command_option_uint32(input_args, "-x", 0); + tly_g = test_args::get_command_option_uint32(input_args, "-y", 0); + width_g = test_args::get_command_option_uint32(input_args, "-width", DEFAULT_TARGET_WIDTH); + height_g = test_args::get_command_option_uint32(input_args, "-height", DEFAULT_TARGET_HEIGHT); + mcast_x_g = test_args::get_command_option_uint32(input_args, "-mx", 0); + mcast_y_g = test_args::get_command_option_uint32(input_args, "-my", 0); + mcast_from_n_eth_g = test_args::get_command_option_uint32(input_args, "-e", 0xffff); + mcast_size_g = test_args::get_command_option_uint32(input_args, "-m", 16); + ucast_size_g = test_args::get_command_option_uint32(input_args, "-u", 8192); + mcast_from_eth_g = (mcast_from_n_eth_g != 0xffff); + rnd_delay_g = test_args::has_command_option(input_args, "-rdelay"); + uint32_t seed = test_args::get_command_option_uint32(input_args, "-s", 0); + srand(seed); + + if (!mcast_from_eth_g && mcast_x_g >= tlx_g && mcast_x_g <= tlx_g + width_g - 1 && mcast_y_g >= tly_g && + mcast_y_g <= tly_g + height_g - 1) { + log_fatal("Mcast core can't be within mcast grid"); + exit(-1); + } +} + +int main(int argc, char** argv) { + init(argc, argv); + + tt_metal::IDevice* device = tt_metal::CreateDevice(device_num_g); + tt_metal::Program program = tt_metal::CreateProgram(); + + const auto& eth_cores = device->get_inactive_ethernet_cores(); + + CoreRange workers_logical({tlx_g, tly_g}, {tlx_g + width_g - 1, tly_g + height_g - 1}); + CoreCoord mcast_logical(mcast_x_g, mcast_y_g); + CoreCoord tl_core = device->worker_core_from_logical_core({tlx_g, tly_g}); + + if (mcast_from_eth_g) { + CoreCoord eth_logical(0, mcast_from_n_eth_g); + bool found = false; + for (const auto& eth_core : eth_cores) { + if (eth_logical == eth_core) { + found = true; + break; + } + } + if (!found) { + log_fatal("{} not found in the list of idle eth cores", mcast_from_n_eth_g); + tt_metal::CloseDevice(device); + exit(-1); + } + mcast_logical = eth_logical; + } + + CoreCoord virtual_offset = device->worker_core_from_logical_core({0, 0}); + TT_ASSERT(virtual_offset.x == virtual_offset.y); + std::vector compile_args = { + false, + tl_core.x, + tl_core.y, + width_g, + height_g, + time_secs_g, + ucast_size_g, + mcast_size_g, + virtual_offset.x, + N_RANDS, + rnd_delay_g, + tt::tt_metal::hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED), + tt::tt_metal::hal.get_dev_addr( + mcast_from_eth_g ? HalProgrammableCoreType::IDLE_ETH : HalProgrammableCoreType::TENSIX, + HalL1MemAddrType::UNRESERVED), + }; + + KernelHandle ucast_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp", + workers_logical, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .compile_args = compile_args, + }); + + for (CoreCoord coord : workers_logical) { + std::vector runtime_args; + // Not particularly random since all cores are getting the same data + // N_RANDS in bytes + CoreCoord grid_size = device->logical_grid_size(); + for (int i = 0; i < N_RANDS / sizeof(uint32_t); i++) { + uint32_t rnd = 0; + for (int j = 0; j < sizeof(uint32_t); j++) { + uint32_t x = rand() % grid_size.x; + uint32_t y = rand() % grid_size.y; + rnd = (rnd << 8) | (y << 4) | x; + } + runtime_args.push_back(rnd); + } + tt::tt_metal::SetRuntimeArgs(program, ucast_kernel, coord, runtime_args); + } + + compile_args[0] = true; + KernelHandle mcast_kernel; + if (mcast_from_eth_g) { + mcast_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp", + mcast_logical, + tt_metal::EthernetConfig{ + .eth_mode = Eth::IDLE, + .noc = tt_metal::NOC::NOC_0, + .compile_args = compile_args, + }); + } else { + mcast_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp", + mcast_logical, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .compile_args = compile_args, + }); + } + std::vector runtime_args; + for (int i = 0; i < 128; i++) { + runtime_args.push_back(rand()); + } + tt::tt_metal::SetRuntimeArgs(program, mcast_kernel, mcast_logical, runtime_args); + + CoreCoord mcast_virtual; + CoreCoord mcast_physical; + if (mcast_from_eth_g) { + mcast_virtual = device->ethernet_core_from_logical_core(mcast_logical); + mcast_physical = + tt::Cluster::instance().get_soc_desc(device_num_g).get_physical_ethernet_core_from_logical(mcast_logical); + } else { + mcast_virtual = device->worker_core_from_logical_core(mcast_logical); + mcast_physical = + tt::Cluster::instance().get_soc_desc(device_num_g).get_physical_tensix_core_from_logical(mcast_logical); + } + + log_info( + LogTest, + "MCast {} core: {}, virtual {}, physical {}, writing {} bytes per xfer", + mcast_from_eth_g ? "ETH" : "TENSIX", + mcast_logical, + mcast_virtual, + mcast_physical, + mcast_size_g); + log_info(LogTest, "Unicast grid: {}, writing {} bytes per xfer", workers_logical.str(), ucast_size_g); + + if (rnd_coord_g) { + log_info("Randomizing ucast noc write destinations"); + } else { + log_info("Non-random ucast noc write destinations TBD"); + } + + if (rnd_delay_g) { + log_info("Randomizing delay"); + } + log_info(LogTest, "Running for {} seconds", time_secs_g); + + tt::tt_metal::detail::LaunchProgram(device, program, true); + tt_metal::CloseDevice(device); +}