Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stress NOC mcast test #16639

Merged
merged 1 commit into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/tt_metal/tt_metal/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ set(TT_METAL_TESTS_SRCS
# test_eltwise_unary.cpp <- not tested in run_tt_metal.py
test_matmul_single_tile_bfp8b.cpp
test_matmul_single_tile_output_in_l1.cpp
test_stress_noc_mcast.cpp
test_dram_loopback_single_core.cpp
test_datacopy_bfp8b.cpp
test_datacopy.cpp
Expand Down
58 changes: 58 additions & 0 deletions tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <c_tensix_core.h>

constexpr bool mcaster = get_compile_time_arg_val(0);
constexpr uint32_t tlx = get_compile_time_arg_val(1);
constexpr uint32_t tly = get_compile_time_arg_val(2);
constexpr uint32_t width = get_compile_time_arg_val(3);
constexpr uint32_t height = get_compile_time_arg_val(4);
constexpr uint64_t duration = (uint64_t)get_compile_time_arg_val(5) * 1000 * 1000 * 1000;
constexpr uint32_t ucast_size = get_compile_time_arg_val(6);
constexpr uint32_t mcast_size = get_compile_time_arg_val(7);
constexpr uint32_t virtual_grid_offset = get_compile_time_arg_val(8);
constexpr uint32_t nrands = get_compile_time_arg_val(9);
constexpr bool enable_rnd_delay = get_compile_time_arg_val(10);
constexpr uint32_t ucast_l1_addr = get_compile_time_arg_val(11);
constexpr uint32_t mcast_l1_addr = get_compile_time_arg_val(12);

inline uint32_t next_rand(tt_l1_ptr uint8_t* rnds, uint32_t& rnd_index) {
uint32_t rnd = rnds[rnd_index];
rnd_index = (rnd_index + 1) & (nrands - 1);
return rnd;
}

void kernel_main() {
uint64_t done_time = c_tensix_core::read_wall_clock() + duration;
tt_l1_ptr uint8_t* rnds = (tt_l1_ptr uint8_t*)(get_arg_addr(0));
uint32_t rnd_index = 0;

uint64_t stall_time = 0;
while (c_tensix_core::read_wall_clock() < done_time) {
for (uint32_t count = 0; count < 1000; count++) {
if (enable_rnd_delay) {
// reading time here biases us to have more ~0 cycle stalls as this
// includes the write time
while (c_tensix_core::read_wall_clock() < stall_time);
stall_time = c_tensix_core::read_wall_clock() + next_rand(rnds, rnd_index);
}

if (mcaster) {
uint64_t dst_noc_multicast_addr =
get_noc_multicast_addr(tlx, tly, tlx + width - 1, tly + height - 1, ucast_l1_addr);
noc_async_write_multicast(mcast_l1_addr, dst_noc_multicast_addr, mcast_size, width * height, false);
} else {
uint32_t dst_x, dst_y;
uint8_t noc_addr = next_rand(rnds, rnd_index);
dst_x = (noc_addr & 0xf) + virtual_grid_offset;
dst_y = (noc_addr >> 4) + virtual_grid_offset;
uint64_t noc_write_addr = NOC_XY_ADDR(NOC_X(dst_x), NOC_Y(dst_y), ucast_l1_addr);
noc_async_write(ucast_l1_addr, noc_write_addr, ucast_size);
}
}
}

noc_async_write_barrier();
}
53 changes: 53 additions & 0 deletions tests/tt_metal/tt_metal/test_kernels/sweep_stress_noc_mcast.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#/bin/bash

if [ "$ARCH_NAME" = "wormhole_b0" ]; then
echo "Configured core range for wormhole_b0"
width=8
height=8
elif [ "$ARCH_NAME" = "blackhole" ]; then
echo "Configured core range for blackhole"
width=12
height=10
else
echo "Unknown arch: $ARCH_NAME"
exit 1
fi

function run_set() {
echo "running: $@"
TT_METAL_SLOW_DISPATCH_MODE=1 build/test/tt_metal/test_stress_noc_mcast -t 120 $@
}

function run_all() {
run_set $@ -u 32 -m 32
run_set $@ -u 32 -m 256
run_set $@ -u 32 -m 2048
run_set $@ -u 32 -m 4096
run_set $@ -u 32 -m 8192
run_set $@ -u 256 -m 32
run_set $@ -u 2048 -m 256
run_set $@ -u 4096 -m 2048
run_set $@ -u 8192 -m 4096
}

# sweep w/ randomized noc address, tensix mcast
for (( i=0; i<$width; i++ )); do
h=$((height -1))
run_all -x 0 -y 0 -width $width -height $h -mx $i -my $h
done

# sweep w/ randomized delay+noc address, tensix mcast
for (( i=0; i<=11; i++ )); do
h=$((height -1))
run_all -x 0 -y 0 -width $width -height $h -mx $i -my $h -rdelay
done

# sweep w/ randomized noc address, eth mcast
for (( i=0; i<=11; i++ )); do
run_all -e $i -width $width -height $height
done

# sweep w/ randomized delay+noc address, eth mcast
for (( i=0; i<=11; i++ )); do
run_all -e $i -width $width -height $height -rdelay
done
236 changes: 236 additions & 0 deletions tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

// This test stresses NOC mcast by:
// - using 1 mcast core (future work to add multiple) either tensix or eth
// - rapidly mcast into a grid of tensix workers
// - rapidly grid of tensix workers generates random noc traffic
// - does not verify correct transactions, just runs til termination

#include <algorithm>
#include <cstdint>
#include <functional>
#include <random>
#include <string>

#include "core_coord.hpp"
#include "logger.hpp"
#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/llrt/rtoptions.hpp"
#include "tt_metal/common/metal_soc_descriptor.h"
#include "tt_metal/impl/event/event.hpp"
#include "tt_metal/impl/dispatch/command_queue.hpp"
#include "tt_metal/impl/device/device.hpp"
#include "tt_metal/common/metal_soc_descriptor.h"
#include "llrt/hal.hpp"

using namespace tt;

const uint32_t CB_ELEMENTS = 2048;
const uint32_t DEFAULT_SECONDS = 10;
const uint32_t DEFAULT_TARGET_WIDTH = 1;
const uint32_t DEFAULT_TARGET_HEIGHT = 1;
const uint32_t N_RANDS = 512;

uint32_t device_num_g = 0;
uint32_t time_secs_g = DEFAULT_SECONDS;
uint32_t tlx_g = 0;
uint32_t tly_g = 0;
uint32_t width_g = DEFAULT_TARGET_WIDTH;
uint32_t height_g = DEFAULT_TARGET_HEIGHT;
uint32_t mcast_x_g = 0;
uint32_t mcast_y_g = 0;
uint32_t mcast_size_g = 16;
uint32_t ucast_size_g = 8192;
uint32_t mcast_from_n_eth_g;
bool mcast_from_eth_g;
bool rnd_delay_g = false;
bool rnd_coord_g = true;

void init(int argc, char** argv) {
std::vector<std::string> input_args(argv, argv + argc);

if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) {
log_info(LogTest, "Usage:");
log_info(LogTest, " -v: device number to run on (default 0) ", DEFAULT_SECONDS);
log_info(LogTest, " -t: time in seconds (default {})", DEFAULT_SECONDS);
log_info(LogTest, " -x: grid top left x");
log_info(LogTest, " -y: grid top left y");
log_info(LogTest, " -width: unicast grid width (default {})", DEFAULT_TARGET_WIDTH);
log_info(LogTest, "-height: unicast grid height (default {})", DEFAULT_TARGET_HEIGHT);
log_info(LogTest, " -mx: mcast core x");
log_info(LogTest, " -my: mcast core y");
log_info(LogTest, " -e: mcast from nth idle eth core (ignores -mx,-my)");
log_info(LogTest, " -m: mcast packet size");
log_info(LogTest, " -u: ucast packet size");
log_info(LogTest, "-rdelay: insert random delay between noc transactions");
log_info(LogTest, " -s: seed random number generator");
exit(0);
}

device_num_g = test_args::get_command_option_uint32(input_args, "-v", 0);
time_secs_g = test_args::get_command_option_uint32(input_args, "-t", DEFAULT_SECONDS);
tlx_g = test_args::get_command_option_uint32(input_args, "-x", 0);
tly_g = test_args::get_command_option_uint32(input_args, "-y", 0);
width_g = test_args::get_command_option_uint32(input_args, "-width", DEFAULT_TARGET_WIDTH);
height_g = test_args::get_command_option_uint32(input_args, "-height", DEFAULT_TARGET_HEIGHT);
mcast_x_g = test_args::get_command_option_uint32(input_args, "-mx", 0);
mcast_y_g = test_args::get_command_option_uint32(input_args, "-my", 0);
mcast_from_n_eth_g = test_args::get_command_option_uint32(input_args, "-e", 0xffff);
mcast_size_g = test_args::get_command_option_uint32(input_args, "-m", 16);
ucast_size_g = test_args::get_command_option_uint32(input_args, "-u", 8192);
mcast_from_eth_g = (mcast_from_n_eth_g != 0xffff);
rnd_delay_g = test_args::has_command_option(input_args, "-rdelay");
uint32_t seed = test_args::get_command_option_uint32(input_args, "-s", 0);
srand(seed);

if (!mcast_from_eth_g && mcast_x_g >= tlx_g && mcast_x_g <= tlx_g + width_g - 1 && mcast_y_g >= tly_g &&
mcast_y_g <= tly_g + height_g - 1) {
log_fatal("Mcast core can't be within mcast grid");
exit(-1);
}
}

int main(int argc, char** argv) {
init(argc, argv);

tt_metal::IDevice* device = tt_metal::CreateDevice(device_num_g);
tt_metal::Program program = tt_metal::CreateProgram();

const auto& eth_cores = device->get_inactive_ethernet_cores();

CoreRange workers_logical({tlx_g, tly_g}, {tlx_g + width_g - 1, tly_g + height_g - 1});
CoreCoord mcast_logical(mcast_x_g, mcast_y_g);
CoreCoord tl_core = device->worker_core_from_logical_core({tlx_g, tly_g});

if (mcast_from_eth_g) {
CoreCoord eth_logical(0, mcast_from_n_eth_g);
bool found = false;
for (const auto& eth_core : eth_cores) {
if (eth_logical == eth_core) {
found = true;
break;
}
}
if (!found) {
log_fatal("{} not found in the list of idle eth cores", mcast_from_n_eth_g);
tt_metal::CloseDevice(device);
exit(-1);
}
mcast_logical = eth_logical;
}

CoreCoord virtual_offset = device->worker_core_from_logical_core({0, 0});
TT_ASSERT(virtual_offset.x == virtual_offset.y);
std::vector<uint32_t> compile_args = {
false,
tl_core.x,
tl_core.y,
width_g,
height_g,
time_secs_g,
ucast_size_g,
mcast_size_g,
virtual_offset.x,
N_RANDS,
rnd_delay_g,
tt::tt_metal::hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED),
tt::tt_metal::hal.get_dev_addr(
mcast_from_eth_g ? HalProgrammableCoreType::IDLE_ETH : HalProgrammableCoreType::TENSIX,
HalL1MemAddrType::UNRESERVED),
};

KernelHandle ucast_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp",
workers_logical,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_0,
.noc = tt_metal::NOC::RISCV_0_default,
.compile_args = compile_args,
});

for (CoreCoord coord : workers_logical) {
std::vector<uint32_t> runtime_args;
// Not particularly random since all cores are getting the same data
// N_RANDS in bytes
CoreCoord grid_size = device->logical_grid_size();
for (int i = 0; i < N_RANDS / sizeof(uint32_t); i++) {
uint32_t rnd = 0;
for (int j = 0; j < sizeof(uint32_t); j++) {
uint32_t x = rand() % grid_size.x;
uint32_t y = rand() % grid_size.y;
rnd = (rnd << 8) | (y << 4) | x;
}
runtime_args.push_back(rnd);
}
tt::tt_metal::SetRuntimeArgs(program, ucast_kernel, coord, runtime_args);
}

compile_args[0] = true;
KernelHandle mcast_kernel;
if (mcast_from_eth_g) {
mcast_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp",
mcast_logical,
tt_metal::EthernetConfig{
.eth_mode = Eth::IDLE,
.noc = tt_metal::NOC::NOC_0,
.compile_args = compile_args,
});
} else {
mcast_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/stress_noc_mcast.cpp",
mcast_logical,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_0,
.noc = tt_metal::NOC::RISCV_0_default,
.compile_args = compile_args,
});
}
std::vector<uint32_t> runtime_args;
for (int i = 0; i < 128; i++) {
runtime_args.push_back(rand());
}
tt::tt_metal::SetRuntimeArgs(program, mcast_kernel, mcast_logical, runtime_args);

CoreCoord mcast_virtual;
CoreCoord mcast_physical;
if (mcast_from_eth_g) {
mcast_virtual = device->ethernet_core_from_logical_core(mcast_logical);
mcast_physical =
tt::Cluster::instance().get_soc_desc(device_num_g).get_physical_ethernet_core_from_logical(mcast_logical);
} else {
mcast_virtual = device->worker_core_from_logical_core(mcast_logical);
mcast_physical =
tt::Cluster::instance().get_soc_desc(device_num_g).get_physical_tensix_core_from_logical(mcast_logical);
}

log_info(
LogTest,
"MCast {} core: {}, virtual {}, physical {}, writing {} bytes per xfer",
mcast_from_eth_g ? "ETH" : "TENSIX",
mcast_logical,
mcast_virtual,
mcast_physical,
mcast_size_g);
log_info(LogTest, "Unicast grid: {}, writing {} bytes per xfer", workers_logical.str(), ucast_size_g);

if (rnd_coord_g) {
log_info("Randomizing ucast noc write destinations");
} else {
log_info("Non-random ucast noc write destinations TBD");
}

if (rnd_delay_g) {
log_info("Randomizing delay");
}
log_info(LogTest, "Running for {} seconds", time_secs_g);

tt::tt_metal::detail::LaunchProgram(device, program, true);
tt_metal::CloseDevice(device);
}
Loading