```mlir
#device = #tt.device<workerGrid = #tt.grid<8x8, (d0, d1) -> (0, d0, d1)>, l1Map = (d0, d1)[s0, s1] -> (0, d0 floordiv s0, d1 floordiv s1, (d0 mod s0) * s1 + d1 mod s1), dramMap = (d0, d1)[s0, s1] -> (0, 0, ((((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 8192) mod 12, (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 98304 + (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) mod 8192), meshShape = , chipIds = [0]>
#dram = #ttnn.buffer_type<dram>
#system_memory = #ttnn.buffer_type<system_memory>
#system_desc = #tt.system_desc<[{role = host, target_triple = "x86_64-pc-linux-gnu"}], [{arch = <wormhole_b0>, grid = 8x8, l1_size = 1499136, num_dram_channels = 12, dram_channel_size = 1073741824, noc_l1_address_align_bytes = 16, pcie_address_align_bytes = 32, noc_dram_address_align_bytes = 32, l1_unreserved_base = 1024, erisc_l1_unreserved_base = 1024, dram_unreserved_base = 1024, dram_unreserved_end = 1073741824, physical_cores = {worker = [ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 1x0, 1x1, 1x2, 1x3, 1x4, 1x5, 1x6, 1x7, 2x0, 2x1, 2x2, 2x3, 2x4, 2x5, 2x6, 2x7, 3x0, 3x1, 3x2, 3x3, 3x4, 3x5, 3x6, 3x7, 4x0, 4x1, 4x2, 4x3, 4x4, 4x5, 4x6, 4x7, 5x0, 5x1, 5x2, 5x3, 5x4, 5x5, 5x6, 5x7, 6x0, 6x1, 6x2, 6x3, 6x4, 6x5, 6x6, 6x7, 7x0, 7x1, 7x2, 7x3, 7x4, 7x5, 7x6, 7x7] dram = [ 8x0, 9x0, 10x0, 8x1, 9x1, 10x1, 8x2, 9x2, 10x2, 8x3, 9x3, 10x3]}, supported_data_types = [<f32>, <f16>, <bf16>, <bfp_f8>, <bfp_bf8>, <bfp_f4>, <bfp_bf4>, <bfp_f2>, <bfp_bf2>, <u32>, <u16>, <u8>], supported_tile_sizes = [ 4x16, 16x16, 32x16, 4x32, 16x32, 32x32], num_cbs = 32}], [0], [3 : i32], [ 0x0x0x0]>
#ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x12xi32, #system_memory>>
#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<32000x3200xbf16, #system_memory>>
#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 12 + d1, d2), <1x1>, memref<12x3200xbf16, #system_memory>>
#ttnn_layout3 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x12xi32, #dram>, interleaved>
#ttnn_layout4 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<32000x3200xbf16, #dram>, interleaved>
#ttnn_layout5 = #ttnn.ttnn_layout<(d0, d1, d2) -> (d0 * 12 + d1, d2), <1x1>, memref<12x3200xbf16, #dram>, interleaved>
module @Embedding attributes {tt.device = #device, tt.system_desc = #system_desc} {
  func.func @forward(%arg0: tensor<1x12xi32, #ttnn_layout> {ttir.name = "input"}, %arg1: tensor<32000x3200xbf16, #ttnn_layout1> {ttir.name = "weight"}) -> (tensor<1x12x3200xbf16, #ttnn_layout2> {ttir.name = "Embedding.output_embedding_0"}) {
    %0 = "ttnn.get_device"() <{mesh_shape = #ttnn<mesh_shape 1x1>}> : () -> !tt.device<#device>
    %1 = "ttnn.to_device"(%arg0, %0) <{memory_config = #ttnn.memory_config<<interleaved>, <dram>, <<1x12>>>}> : (tensor<1x12xi32, #ttnn_layout>, !tt.device<#device>) -> tensor<1x12xi32, #ttnn_layout3>
    %2 = "ttnn.to_device"(%arg1, %0) <{memory_config = #ttnn.memory_config<<interleaved>, <dram>, <<32000x3200>>>}> : (tensor<32000x3200xbf16, #ttnn_layout1>, !tt.device<#device>) -> tensor<32000x3200xbf16, #ttnn_layout4>
    %3 = "ttnn.empty"(%0) <{dtype = #tt.supportedDataTypes<bf16>, layout = #ttnn.layout<row_major>, memory_config = #ttnn.memory_config<<interleaved>, <dram>, <<12x3200>>>, shape = #ttnn.shape<1x12x3200>}> : (!tt.device<#device>) -> tensor<1x12x3200xbf16, #ttnn_layout5>
    %4 = "ttnn.embedding"(%1, %3, %2) : (tensor<1x12xi32, #ttnn_layout3>, tensor<1x12x3200xbf16, #ttnn_layout5>, tensor<32000x3200xbf16, #ttnn_layout4>) -> tensor<1x12x3200xbf16, #ttnn_layout5>
    "ttnn.deallocate"(%2) <{force = false}> : (tensor<32000x3200xbf16, #ttnn_layout4>) -> ()
    "ttnn.deallocate"(%1) <{force = false}> : (tensor<1x12xi32, #ttnn_layout3>) -> ()
    %5 = "ttnn.from_device"(%4) : (tensor<1x12x3200xbf16, #ttnn_layout5>) -> tensor<1x12x3200xbf16, #ttnn_layout2>
    "ttnn.deallocate"(%3) <{force = false}> : (tensor<1x12x3200xbf16, #ttnn_layout5>) -> ()
    %6 = "ttnn.to_layout"(%5) <{layout = #ttnn.layout<row_major>}> : (tensor<1x12x3200xbf16, #ttnn_layout2>) -> tensor<1x12x3200xbf16, #ttnn_layout2>
    "ttnn.deallocate"(%5) <{force = false}> : (tensor<1x12x3200xbf16, #ttnn_layout2>) -> ()
    return %6 : tensor<1x12x3200xbf16, #ttnn_layout2>
  }
}
```
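
For orientation, the lowered module is a device-side embedding lookup: `ttnn.to_device` stages the `1x12` int32 token ids and the `32000x3200` bf16 weight table into interleaved DRAM, `ttnn.embedding` gathers one weight row per token id into the `1x12x3200` output, and `ttnn.from_device` plus the final `ttnn.to_layout` bring the result back to host memory, with `ttnn.deallocate` freeing each staged buffer as soon as it is dead. A minimal NumPy sketch of the same gather semantics (an illustration, not the TTNN API; float32 stands in for bf16, which NumPy lacks):

```python
import numpy as np

# Shapes from the IR: input 1x12 int32 ids, weight 32000x3200 (bf16 on device).
VOCAB, HIDDEN, SEQ = 32000, 3200, 12

input_ids = np.random.randint(0, VOCAB, size=(1, SEQ), dtype=np.int32)
weight = np.random.rand(VOCAB, HIDDEN).astype(np.float32)  # bf16 stand-in

# ttnn.embedding gathers one weight row per token id: (1, 12) -> (1, 12, 3200).
output = weight[input_ids]
assert output.shape == (1, SEQ, HIDDEN)
```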
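
The `dramMap` in the device attribute is worth unpacking: it linearizes a `(d0, d1)` index over a shard grid that is 8 shards wide, then spreads the resulting stream round-robin across the chip's 12 DRAM channels in 8192-entry pages (98304 = 12 x 8192); whether those entries are elements or bytes is fixed by how the compiler scales the map, so below we simply evaluate the arithmetic as written. A hypothetical Python transcription with illustrative names (`s0`, `s1` are the per-core shard dims; for the `<1x1>` grids in this module they are the full tensor dims):

```python
PAGE = 8192                 # page size in the map (floordiv 8192)
CHANNELS = 12               # matches num_dram_channels in the system desc
ROUND = PAGE * CHANNELS     # 98304 in the map

def dram_map(d0, d1, s0, s1):
    """Evaluate the dramMap affine map from the device attribute, as written."""
    # Linearize (d0, d1): shard grid is 8 wide, row-major within each shard.
    linear = ((d0 // s0) * 8 + d1 // s1) * (s1 * s0) + (d0 % s0) * s1 + d1 % s1
    channel = (linear // PAGE) % CHANNELS        # round-robin over channels
    offset = linear // ROUND + linear % PAGE     # address expression in the map
    return (0, 0, channel, offset)

# Successive 8192-entry pages of the 32000x3200 weight land on
# successive DRAM channels: channel 0, 1, 2, ...
for k in range(3):
    d0, d1 = divmod(k * PAGE, 3200)
    print(dram_map(d0, d1, s0=32000, s1=3200))  # (0, 0, k, 0)
```

The map yields a 4-tuple (device index, 0, channel, within-channel address); the companion `l1Map` plays the same role for placing shards into each worker core's L1.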