Use tilized dram-interleaved as default input-output layout
jnie-TT committed Jan 14, 2025
1 parent 5b4122a commit 1a31acb
Showing 95 changed files with 799 additions and 652 deletions.
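
For orientation, the change makes tile ("tilized") layout with a DRAM, interleaved memory configuration the default for graph input/output tensors. A rough sketch of that default configuration, mirroring the MemoryConfigAttr construction that appears in ConstantOpConversionPattern below; the variable names (rewriter, layoutAttr) and the exact attribute helpers are assumptions, not code from this commit:

// Sketch only: tile layout, DRAM buffer, interleaved memory layout as the
// assumed default input/output configuration introduced by this commit.
ttnn::Layout defaultLayout = ttnn::Layout::Tile;
ttnn::MemoryConfigAttr defaultMemConfig = rewriter.getAttr<ttnn::MemoryConfigAttr>(
    rewriter.getAttr<ttnn::BufferTypeAttr>(ttnn::BufferType::DRAM),
    rewriter.getAttr<ttnn::ShardSpecAttr>(
        rewriter.getAttr<ttnn::ShapeAttr>(layoutAttr.getShardShape())),
    rewriter.getAttr<ttnn::TensorMemoryLayoutAttr>(
        ttnn::TensorMemoryLayout::Interleaved));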
6 changes: 1 addition & 5 deletions include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td
@@ -88,11 +88,7 @@ def TTNN_MemoryConfigAttr : TTNN_Attr<"MemoryConfig", "memory_config"> {
let assemblyFormat = "`<` params `>`";

let extraClassDeclaration = [{
::llvm::ArrayRef<int64_t> getShardShapeArray() const
{
return this->getShardSpec().getShardShape().getShape();
}

llvm::ArrayRef<int64_t> getShardShape(bool convertTileToScalar = true) const;
MemoryConfigAttr withBufferType(::mlir::MLIRContext *context, BufferType bufferType);
MemoryConfigAttr withMemoryLayout(::mlir::MLIRContext *context, TensorMemoryLayout memLayout);
}];
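The removed getShardShapeArray() accessor is superseded by reading the shard shape from the tensor's layout encoding (or via getShardShape() above, which can convert tile dimensions to scalar dimensions). A minimal sketch of the call-site change, consistent with the TTNNDecomposeLayouts.cpp hunk later in this diff; the variable names here are placeholders:

// Before: shard shape read off the MemoryConfigAttr (accessor removed above).
llvm::ArrayRef<int64_t> oldShardShape = memoryConfigAttr.getShardShapeArray();

// After: shard shape read from the TTNNLayoutAttr encoding of the tensor type.
llvm::SmallVector<int64_t> shardShape = layoutAttr.getShardShape();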
144 changes: 50 additions & 94 deletions lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp
@@ -3,7 +3,6 @@
// SPDX-License-Identifier: Apache-2.0

#include "ttmlir/Conversion/TTIRToTTNN/TTIRToTTNN.h"

#include "ttmlir/Conversion/TTIRToTTNN/Utils.h"
#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
#include "ttmlir/Dialect/TTIR/IR/TTIROps.h"
@@ -13,6 +12,7 @@
#include "ttmlir/Dialect/TTNN/Utils/TransformUtils.h"
#include "ttmlir/Dialect/TTNN/Utils/Utils.h"

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinAttributes.h"
@@ -170,6 +170,9 @@ class ToLayoutOpConversionPattern
rewriter.eraseOp(emptyOp);
}

assert(mlir::isa<mlir::RankedTensorType>(adaptor.getInput().getType()) &&
"Expected RankedTensorType for ToLayoutOp input");

auto outputLayoutAttr = mlir::cast<ttnn::TTNNLayoutAttr>(
op.getResult().getType().getEncoding());

@@ -186,32 +189,6 @@
bool isOutputOnHost = (outputBufferType == ttnn::BufferType::SystemMemory);

RankedTensorType result = mlir::cast<RankedTensorType>(op.getType());
if (!isOutputOnHost) {
// TODO(bug #665):
// Binary ops fail with row major layout in ttnn, defaulting to and
// assuming tile layout for all device tensors...
// Note: mlir doesn't know about this, so tensors may still appear as row
// major in the generated mlir
// TODO(bug #875):
// Remove the following code block once constraints modelling is
// implemented on dialect level
//
// Default to Tile layout unless op supports only RowMajor layout
//
ttnn::Layout newOutputLayoutEnum =
shouldForceRowMajor(op) ? ttnn::Layout::RowMajor : ttnn::Layout::Tile;

// If the layout of the output tensor changed as a result of forcing the
// layout update the tensor type
if (outputLayoutEnum != newOutputLayoutEnum) {
result =
getLayoutForcedResultTensor(rewriter, result, newOutputLayoutEnum);
op.getResult().setType(result);
outputLayoutAttr =
mlir::cast<ttnn::TTNNLayoutAttr>(result.getEncoding());
outputLayoutEnum = newOutputLayoutEnum;
}
}

ttnn::LayoutAttr outputLayout =
ttnn::LayoutAttr::get(rewriter.getContext(), outputLayoutEnum);
Expand All @@ -235,68 +212,6 @@ class ToLayoutOpConversionPattern

return success();
}

private:
bool shouldForceRowMajor(ttir::ToLayoutOp op) const {
// Check if the output tensor is used by an op that only supports row major.
//
// EmbeddingBackwardOp supports row major layout for the first and second
// operands.
for (mlir::Operation *user : op.getResult().getUsers()) {
if (isa<ttir::Conv2dOp>(user) || isa<ttir::SliceOp>(user) ||
(isa<ttir::EmbeddingBackwardOp>(user) &&
(user->getOperand(0) == op || user->getOperand(1) == op))) {
return true;
}
}

return false;
}

RankedTensorType
getLayoutForcedResultTensor(ConversionPatternRewriter &rewriter,
RankedTensorType oldOutput,
ttnn::Layout newOutputLayoutEnum) const {
auto oldOutputLayoutAttr =
mlir::cast<ttnn::TTNNLayoutAttr>(oldOutput.getEncoding());
DataType outputDtype = oldOutputLayoutAttr.getDataType();
SmallVector<std::int64_t> oldShardShape =
oldOutputLayoutAttr.getShardShape();
size_t shardShapeSize = oldShardShape.size();
assert(shardShapeSize >= 2 && "expected at least 2D shape");

if (newOutputLayoutEnum == ttnn::Layout::RowMajor) {
// Set shard shape to match convention of row major layout
auto tileType =
mlir::cast<TileType>(oldOutputLayoutAttr.getElementType());
llvm::SmallVector<int64_t> newShardShape(oldShardShape.begin(),
oldShardShape.end());
newShardShape[shardShapeSize - 2] =
oldShardShape[shardShapeSize - 2] * tileType.getHeight();
newShardShape[shardShapeSize - 1] =
oldShardShape[shardShapeSize - 1] * tileType.getWidth();
Type newElementType = ttnn::utils::createRowMajorTypeFromDtype(
rewriter.getContext(), outputDtype);
RankedTensorType result = RankedTensorType::get(
oldOutput.getShape(), oldOutput.getElementType(),
oldOutputLayoutAttr
.withElementType(rewriter.getContext(), newElementType)
.withShardShape(rewriter.getContext(), newShardShape));
return result;
}

if (newOutputLayoutEnum == ttnn::Layout::Tile) {
TileType tileType =
TileType::get(rewriter.getContext(),
{ttnn::TILE_HEIGHT, ttnn::TILE_WIDTH}, outputDtype);
RankedTensorType result = RankedTensorType::get(
oldOutput.getShape(), oldOutput.getElementType(),
oldOutputLayoutAttr.withElementType(rewriter.getContext(), tileType));
return result;
}

llvm_unreachable("Unreachable code path. Unexpected output layout enum");
}
};
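
For reference, the shard-shape scaling in the removed getLayoutForcedResultTensor above amounts to multiplying the last two shard dimensions by the tile size. A minimal illustration; the 32x32 tile size (ttnn::TILE_HEIGHT, ttnn::TILE_WIDTH) and the concrete values are assumptions for the example only:

// A shard measured in 32x32 tiles...
llvm::SmallVector<int64_t> tileShardShape = {2, 4};
// ...becomes this shard shape in scalar elements when forced to row-major layout.
llvm::SmallVector<int64_t> rowMajorShardShape = {2 * 32, 4 * 32}; // {64, 128}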

template <typename TTIROpTy, typename TTNNOpTy,
@@ -703,24 +618,65 @@ class ConstantOpConversionPattern
matchAndRewrite(ttir::ConstantOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
::mlir::ElementsAttr valueAttr = op.getValue();

LogicalResult legalityResult = checkBasicLegality(op, valueAttr, rewriter);
if (!legalityResult.succeeded()) {
return legalityResult;
}

if (valueAttr.isSplat()) {
Value device = ::ttnn::utils::getOrInsertDevice(rewriter, op);
auto outputType = mlir::cast<RankedTensorType>(op.getType());
ttnn::TTNNLayoutAttr layoutAttr =
mlir::cast<ttnn::TTNNLayoutAttr>(outputType.getEncoding());
float fillValue =
valueAttr.getElementType().isInteger()
? getIntegerValue(valueAttr)
: valueAttr.getSplatValue<mlir::APFloat>().convertToFloat();

::mlir::FloatAttr fillValueAttr = rewriter.getF32FloatAttr(fillValue);
rewriter.replaceOpWithNewOp<ttnn::FullOp>(
op, this->getTypeConverter()->convertType(op.getType()), device,
fillValueAttr);

if (outputType.getRank() > 1 ||
!mlir::isa<TileType>(layoutAttr.getElementType())) {
rewriter.replaceOpWithNewOp<ttnn::FullOp>(
op, this->getTypeConverter()->convertType(outputType), device,
fillValueAttr);
} else {
// ttnn::FullOp does not support 1D tilized tensors.
// If the output of FullOp is a 1D tensor and is tiled,
// we need to create it in row-major layout and then tilize it separately.

// Can't use withElementType because the shard shape would be wrong
ttnn::TTNNLayoutAttr rowMajorLayoutAttr = ttnn::TTNNLayoutAttr::get(
rewriter.getContext(), outputType.getShape(),
layoutAttr.getScalarElementType(), layoutAttr.getBufferType(),
layoutAttr.getGrid(), layoutAttr.getMemLayout());

auto fullOpOutputType = RankedTensorType::get(
outputType.getShape(), outputType.getElementType(),
rowMajorLayoutAttr);
auto fullOp = rewriter.create<ttnn::FullOp>(
op.getLoc(),
this->getTypeConverter()->convertType(fullOpOutputType), device,
fillValueAttr);

// Tilize the fullOp output separately
ttnn::MemoryConfigAttr memConfigAttr =
rewriter.getAttr<ttnn::MemoryConfigAttr>(
rewriter.getAttr<ttnn::BufferTypeAttr>(
layoutAttr.getBufferType()),
rewriter.getAttr<ttnn::ShardSpecAttr>(
rewriter.getAttr<ttnn::ShapeAttr>(
layoutAttr.getShardShape())),
layoutAttr.getMemLayout());

bool isOutputOnHost =
(layoutAttr.getBufferType() == ttnn::BufferType::SystemMemory);

rewriter.replaceOpWithNewOp<ttnn::ToLayoutOp>(
op, this->getTypeConverter()->convertType(op.getType()), fullOp,
ttnn::Layout::Tile,
DataTypeAttr::get(rewriter.getContext(), layoutAttr.getDataType()),
memConfigAttr, isOutputOnHost ? nullptr : device);
}
} else {
return rewriter.notifyMatchFailure(
op, "TTNN doesn't currently support tensor creation from multiple "
4 changes: 2 additions & 2 deletions lib/Dialect/TT/IR/TTOpsTypes.cpp
@@ -128,9 +128,9 @@ mlir::tt::SystemDescAttr::getDefault(MLIRContext *context) {
mlir::tt::SystemDescAttr
mlir::tt::SystemDescAttr::getFromPath(MLIRContext *context, std::string &path) {
// Check if file exists
assert(!path.empty() && "cluster desc path must not be empty!");
assert(!path.empty() && "system desc path must not be empty!");
std::ifstream fbb(path, std::ios::binary | std::ios::ate);
assert(fbb.good() && "cluster desc does not exist!");
assert(fbb.good() && "system desc does not exist!");
std::streampos size = fbb.tellg();
fbb.seekg(0, std::ios::beg);
auto buffer = std::shared_ptr<void>(std::malloc(size), std::free);
19 changes: 19 additions & 0 deletions lib/Dialect/TTNN/Analysis/BFInterleavedPolicy.cpp
@@ -117,6 +117,25 @@ void BFInterleavedPolicy::run() {
scheduler.scheduleOp(nextOpForScheduling);
}

// TODO (#0000): This is a temporary solution.
// Currently, ReturnOps are not considered when calculating L1 usage.
llvm::SmallVector<mlir::Operation *> eraseableL1UsageOps;
for (auto &[op, usage] : currentL1UsagePerOp) {
for (Operation *user : op->getUsers()) {
if (isa<mlir::func::ReturnOp>(user)) {
usage.numOfUnscheduledUsers -= 1;
}
}
if (usage.numOfUnscheduledUsers == 0) {
eraseableL1UsageOps.push_back(op);
}
}

for (Operation *op : eraseableL1UsageOps) {
currentL1Usage -= currentL1UsagePerOp[op].l1MemUsagePerUser;
currentL1UsagePerOp.erase(op);
}

assert(currentL1Usage == 0);
assert(currentL1UsagePerOp.size() == 0);

16 changes: 16 additions & 0 deletions lib/Dialect/TTNN/Transforms/Optimizer.cpp
@@ -224,6 +224,22 @@ class TTNNOptimizer : public impl::TTNNOptimizerBase<TTNNOptimizer> {
// If schedule is set, apply order of operations to func.
//
if (opSchedule[func].size() > 1) {
// TODO (#0000): This is a temporary solution. When defaulting to the DRAM
// tile input/output layout, GetDeviceOp can randomly appear as the last
// op in the graph instead of the first. This workaround ensures
// GetDeviceOp is always at the beginning of the schedule.
// To reproduce, remove this workaround and run
// Silicon/TTNN/optimizer/mnist_sharding.mlir multiple times (the failure is
// non-deterministic).
Operation **it =
std::find_if(opSchedule[func].begin(), opSchedule[func].end(),
[](Operation *op) { return isa<GetDeviceOp>(op); });
if (it != opSchedule[func].end()) {
GetDeviceOp deviceOp = mlir::cast<GetDeviceOp>(*it);
opSchedule[func].erase(it);
opSchedule[func].insert(opSchedule[func].begin(), deviceOp);
}

for (size_t i = 0; i < opSchedule[func].size() - 1; i++) {
Operation *op = opSchedule[func][i];

54 changes: 33 additions & 21 deletions lib/Dialect/TTNN/Transforms/TTNNDecomposeLayouts.cpp
@@ -30,8 +30,11 @@ class TTNNDecomposeLayouts
});
});
for (Operation *op : opsToReplace) {
this->createLayoutConversionOps(mlir::cast<ttnn::ToLayoutOp>(op),
rewriter);
if (failed(createLayoutConversionOps(mlir::cast<ttnn::ToLayoutOp>(op),
rewriter))) {
signalPassFailure();
return;
}
rewriter.eraseOp(op);
}
}
@@ -42,6 +45,7 @@
ttnn::Layout layoutEnum;
DataType dataType;
ttnn::TensorMemoryLayoutAttr tensorMemoryLayout;
GridAttr shardGrid;
llvm::SmallVector<int64_t> shardShape;

ttnn::MemoryConfigAttr createMemoryConfigAttr(MLIRContext *context) const {
@@ -51,7 +55,9 @@
ttnn::ShapeAttr::get(context, shardShape)),
tensorMemoryLayout);
}

bool isL1Sharded() const {
return isShardedMemoryLayout(tensorMemoryLayout.getValue());
}
bool isOnHost() const {
return bufferType == ttnn::BufferType::SystemMemory;
}
@@ -115,6 +121,9 @@
auto inputLayoutAttr =
mlir::cast<TTNNLayoutAttr>(op.getInput().getType().getEncoding());

auto outputLayoutAttr =
mlir::cast<TTNNLayoutAttr>(op.getResult().getType().getEncoding());

assert(op.getMemoryConfig().has_value());
MemoryConfigAttr outputMemoryConfig = op.getMemoryConfig().value();

@@ -131,9 +140,12 @@
input.tensorMemoryLayout = inputLayoutAttr.getMemLayout();
output.tensorMemoryLayout = outputMemoryConfig.getTensorMemoryLayout();

input.shardGrid = inputLayoutAttr.getGrid();
output.shardGrid = outputLayoutAttr.getGrid();

input.shardShape = inputLayoutAttr.getShardShape();
output.shardShape =
llvm::SmallVector<int64_t>{outputMemoryConfig.getShardShapeArray()};
output.shardShape = outputLayoutAttr.getShardShape();

return {input, output};
}

@@ -148,14 +160,6 @@

opsToCreate.createTypecastOp = input.dataType != output.dataType;
opsToCreate.createToLayoutOp = input.layoutEnum != output.layoutEnum;
// TODO(bug #665):
// Insert a ToLayoutOp manually if we're moving from device to host to
// untilize. Since we're hardcoding tile layout, the tensor may be row
// major in mlir, and therefore it would appear as if we don't need to
// untilize
opsToCreate.createToLayoutOp |=
(opsToCreate.createFromDeviceOp and
output.layoutEnum == ttnn::Layout::RowMajor);

// ToDeviceOp can handle the creation of the memory config of the initial
// device tensor
@@ -168,8 +172,10 @@
output.bufferType == ttnn::BufferType::L1) or
(input.bufferType == ttnn::BufferType::L1 and
output.bufferType == ttnn::BufferType::DRAM);
// If shard grids don't match we need to reshard
opsToCreate.createToMemoryConfigOp |=
(input.shardShape != output.shardShape);
(input.isL1Sharded() and output.isL1Sharded() and
input.shardGrid != output.shardGrid);
}
return opsToCreate;
}
@@ -764,24 +770,30 @@
* sizeof(uint32_t). For now, we will always untilize on host. We rarely
* need device to device untilize, so the perf hit should be acceptable.
*/
void createLayoutConversionOps(ttnn::ToLayoutOp op,
IRRewriter &rewriter) const {
mlir::LogicalResult createLayoutConversionOps(ttnn::ToLayoutOp op,
IRRewriter &rewriter) const {
auto [input, output] = getInputOutputLayouts(op);
OpsToCreate opsToCreate = determineRequiredOps(input, output);
assert(isCreationValid(op, input, output, opsToCreate) &&
"Invalid layout conversion");
if (not isCreationValid(op, input, output, opsToCreate)) {
return failure();
}

auto device = op.getDevice();
assert((device || output.isOnHost()) &&
"Op device must be set for output tensors on device");
if (not device and not output.isOnHost()) {
op->emitError("Device not specified for device tensor");
return failure();
}

OpCreationInfo info(device, input, output, opsToCreate);

Value currentInput = op.getInput();

if (input.isOnHost()) {
handleHostInputLayoutConversion(op, rewriter, currentInput, info);
return;
return success();
}
handleDeviceInputLayoutConversion(op, rewriter, currentInput, info);
return success();
}
};
} // namespace mlir::tt::ttnn