From cb3e406c460fae5cb15b7267f71401630b93ced7 Mon Sep 17 00:00:00 2001
From: Marko Rakita
Date: Fri, 20 Dec 2024 23:36:26 +0100
Subject: [PATCH] Add Reduce ops workaround for keepDim=false (#1625)

This PR adds TTNN workarounds for these Metal issues:
- https://github.com/tenstorrent/tt-metal/issues/13361 - by decomposing `reduce(keepDim=false)` into `reduce(keepDim=true) + reshape`
- https://github.com/tenstorrent/tt-metal/issues/16118 - by annulling the dimensions argument when all dims are being reduced

As part of this work I've also:
- Enabled conversion of the `stablehlo.reduce` op with multiple reduce dimensions
- Added reduce ops verifiers in TTIR
- Added a separate function in TTNNWorkarounds to run rewrite patterns for decomposition and layout workarounds
- Added lots of unit tests for reduce ops to cover conversions and verifiers
- Added lots of silicon tests for reduce ops

Opened issue https://github.com/tenstorrent/tt-mlir/issues/1624 (assigned to myself) to revert these workarounds once the Metal issues are fixed.

Closes #805, #848

After implementing these workarounds and running tests, I encountered [another Metal issue](https://github.com/tenstorrent/tt-metal/issues/16104), this time in the `reshape` op. I've debugged it and have a local fix; I will send a PR to fix it in the Metal repo, as confirmed with the reshape op owners. I've opened issue https://github.com/tenstorrent/tt-mlir/issues/1640 to enable the Reduce ops silicon tests once that fix is uplifted.

Another issue I encountered while working on this: when the workaround pass decompositions change the shapes of ops' tensors, their layouts need to change too, but the layout pass runs before the workaround pass. I've managed to solve this by reusing the layout of the input tensor, but I am not sure that is a good solution; we may need to repeat some of the layout logic after the workaround decompositions.
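For illustration, here is how the first workaround decomposes a partial reduction (where not all dims are reduced, so the dims argument is kept and only the reshape is appended). This sketch reuses the shapes from the `test_reduce_add_3to2dim` silicon test below; the `#ttnn_layout` attribute names are illustrative:
```
%1 = "ttnn.sum"(%0) <{dim_arg = [1 : i32], keep_dim = false}> : (tensor<128x10x4xf32, #ttnn_layout1>) -> tensor<128x4xf32, #ttnn_layout1>
```
becomes
```
%1 = "ttnn.sum"(%0) <{dim_arg = [1 : i32], keep_dim = true}> : (tensor<128x10x4xf32, #ttnn_layout1>) -> tensor<128x1x4xf32, #ttnn_layout1>
%2 = "ttnn.reshape"(%1) <{shape = [128 : i32, 4 : i32]}> : (tensor<128x1x4xf32, #ttnn_layout1>) -> tensor<128x4xf32, #ttnn_layout2>
```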
FYI @sdjordjevicTT, here is an example of the TTNN IR before the workarounds:
```
%3 = "ttnn.sum"(%2) <{dim_arg = [0 : i32, 1 : i32, 2 : i32], keep_dim = false}> : (tensor<128x32x4xf32, #ttnn_layout2>) -> tensor<1xf32, #ttnn_layout2>
```
and after the workarounds:
```
%3 = "ttnn.sum"(%2) <{keep_dim = true}> : (tensor<128x32x4xf32, #ttnn_layout2>) -> tensor<1x1x1xf32, #ttnn_layout2>
%4 = "ttnn.reshape"(%3) <{shape = [1 : i32]}> : (tensor<1x1x1xf32, #ttnn_layout2>) -> tensor<1xf32, #ttnn_layout3>
```
--- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 2 + include/ttmlir/Dialect/TTNN/IR/TTNNOps.td | 2 + .../ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td | 1 + .../Decomposition/ReduceOpsRewritePattern.h | 140 ++++++++++++++++++ .../StableHLOToTTIRPatterns.cpp | 7 +- lib/Dialect/TTIR/IR/TTIROps.cpp | 87 +++++++++-- lib/Dialect/TTNN/IR/TTNNOps.cpp | 48 ++++++ lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp | 18 +++ lib/Dialect/TTNN/Transforms/CMakeLists.txt | 3 +- .../Decomposition/ReduceOpsRewritePattern.cpp | 50 +++++++ .../{ => Workarounds}/TTNNWorkarounds.cpp | 76 ++++++---- .../StableHLOToTTIR/reduce_add_op.mlir | 109 +++++++++++++- .../StableHLOToTTIR/reduce_maximum_op.mlir | 109 +++++++++++++- .../reduce_ops/negative_invalid_dim_high.mlir | 9 ++ .../reduce_ops/negative_invalid_dim_low.mlir | 9 ++ .../reduce_ops/negative_repeating_dims.mlir | 9 ++ .../TTNN/reduction/max_op_negative.mlir | 10 ++ .../TTNN/reduction/mean_op_negative.mlir | 10 ++ .../TTNN/reduction/sum_op_negative.mlir | 10 ++ .../Silicon/StableHLO/reduce_add_op.mlir | 106 +++++++++++-- .../Silicon/StableHLO/reduce_maximum_op.mlir | 106 +++++++++++-- test/ttmlir/Silicon/TTNN/simple_max.mlir | 39 +++++ test/ttmlir/Silicon/TTNN/simple_mean.mlir | 37 ++++- test/ttmlir/Silicon/TTNN/simple_sum.mlir | 39 +++++ 24 files changed, 954 insertions(+), 82 deletions(-) create mode 100644 include/ttmlir/Dialect/TTNN/Transforms/Workarounds/Decomposition/ReduceOpsRewritePattern.h create mode 100644 lib/Dialect/TTNN/Transforms/Workarounds/Decomposition/ReduceOpsRewritePattern.cpp rename lib/Dialect/TTNN/Transforms/{ => Workarounds}/TTNNWorkarounds.cpp (88%) create mode 100644 test/ttmlir/Dialect/TTIR/reduce_ops/negative_invalid_dim_high.mlir create mode 100644 test/ttmlir/Dialect/TTIR/reduce_ops/negative_invalid_dim_low.mlir create mode 100644 test/ttmlir/Dialect/TTIR/reduce_ops/negative_repeating_dims.mlir create mode 100644 test/ttmlir/Dialect/TTNN/reduction/max_op_negative.mlir create mode 100644 test/ttmlir/Dialect/TTNN/reduction/mean_op_negative.mlir create mode 100644 test/ttmlir/Dialect/TTNN/reduction/sum_op_negative.mlir create mode 100644 test/ttmlir/Silicon/TTNN/simple_max.mlir create mode 100644 test/ttmlir/Silicon/TTNN/simple_sum.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index 842d353626..b571287c82 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -651,6 +651,8 @@ class TTIR_ReductionOp<string mnemonic, list<Trait> traits = []> : return {builder.getAffineMapArrayAttr(indexingMaps), builder.getArrayAttr(iteratorTypes)};} }]; + + let hasVerifier = 1; } def TTIR_SumOp : TTIR_ReductionOp<"sum"> { diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td index b915959474..0d1d235bb8 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td @@ -581,6 +581,8 @@ class TTNN_ReductionOp<string mnemonic, list<Trait> traits = []> : TTNN_Op<mnemonic, traits> { OptionalAttr<I32ArrayAttr>:$dim_arg); let results = (outs AnyRankedTensor:$result); + + let hasVerifier = 1; } def
TTNN_SumOp : TTNN_ReductionOp<"sum"> { diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td index 94d05eadcb..8d20a2bcc5 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td @@ -145,6 +145,7 @@ def TTNN_TTNNLayoutAttr: TTNN_Attr<"TTNNLayout", "ttnn_layout"> { TTNNLayoutAttr withMemoryLayout(::mlir::MLIRContext *context, TensorMemoryLayoutAttr memLayoutAttr); TTNNLayoutAttr withMemoryLayout(::mlir::MLIRContext *context, TensorMemoryLayout memLayout); TTNNLayoutAttr withShardShape(::mlir::MLIRContext *context, llvm::SmallVector<int64_t> shardShape); + TTNNLayoutAttr withTensorShape(::mlir::MLIRContext *context, ArrayRef<int64_t> tensorShape); bool isSystemBufferType() const { return ::mlir::tt::ttnn::isSystemBufferType(getBufferType()); } bool isDeviceBufferType() const { return ::mlir::tt::ttnn::isDeviceBufferType(getBufferType()); } diff --git a/include/ttmlir/Dialect/TTNN/Transforms/Workarounds/Decomposition/ReduceOpsRewritePattern.h b/include/ttmlir/Dialect/TTNN/Transforms/Workarounds/Decomposition/ReduceOpsRewritePattern.h new file mode 100644 index 0000000000..741fbfc068 --- /dev/null +++ b/include/ttmlir/Dialect/TTNN/Transforms/Workarounds/Decomposition/ReduceOpsRewritePattern.h @@ -0,0 +1,140 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTMLIR_DIALECT_TTNN_TRANSFORMS_WORKAROUNDS_DECOMPOSITION_REDUCEOPSREWRITEPATTERN_H +#define TTMLIR_DIALECT_TTNN_TRANSFORMS_WORKAROUNDS_DECOMPOSITION_REDUCEOPSREWRITEPATTERN_H + +#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" + +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Support/LogicalResult.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" + +namespace mlir::tt::ttnn::workarounds::decomposition { + +// Extracts reduce dimensions' values from the dimArg attribute. If dimArg is +// not specified, returns an empty vector. +llvm::SmallVector<int64_t> +getReduceDims(const std::optional<mlir::ArrayAttr> &dimArg); + +// Calculates the output shape of the new Reduce op created in the workaround, +// based on the input shape and the reduce dimensions. +llvm::SmallVector<int64_t> +calculateNewReduceShape(RankedTensorType inputType, + const std::optional<mlir::ArrayAttr> &dimArg); + +// This workaround addresses the following Metal issue: +// https://github.com/tenstorrent/tt-metal/issues/13361 +// +// TODO(mrakita): Remove this workaround once these Metal issues are fixed +// (tracked by https://github.com/tenstorrent/tt-mlir/issues/1624). +// +template <typename ReduceOp> +class ReduceOpsKeepDimRewritePattern : public OpRewritePattern<ReduceOp> { +public: + using OpRewritePattern<ReduceOp>::OpRewritePattern; + + LogicalResult matchAndRewrite(ReduceOp srcOp, + PatternRewriter &rewriter) const override { + if (srcOp.getKeepDim()) { + return failure(); + } + + RankedTensorType inputType = srcOp.getInput().getType(); + RankedTensorType outputType = srcOp.getResult().getType(); + + ReduceOp newReduceOp = + createReduceOpWithKeepDim(srcOp, rewriter, inputType, outputType); + + // The Metal TTNN implementation of Reduce ops doesn't yet support + // keepDim=false. As a workaround, we convert the Reduce op into a + // combination of a Reduce op with keepDim=true and a Reshape op that + // removes the reduced dims, so that the rest of the graph is not affected. + // When this is not needed (for example because type converters already + // promoted the rank of the op result), we avoid adding an unnecessary + // Reshape op.
+ if (outputType.getShape().size() < inputType.getShape().size()) { + replaceOpWithReshapeOp(srcOp, newReduceOp, rewriter, outputType); + } else { + rewriter.replaceOp(srcOp, newReduceOp); + } + + return success(); + } + +private: + ReduceOp createReduceOpWithKeepDim(ReduceOp srcOp, PatternRewriter &rewriter, + RankedTensorType inputType, + RankedTensorType outputType) const { + llvm::SmallVector<int64_t> outputShapeVec = + calculateNewReduceShape(inputType, srcOp.getDimArg()); + + TTNNLayoutAttr newOutputLayoutAttr = + mlir::cast<TTNNLayoutAttr>(outputType.getEncoding()) + .withTensorShape(rewriter.getContext(), outputShapeVec); + + RankedTensorType newOutputType = RankedTensorType::get( + outputShapeVec, outputType.getElementType(), newOutputLayoutAttr); + + return rewriter.create<ReduceOp>(srcOp.getLoc(), newOutputType, + srcOp.getInput(), true /*keep_dim*/, + srcOp.getDimArg().value_or(nullptr)); + } + + void replaceOpWithReshapeOp(ReduceOp srcOp, ReduceOp newReduceOp, + PatternRewriter &rewriter, + RankedTensorType outputType) const { + mlir::ArrayAttr shapeAttr = rewriter.getI32ArrayAttr( + llvm::SmallVector<int32_t>(outputType.getShape())); + + rewriter.replaceOpWithNewOp<ttnn::ReshapeOp>( + srcOp, outputType, newReduceOp, shapeAttr); + } +}; + +// This workaround addresses the following Metal issue: +// https://github.com/tenstorrent/tt-metal/issues/16118 +// +// TODO(mrakita): Remove this workaround once these Metal issues are fixed +// (tracked by https://github.com/tenstorrent/tt-mlir/issues/1624). +// +template <typename ReduceOp> +class ReduceOpsAllDimsRewritePattern : public OpRewritePattern<ReduceOp> { +public: + using OpRewritePattern<ReduceOp>::OpRewritePattern; + + LogicalResult matchAndRewrite(ReduceOp srcOp, + PatternRewriter &rewriter) const override { + if (!srcOp.getDimArg() || srcOp.getDimArg()->empty()) { + return failure(); + } + + llvm::SmallVector<int64_t> reduceDims = getReduceDims(srcOp.getDimArg()); + llvm::SmallSet<int64_t, 4> uniqueReduceDims(reduceDims.begin(), + reduceDims.end()); + + // Check if reduce is done over all dimensions of the input tensor. + if (uniqueReduceDims.size() != + srcOp.getInput().getType().getShape().size()) { + return failure(); + } + + // When the reduce is done over all dimensions of the input, we need to + // unset the dimensions attribute, because Metal supports reduce over all + // dimensions for any tensor rank when reduce dimensions are not specified, + // but it doesn't support reduce for tensors with rank larger than 2 when + // reduce dimensions are specified. + rewriter.replaceOpWithNewOp<ReduceOp>(srcOp, srcOp.getResult().getType(), + srcOp.getInput(), srcOp.getKeepDim(), + nullptr); + + return success(); + } +}; + +} // namespace mlir::tt::ttnn::workarounds::decomposition + +#endif // TTMLIR_DIALECT_TTNN_TRANSFORMS_WORKAROUNDS_DECOMPOSITION_REDUCEOPSREWRITEPATTERN_H diff --git a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp index 4eeec92dcd..cdab2a4b70 100644 --- a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp +++ b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp @@ -116,10 +116,9 @@ class StableHLOToTTIRReduceOpConversionPattern tensor::EmptyOp outputTensor = rewriter.create<tensor::EmptyOp>( srcOp.getLoc(), outputType.getShape(), outputType.getElementType()); - mlir::ArrayAttr dimArg = rewriter.getArrayAttr(SmallVector<Attribute>( - 1, rewriter.getI32IntegerAttr(adaptor.getDimensionsAttr().size() > 0 - ? adaptor.getDimensionsAttr()[0] - : 1))); + // Can't reuse the original dimensions attribute because it uses i64 type.
+ mlir::ArrayAttr dimArg = rewriter.getI32ArrayAttr( + llvm::SmallVector<int32_t>(srcOp.getDimensions())); rewriter.replaceOpWithNewOp<DestOp>( srcOp, outputType, adaptor.getInputs().front(), outputTensor, diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 52e68b8113..83bb98baa5 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -18,6 +18,7 @@ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Location.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/LogicalResult.h" @@ -1672,32 +1673,32 @@ static void buildGenericEltwiseUnaryRegion(::mlir::Location loc, opBuilder.create<mlir::tt::ttir::YieldOp>(loc, mlir::ValueRange({result})); } -// AddOp generic region builder +// AddOp generic region builder. void mlir::tt::ttir::AddOp::buildGenericRegion(::mlir::OpBuilder &opBuilder, ::mlir::Block *block) { buildGenericEltwiseBinaryRegion<arith::AddFOp>(getLoc(), opBuilder, block); } -// MultiplyOp generic region builder +// MultiplyOp generic region builder. void mlir::tt::ttir::MultiplyOp::buildGenericRegion( ::mlir::OpBuilder &opBuilder, ::mlir::Block *block) { buildGenericEltwiseBinaryRegion<arith::MulFOp>(getLoc(), opBuilder, block); } -// ExpOp generic region builder +// ExpOp generic region builder. void mlir::tt::ttir::ExpOp::buildGenericRegion(::mlir::OpBuilder &opBuilder, ::mlir::Block *block) { buildGenericEltwiseUnaryRegion<math::ExpOp>(getLoc(), opBuilder, block); } -// DivOp generic region builder +// DivOp generic region builder. void mlir::tt::ttir::DivOp::buildGenericRegion(::mlir::OpBuilder &opBuilder, ::mlir::Block *block) { return buildGenericEltwiseBinaryRegion<arith::DivFOp>(getLoc(), opBuilder, block); } -// MaximumOp generic region builder +// MaximumOp generic region builder. void mlir::tt::ttir::MaximumOp::buildGenericRegion(::mlir::OpBuilder &opBuilder, ::mlir::Block *block) { buildGenericEltwiseBinaryRegion<arith::MaximumFOp>(getLoc(), opBuilder, @@ -1708,7 +1709,7 @@ void mlir::tt::ttir::MaximumOp::buildGenericRegion(::mlir::OpBuilder &opBuilder, // KernelOp //===----------------------------------------------------------------------===// -// KernelOp builders +// KernelOp builders. static mlir::tt::ttir::KernelOp buildKernelOp(::mlir::OpBuilder &opBuilder, ::mlir::Location loc, ::mlir::StringRef kernelName, ::mlir::StringRef kernelKind, @@ -1717,7 +1718,7 @@ buildKernelOp(::mlir::OpBuilder &opBuilder, ::mlir::Location loc, loc, outputs.getTypes(), kernelName, kernelKind, inputs, outputs); } -// Reduce op kernel builder +// Reduce op kernel builder. static void createReduceOp(::mlir::OpBuilder &opBuilder, ::mlir::Block *block, mlir::Location loc, ::mlir::StringRef kernelKind) { auto kernelOp = buildKernelOp(opBuilder, loc, "reduce", kernelKind, @@ -1725,23 +1726,81 @@ static void createReduceOp(::mlir::OpBuilder &opBuilder, ::mlir::Block *block, opBuilder.create<mlir::tt::ttir::YieldOp>(loc, kernelOp->getResults()); } -// Sum op kernel builder -void mlir::tt::ttir::SumOp::buildGenericRegion(::mlir::OpBuilder &opBuilder, +// Common verifier for all Reduce ops.
+static mlir::LogicalResult +verifyReduceOp(mlir::Operation *reduceOp, mlir::RankedTensorType inputType, + const std::optional<mlir::ArrayAttr> &reduceDims) { + if (!reduceDims) { + return mlir::success(); + } + + int64_t inputTensorRank = inputType.getRank(); + + llvm::SmallSet<int64_t, 4> uniqueReduceDims; + for (mlir::Attribute reduceDim : *reduceDims) { + int64_t reduceDimInt = mlir::cast<mlir::IntegerAttr>(reduceDim).getInt(); + if (reduceDimInt < -inputTensorRank || reduceDimInt >= inputTensorRank) { + return reduceOp->emitOpError("Reduce dimensions are out of range"); + } + uniqueReduceDims.insert(reduceDimInt); + } + + if (uniqueReduceDims.size() != reduceDims->size()) { + return reduceOp->emitOpError("Reduce dimensions are not unique"); + } + + // TODO(mrakita): Add a check that, based on inputShape, reduceDims and + // keepDim, computes the expected output shape and verifies that it matches + // the actual output shape. Tracked by: + // https://github.com/tenstorrent/tt-mlir/issues/1639 + + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// MaxOp +//===----------------------------------------------------------------------===// + +// MaxOp kernel builder. +void mlir::tt::ttir::MaxOp::buildGenericRegion(::mlir::OpBuilder &opBuilder, ::mlir::Block *block) { // NOLINTNEXTLINE - createReduceOp(opBuilder, block, getLoc(), "sum"); + createReduceOp(opBuilder, block, getLoc(), "max"); +} + +// MaxOp verification. +::mlir::LogicalResult mlir::tt::ttir::MaxOp::verify() { + return verifyReduceOp(getOperation(), getInput().getType(), getDimArg()); } -// Mean op kernel builder +//===----------------------------------------------------------------------===// +// MeanOp +//===----------------------------------------------------------------------===// + +// MeanOp kernel builder. void mlir::tt::ttir::MeanOp::buildGenericRegion(::mlir::OpBuilder &opBuilder, ::mlir::Block *block) { // NOLINTNEXTLINE createReduceOp(opBuilder, block, getLoc(), "mean"); } -// Max op kernel builder -void mlir::tt::ttir::MaxOp::buildGenericRegion(::mlir::OpBuilder &opBuilder, +// MeanOp verification. +::mlir::LogicalResult mlir::tt::ttir::MeanOp::verify() { + return verifyReduceOp(getOperation(), getInput().getType(), getDimArg()); +} + +//===----------------------------------------------------------------------===// +// SumOp +//===----------------------------------------------------------------------===// + +// SumOp kernel builder. +void mlir::tt::ttir::SumOp::buildGenericRegion(::mlir::OpBuilder &opBuilder, ::mlir::Block *block) { // NOLINTNEXTLINE - createReduceOp(opBuilder, block, getLoc(), "max"); + createReduceOp(opBuilder, block, getLoc(), "sum"); +} + +// SumOp verification. +::mlir::LogicalResult mlir::tt::ttir::SumOp::verify() { + return verifyReduceOp(getOperation(), getInput().getType(), getDimArg()); } diff --git a/lib/Dialect/TTNN/IR/TTNNOps.cpp b/lib/Dialect/TTNN/IR/TTNNOps.cpp index e3fc5a33c0..286393858d 100644 --- a/lib/Dialect/TTNN/IR/TTNNOps.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOps.cpp @@ -1310,4 +1310,52 @@ ::mlir::LogicalResult mlir::tt::ttnn::PermuteOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// Reduction ops +//===----------------------------------------------------------------------===// + +// Common verifier for all Reduction ops.
+static mlir::LogicalResult +verifyReduceOp(mlir::Operation *reduceOp, mlir::RankedTensorType inputType, + const std::optional<mlir::ArrayAttr> &reduceDims) { + int64_t inputTensorRank = inputType.getRank(); + + // TODO(mrakita): Only the last two dimensions can be reduced; add a check + // for that too. + if (reduceDims && reduceDims->size() > 2 && + static_cast<int64_t>(reduceDims->size()) != inputTensorRank) { + return reduceOp->emitOpError("Reduce on more than two dimensions is not " + "currently supported by TTNN"); + } + + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// MaxOp +//===----------------------------------------------------------------------===// + +// MaxOp verification. +::mlir::LogicalResult MaxOp::verify() { + return verifyReduceOp(getOperation(), getInput().getType(), getDimArg()); +} + +//===----------------------------------------------------------------------===// +// MeanOp +//===----------------------------------------------------------------------===// + +// MeanOp verification. +::mlir::LogicalResult MeanOp::verify() { + return verifyReduceOp(getOperation(), getInput().getType(), getDimArg()); +} + +//===----------------------------------------------------------------------===// +// SumOp +//===----------------------------------------------------------------------===// + +// SumOp verification. +::mlir::LogicalResult SumOp::verify() { + return verifyReduceOp(getOperation(), getInput().getType(), getDimArg()); +} + } // namespace mlir::tt::ttnn diff --git a/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp b/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp index c7bf769ddc..d16a748226 100644 --- a/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp @@ -494,6 +494,24 @@ TTNNLayoutAttr::withShardShape(::mlir::MLIRContext *context, getMemLayout()); } +// Construct a new TTNNLayoutAttr +// +// This function creates a deep copy of the current TTNNLayoutAttr and +// applies the changes necessary to fit the new tensor shape. +// +// param context The MLIR context. +// param tensorShape The new tensor shape. +// return The new TTNNLayoutAttr with the given tensor shape. +TTNNLayoutAttr TTNNLayoutAttr::withTensorShape(::mlir::MLIRContext *context, + ArrayRef<int64_t> tensorShape) { + // TODO(mrakita): This leaves the default value of the collapseIntervals + // parameter, which might be different from the original value used to create + // the layout attribute. This works for now since we always use the default + // value, but in the future we will need to take this into account. + return TTNNLayoutAttr::get(context, tensorShape, getElementType(), + getBufferType(), getGrid(), getMemLayout()); +} + // Construct a new TTNNLayoutAttr // // This function constructs a new TTNNLayoutAttr with the given parameters.
diff --git a/lib/Dialect/TTNN/Transforms/CMakeLists.txt b/lib/Dialect/TTNN/Transforms/CMakeLists.txt index fd21e03d0c..1aae802c62 100644 --- a/lib/Dialect/TTNN/Transforms/CMakeLists.txt +++ b/lib/Dialect/TTNN/Transforms/CMakeLists.txt @@ -3,7 +3,8 @@ add_mlir_dialect_library(MLIRTTNNTransforms Passes.cpp TTNNLayout.cpp TTNNToCpp.cpp - TTNNWorkarounds.cpp + Workarounds/Decomposition/ReduceOpsRewritePattern.cpp + Workarounds/TTNNWorkarounds.cpp ADDITIONAL_HEADER_DIRS ${PROJECT_SOURCE_DIR}/include/ttmlir diff --git a/lib/Dialect/TTNN/Transforms/Workarounds/Decomposition/ReduceOpsRewritePattern.cpp b/lib/Dialect/TTNN/Transforms/Workarounds/Decomposition/ReduceOpsRewritePattern.cpp new file mode 100644 index 0000000000..99b61ef0b4 --- /dev/null +++ b/lib/Dialect/TTNN/Transforms/Workarounds/Decomposition/ReduceOpsRewritePattern.cpp @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttmlir/Dialect/TTNN/Transforms/Workarounds/Decomposition/ReduceOpsRewritePattern.h" + +#include <algorithm> + +namespace mlir::tt::ttnn::workarounds::decomposition { + +llvm::SmallVector<int64_t> +getReduceDims(const std::optional<mlir::ArrayAttr> &dimArg) { + llvm::SmallVector<int64_t> reduceDims; + if (!dimArg) { + return reduceDims; + } + + for (const mlir::Attribute &reduceDim : *dimArg) { + reduceDims.push_back(mlir::cast<mlir::IntegerAttr>(reduceDim).getInt()); + } + + return reduceDims; +} + +llvm::SmallVector<int64_t> +calculateNewReduceShape(RankedTensorType inputType, + const std::optional<mlir::ArrayAttr> &dimArg) { + llvm::SmallVector<int64_t> outputShapeVec(inputType.getShape()); + llvm::SmallVector<int64_t> reduceDims = getReduceDims(dimArg); + + if (reduceDims.empty()) { + // When reduce dimensions are not specified, we are reducing over all + // dimensions, so all dimensions of the output shape become 1. + std::fill(outputShapeVec.begin(), outputShapeVec.end(), 1); + } else { + // Dimensions can be specified as negative numbers, so to calculate the + // index in the output shape vector we need to add the output shape rank + // to them. + int64_t outputShapeRank = static_cast<int64_t>(outputShapeVec.size()); + for (const int64_t reduceDim : reduceDims) { + int64_t outputShapeIndex = + reduceDim < 0 ? outputShapeRank + reduceDim : reduceDim; + outputShapeVec[static_cast<size_t>(outputShapeIndex)] = 1; + } + } + + return outputShapeVec; +} + +} // namespace mlir::tt::ttnn::workarounds::decomposition diff --git a/lib/Dialect/TTNN/Transforms/TTNNWorkarounds.cpp b/lib/Dialect/TTNN/Transforms/Workarounds/TTNNWorkarounds.cpp similarity index 88% rename from lib/Dialect/TTNN/Transforms/TTNNWorkarounds.cpp rename to lib/Dialect/TTNN/Transforms/Workarounds/TTNNWorkarounds.cpp index 2c0c48dbcc..eed6af498b 100644 --- a/lib/Dialect/TTNN/Transforms/TTNNWorkarounds.cpp +++ b/lib/Dialect/TTNN/Transforms/Workarounds/TTNNWorkarounds.cpp @@ -8,6 +8,7 @@ #include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" #include "ttmlir/Dialect/TTNN/IR/TTNNWorkarounds.h" +#include "ttmlir/Dialect/TTNN/Transforms/Workarounds/Decomposition/ReduceOpsRewritePattern.h" #include "ttmlir/Dialect/TTNN/Types/Types.h" #include "ttmlir/Dialect/TTNN/Utils/TransformUtils.h" #include "ttmlir/Dialect/TTNN/Utils/Utils.h" @@ -399,44 +400,55 @@ class TTNNWorkarounds : public impl::TTNNWorkaroundsBase<TTNNWorkarounds> { void runOnOperation() final { if (decompositionWorkaroundsEnabled) { - // Placeholder for workaround decomposition patterns.
RewritePatternSet patterns(&getContext()); - patterns.add(&getContext()); - - FrozenRewritePatternSet patternSet(std::move(patterns)); - GreedyRewriteConfig config = GreedyRewriteConfig(); - config.useTopDownTraversal = true; - config.maxIterations = GreedyRewriteConfig::kNoLimit; - if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet, - config))) { - signalPassFailure(); - return; - } + patterns.add<workarounds::decomposition::ReduceOpsKeepDimRewritePattern< + ttnn::SumOp>, + workarounds::decomposition::ReduceOpsKeepDimRewritePattern< + ttnn::MaxOp>, + workarounds::decomposition::ReduceOpsKeepDimRewritePattern< + ttnn::MeanOp>, + workarounds::decomposition::ReduceOpsAllDimsRewritePattern< + ttnn::SumOp>, + workarounds::decomposition::ReduceOpsAllDimsRewritePattern< + ttnn::MaxOp>, + workarounds::decomposition::ReduceOpsAllDimsRewritePattern< + ttnn::MeanOp>>(&getContext()); + + runRewritePatterns(std::move(patterns), + GreedyRewriteConfig::kNoLimit /*maxIterations*/); } if (layouotWorkaroundsEnabled) { RewritePatternSet patterns(&getContext()); patterns.add(&getContext()); - FrozenRewritePatternSet patternSet(std::move(patterns)); - GreedyRewriteConfig config = GreedyRewriteConfig(); - // This configuration specifies that the rewriter should traverse the IR - // in a top-down order. - config.useTopDownTraversal = true; - // This configuration specifies the maximum number of iterations the - // rewriter will perform on the IR. The rewriter will iterate through the - // IR until a fixpoint is reached. All workarounds should be applied - // during the first iteration. If the workarounds are not applied in the - // first iteration, it indicates a bug in the workarounds implementation. - // Although the workarounds are applied in the first iteration, the - // rewriter must iterate through the IR once more to confirm that the - // fixpoint is reached. If the fixpoint is not reached in the second - // iteration, it indicates a bug in the workarounds implementation. - config.maxIterations = 2; - if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet, - config))) { - signalPassFailure(); - return; - } + // All layout workarounds should be applied during the first iteration. If + // the workarounds are not applied in the first iteration, it indicates a + // bug in the workarounds implementation. Although the workarounds are + // applied in the first iteration, the rewriter must iterate through the + // IR once more to confirm that the fixpoint is reached. If the fixpoint + // is not reached in the second iteration, it indicates a bug in the + // workarounds implementation. + const int64_t maxIterations = 2; + runRewritePatterns(std::move(patterns), maxIterations); + } + } + +private: + // Runs rewrite patterns with the specified maximum number of iterations the + // rewriter will perform on the IR. The rewriter will iterate through the IR + // until a fixpoint is reached. + void runRewritePatterns(RewritePatternSet &&patterns, int64_t maxIterations) { + FrozenRewritePatternSet patternSet(std::move(patterns)); + GreedyRewriteConfig config = GreedyRewriteConfig(); + config.maxIterations = maxIterations; + // This configuration specifies that the rewriter should traverse the IR + // in a top-down order.
+ config.useTopDownTraversal = true; + if (failed( applyPatternsAndFoldGreedily(getOperation(), patternSet, config))) { signalPassFailure(); return; } } }; diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/reduce_add_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/reduce_add_op.mlir index b7058b3dea..66f3ce4e10 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/reduce_add_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/reduce_add_op.mlir @@ -1,10 +1,113 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s module @jit_reduce_add attributes {} { - func.func public @test_reduce_add(%arg0: tensor<128x10xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { + func.func public @test_reduce_add_4to3dim(%arg0: tensor<128x10x32x4xf32>, %cst_0: tensor<f32>) -> tensor<128x32x4xf32> { + // CHECK: tensor.empty + // CHECK: "ttir.sum" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x32x4xf32> + // CHECK-SAME: -> tensor<128x32x4xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [1] : (tensor<128x10x32x4xf32>, tensor<f32>) -> tensor<128x32x4xf32> + return %0 : tensor<128x32x4xf32> + } + + func.func public @test_reduce_add_4to2dim(%arg0: tensor<128x10x32x4xf32>, %cst_0: tensor<f32>) -> tensor<128x32xf32> { + // CHECK: tensor.empty + // CHECK: "ttir.sum" + // CHECK-SAME: dim_arg = [1 : i32, 3 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x32x4xf32> + // CHECK-SAME: -> tensor<128x32xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [1, 3] : (tensor<128x10x32x4xf32>, tensor<f32>) -> tensor<128x32xf32> + return %0 : tensor<128x32xf32> + } + + func.func public @test_reduce_add_4to1dim(%arg0: tensor<128x10x32x4xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { + // CHECK: tensor.empty + // CHECK: "ttir.sum" + // CHECK-SAME: dim_arg = [1 : i32, 2 : i32, 3 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x32x4xf32> + // CHECK-SAME: -> tensor<128xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [1, 2, 3] : (tensor<128x10x32x4xf32>, tensor<f32>) -> tensor<128xf32> + return %0 : tensor<128xf32> + } + + func.func public @test_reduce_add_4to0dim(%arg0: tensor<128x10x32x4xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: tensor.empty + // CHECK: "ttir.sum" + // CHECK-SAME: dim_arg = [0 : i32, 1 : i32, 2 : i32, 3 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x32x4xf32> + // CHECK-SAME: -> tensor<1xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [0, 1, 2, 3] : (tensor<128x10x32x4xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } + + func.func public @test_reduce_add_3to2dim(%arg0: tensor<128x10x4xf32>, %cst_0: tensor<f32>) -> tensor<128x4xf32> { + // CHECK: tensor.empty + // CHECK: "ttir.sum" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x4xf32> + // CHECK-SAME: -> tensor<128x4xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [1] : (tensor<128x10x4xf32>, tensor<f32>) -> tensor<128x4xf32> + return %0 : tensor<128x4xf32> + } + + func.func public @test_reduce_add_3to1dim(%arg0: tensor<128x10x4xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { + // CHECK: tensor.empty + // CHECK: "ttir.sum" + // CHECK-SAME: dim_arg = [1 : i32, 2 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x4xf32> + // CHECK-SAME: -> tensor<128xf32> + %0 =
stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [1, 2] : (tensor<128x10x4xf32>, tensor<f32>) -> tensor<128xf32> + return %0 : tensor<128xf32> + } + + func.func public @test_reduce_add_3to0dim(%arg0: tensor<128x10x4xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: tensor.empty + // CHECK: "ttir.sum" + // CHECK-SAME: dim_arg = [0 : i32, 1 : i32, 2 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x4xf32> + // CHECK-SAME: -> tensor<1xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [0, 1, 2] : (tensor<128x10x4xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } + + func.func public @test_reduce_add_2to1dim(%arg0: tensor<128x10xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { + // CHECK: tensor.empty + // CHECK: "ttir.sum" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10xf32> + // CHECK-SAME: -> tensor<128xf32> %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [1] : (tensor<128x10xf32>, tensor<f32>) -> tensor<128xf32> - // CHECK: %[[C:.*]] = tensor.empty[[C:.*]] - // CHECK: %[[C:.*]] = "ttir.sum"[[C:.*]] return %0 : tensor<128xf32> } + + func.func public @test_reduce_add_2to0dim(%arg0: tensor<128x10xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: tensor.empty + // CHECK: "ttir.sum" + // CHECK-SAME: dim_arg = [0 : i32, 1 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10xf32> + // CHECK-SAME: -> tensor<1xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [0, 1] : (tensor<128x10xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } + + func.func public @test_reduce_add_1to0dim(%arg0: tensor<128xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: tensor.empty + // CHECK: "ttir.sum" + // CHECK-SAME: dim_arg = [0 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128xf32> + // CHECK-SAME: -> tensor<1xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [0] : (tensor<128xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } } diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/reduce_maximum_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/reduce_maximum_op.mlir index ea03c57669..81fb59bfb8 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/reduce_maximum_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/reduce_maximum_op.mlir @@ -1,10 +1,113 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s module @jit_reduce_maximum attributes {} { - func.func public @test_reduce_maximum(%arg0: tensor<128x10xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { + func.func public @test_reduce_maximum_4to3dim(%arg0: tensor<128x10x32x4xf32>, %cst_0: tensor<f32>) -> tensor<128x32x4xf32> { + // CHECK: tensor.empty + // CHECK: "ttir.max" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x32x4xf32> + // CHECK-SAME: -> tensor<128x32x4xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [1] : (tensor<128x10x32x4xf32>, tensor<f32>) -> tensor<128x32x4xf32> + return %0 : tensor<128x32x4xf32> + } + + func.func public @test_reduce_maximum_4to2dim(%arg0: tensor<128x10x32x4xf32>, %cst_0: tensor<f32>) -> tensor<128x32xf32> { + // CHECK: tensor.empty + // CHECK: "ttir.max" + // CHECK-SAME: dim_arg = [1 : i32, 3 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x32x4xf32> + // CHECK-SAME: -> tensor<128x32xf32> + %0 = stablehlo.reduce(%arg0 init:
%cst_0) applies stablehlo.maximum across dimensions = [1, 3] : (tensor<128x10x32x4xf32>, tensor<f32>) -> tensor<128x32xf32> + return %0 : tensor<128x32xf32> + } + + func.func public @test_reduce_maximum_4to1dim(%arg0: tensor<128x10x32x4xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { + // CHECK: tensor.empty + // CHECK: "ttir.max" + // CHECK-SAME: dim_arg = [1 : i32, 2 : i32, 3 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x32x4xf32> + // CHECK-SAME: -> tensor<128xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [1, 2, 3] : (tensor<128x10x32x4xf32>, tensor<f32>) -> tensor<128xf32> + return %0 : tensor<128xf32> + } + + func.func public @test_reduce_maximum_4to0dim(%arg0: tensor<128x10x32x4xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: tensor.empty + // CHECK: "ttir.max" + // CHECK-SAME: dim_arg = [0 : i32, 1 : i32, 2 : i32, 3 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x32x4xf32> + // CHECK-SAME: -> tensor<1xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [0, 1, 2, 3] : (tensor<128x10x32x4xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } + + func.func public @test_reduce_maximum_3to2dim(%arg0: tensor<128x10x4xf32>, %cst_0: tensor<f32>) -> tensor<128x4xf32> { + // CHECK: tensor.empty + // CHECK: "ttir.max" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x4xf32> + // CHECK-SAME: -> tensor<128x4xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [1] : (tensor<128x10x4xf32>, tensor<f32>) -> tensor<128x4xf32> + return %0 : tensor<128x4xf32> + } + + func.func public @test_reduce_maximum_3to1dim(%arg0: tensor<128x10x4xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { + // CHECK: tensor.empty + // CHECK: "ttir.max" + // CHECK-SAME: dim_arg = [1 : i32, 2 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x4xf32> + // CHECK-SAME: -> tensor<128xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [1, 2] : (tensor<128x10x4xf32>, tensor<f32>) -> tensor<128xf32> + return %0 : tensor<128xf32> + } + + func.func public @test_reduce_maximum_3to0dim(%arg0: tensor<128x10x4xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: tensor.empty + // CHECK: "ttir.max" + // CHECK-SAME: dim_arg = [0 : i32, 1 : i32, 2 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10x4xf32> + // CHECK-SAME: -> tensor<1xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [0, 1, 2] : (tensor<128x10x4xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } + + func.func public @test_reduce_maximum_2to1dim(%arg0: tensor<128x10xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { + // CHECK: tensor.empty + // CHECK: "ttir.max" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10xf32> + // CHECK-SAME: -> tensor<128xf32> %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [1] : (tensor<128x10xf32>, tensor<f32>) -> tensor<128xf32> - // CHECK: %[[C:.*]] = tensor.empty[[C:.*]] - // CHECK: %[[C:.*]] = "ttir.max"[[C:.*]] return %0 : tensor<128xf32> } + + func.func public @test_reduce_maximum_2to0dim(%arg0: tensor<128x10xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: tensor.empty + // CHECK: "ttir.max" + // CHECK-SAME: dim_arg = [0 : i32, 1 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128x10xf32> + // CHECK-SAME: -> tensor<1xf32> + %0 =
stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [0, 1] : (tensor<128x10xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } + + func.func public @test_reduce_maximum_1to0dim(%arg0: tensor<128xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: tensor.empty + // CHECK: "ttir.max" + // CHECK-SAME: dim_arg = [0 : i32] + // CHECK-SAME: keep_dim = false + // CHECK-SAME: tensor<128xf32> + // CHECK-SAME: -> tensor<1xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [0] : (tensor<128xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } } diff --git a/test/ttmlir/Dialect/TTIR/reduce_ops/negative_invalid_dim_high.mlir b/test/ttmlir/Dialect/TTIR/reduce_ops/negative_invalid_dim_high.mlir new file mode 100644 index 0000000000..565745d057 --- /dev/null +++ b/test/ttmlir/Dialect/TTIR/reduce_ops/negative_invalid_dim_high.mlir @@ -0,0 +1,9 @@ +// RUN: not ttmlir-opt --split-input-file %s 2>&1 | FileCheck %s +// Negative tests for reduce ops + +// CHECK: error: 'ttir.sum' op Reduce dimensions are out of range +func.func public @test_reduce_add_invalid_dim_high(%arg0: tensor<128x10xf32>, %arg1: tensor<1xf32>) -> tensor<128xf32> { + %0 = tensor.empty() : tensor<128xf32> + %1 = "ttir.sum"(%arg0, %0) <{dim_arg = [2 : i32], keep_dim = false}> : (tensor<128x10xf32>, tensor<128xf32>) -> tensor<128xf32> + return %1 : tensor<128xf32> +} diff --git a/test/ttmlir/Dialect/TTIR/reduce_ops/negative_invalid_dim_low.mlir b/test/ttmlir/Dialect/TTIR/reduce_ops/negative_invalid_dim_low.mlir new file mode 100644 index 0000000000..bd4a237d46 --- /dev/null +++ b/test/ttmlir/Dialect/TTIR/reduce_ops/negative_invalid_dim_low.mlir @@ -0,0 +1,9 @@ +// RUN: not ttmlir-opt --split-input-file %s 2>&1 | FileCheck %s +// Negative tests for reduce ops + +// CHECK: error: 'ttir.sum' op Reduce dimensions are out of range +func.func public @test_reduce_add_invalid_dim_low(%arg0: tensor<128x10xf32>, %arg1: tensor<1xf32>) -> tensor<128xf32> { + %0 = tensor.empty() : tensor<128xf32> + %1 = "ttir.sum"(%arg0, %0) <{dim_arg = [-3 : i32], keep_dim = false}> : (tensor<128x10xf32>, tensor<128xf32>) -> tensor<128xf32> + return %1 : tensor<128xf32> +} diff --git a/test/ttmlir/Dialect/TTIR/reduce_ops/negative_repeating_dims.mlir b/test/ttmlir/Dialect/TTIR/reduce_ops/negative_repeating_dims.mlir new file mode 100644 index 0000000000..13649e1e65 --- /dev/null +++ b/test/ttmlir/Dialect/TTIR/reduce_ops/negative_repeating_dims.mlir @@ -0,0 +1,9 @@ +// RUN: not ttmlir-opt --split-input-file %s 2>&1 | FileCheck %s +// Negative tests for reduce ops + +// CHECK: error: 'ttir.sum' op Reduce dimensions are not unique +func.func public @test_reduce_add_repeating_dims(%arg0: tensor<128x10x32x4xf32>, %arg1: tensor<1xf32>) -> tensor<128xf32> { + %0 = tensor.empty() : tensor<128xf32> + %1 = "ttir.sum"(%arg0, %0) <{dim_arg = [1 : i32, 2 : i32, 3 : i32, 2 : i32], keep_dim = false}> : (tensor<128x10x32x4xf32>, tensor<128xf32>) -> tensor<128xf32> + return %1 : tensor<128xf32> +} diff --git a/test/ttmlir/Dialect/TTNN/reduction/max_op_negative.mlir b/test/ttmlir/Dialect/TTNN/reduction/max_op_negative.mlir new file mode 100644 index 0000000000..ac587303ed --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/reduction/max_op_negative.mlir @@ -0,0 +1,10 @@ +// RUN: not ttmlir-opt --split-input-file --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s 2>&1 | FileCheck %s +// Negative tests for Max op.
+module { + func.func @forward(%arg0: tensor<128x32x10x4xbf16>) -> tensor<128x1x1x1xbf16> { + %0 = tensor.empty() : tensor<128x1x1x1xbf16> + // CHECK: error: 'ttnn.max' op Reduce on more than two dimensions is not currently supported by TTNN + %1 = "ttir.max"(%arg0, %0) <{dim_arg = [1: i32, 2: i32, 3: i32], keep_dim = true}> : (tensor<128x32x10x4xbf16>, tensor<128x1x1x1xbf16>) -> tensor<128x1x1x1xbf16> + return %1 : tensor<128x1x1x1xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/reduction/mean_op_negative.mlir b/test/ttmlir/Dialect/TTNN/reduction/mean_op_negative.mlir new file mode 100644 index 0000000000..768b220bb1 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/reduction/mean_op_negative.mlir @@ -0,0 +1,10 @@ +// RUN: not ttmlir-opt --split-input-file --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s 2>&1 | FileCheck %s +// Negative tests for Mean op. +module { + func.func @forward(%arg0: tensor<128x32x10x4xbf16>) -> tensor<128x1x1x1xbf16> { + %0 = tensor.empty() : tensor<128x1x1x1xbf16> + // CHECK: error: 'ttnn.mean' op Reduce on more than two dimensions is not currently supported by TTNN + %1 = "ttir.mean"(%arg0, %0) <{dim_arg = [1: i32, 2: i32, 3: i32], keep_dim = true}> : (tensor<128x32x10x4xbf16>, tensor<128x1x1x1xbf16>) -> tensor<128x1x1x1xbf16> + return %1 : tensor<128x1x1x1xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/reduction/sum_op_negative.mlir b/test/ttmlir/Dialect/TTNN/reduction/sum_op_negative.mlir new file mode 100644 index 0000000000..c0c634f051 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/reduction/sum_op_negative.mlir @@ -0,0 +1,10 @@ +// RUN: not ttmlir-opt --split-input-file --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s 2>&1 | FileCheck %s +// Negative tests for Sum op. +module { + func.func @forward(%arg0: tensor<128x32x10x4xbf16>) -> tensor<128x1x1x1xbf16> { + %0 = tensor.empty() : tensor<128x1x1x1xbf16> + // CHECK: error: 'ttnn.sum' op Reduce on more than two dimensions is not currently supported by TTNN + %1 = "ttir.sum"(%arg0, %0) <{dim_arg = [1: i32, 2: i32, 3: i32], keep_dim = true}> : (tensor<128x32x10x4xbf16>, tensor<128x1x1x1xbf16>) -> tensor<128x1x1x1xbf16> + return %1 : tensor<128x1x1x1xbf16> + } +} diff --git a/test/ttmlir/Silicon/StableHLO/reduce_add_op.mlir b/test/ttmlir/Silicon/StableHLO/reduce_add_op.mlir index 9da138bbb8..89f51123e6 100644 --- a/test/ttmlir/Silicon/StableHLO/reduce_add_op.mlir +++ b/test/ttmlir/Silicon/StableHLO/reduce_add_op.mlir @@ -1,22 +1,108 @@ // REQUIRES: stablehlo // RUN: rm -rf %t.ttnn // RUN: rm -rf %t.mlir -// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | \ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" > %t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir // RUN: FileCheck --input-file=%t.mlir %s +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn // UNSUPPORTED: true -// error: keepdim=False is not supported +// These tests are currently failing until a fix for this issue is uplifted +// with a new version of Metal: https://github.com/tenstorrent/tt-metal/issues/16104 +// TODO(mrakita): Enable and edit these tests after the Metal issue is fixed.
+// Tracked by: https://github.com/tenstorrent/tt-mlir/issues/1640 module @jit_reduce_add attributes {} { - func.func public @test_reduce_add(%arg0: tensor<128x10xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { - // CHECK-LABEL: func.func public @test_reduce_add - // CHECK: ttnn.sum - // CHECK-SAME: dim_arg = [1 : i32], - // CHECK-SAME: keep_dim = false - // CHECK-SAME: tensor<128x10xf32 + func.func public @test_reduce_add_4to0dim(%arg0: tensor<128x10x32x4xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: "ttnn.sum" + // CHECK-NOT: dim_arg + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10x32x4xf32, + // CHECK-SAME: -> tensor<1x1x1x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [1 : i32] + // CHECK-SAME: tensor<1x1x1x1xf32, + // CHECK-SAME: -> tensor<1xf32, + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [0, 1, 2, 3] : (tensor<128x10x32x4xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } + + func.func public @test_reduce_add_3to2dim(%arg0: tensor<128x10x4xf32>, %cst_0: tensor<f32>) -> tensor<128x4xf32> { + // CHECK: "ttnn.sum" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10x4xf32, + // CHECK-SAME: -> tensor<128x1x4xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [128 : i32, 4 : i32] + // CHECK-SAME: tensor<128x1x4xf32, + // CHECK-SAME: -> tensor<128x4xf32, + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [1] : (tensor<128x10x4xf32>, tensor<f32>) -> tensor<128x4xf32> + return %0 : tensor<128x4xf32> + } + + func.func public @test_reduce_add_3to1dim(%arg0: tensor<128x10x4xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { + // CHECK: "ttnn.sum" + // CHECK-SAME: dim_arg = [1 : i32, 2 : i32] + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10x4xf32, + // CHECK-SAME: -> tensor<128x1x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [128 : i32] + // CHECK-SAME: tensor<128x1x1xf32, + // CHECK-SAME: -> tensor<128xf32, + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [1, 2] : (tensor<128x10x4xf32>, tensor<f32>) -> tensor<128xf32> + return %0 : tensor<128xf32> + } + + func.func public @test_reduce_add_3to0dim(%arg0: tensor<128x10x4xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: "ttnn.sum" + // CHECK-NOT: dim_arg + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10x4xf32, + // CHECK-SAME: -> tensor<1x1x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [1 : i32] + // CHECK-SAME: tensor<1x1x1xf32, + // CHECK-SAME: -> tensor<1xf32, + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [0, 1, 2] : (tensor<128x10x4xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } + + func.func public @test_reduce_add_2to1dim(%arg0: tensor<128x10xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { + // CHECK: "ttnn.sum" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10xf32, + // CHECK-SAME: -> tensor<128x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [128 : i32] + // CHECK-SAME: tensor<128x1xf32, // CHECK-SAME: -> tensor<128xf32, %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [1] : (tensor<128x10xf32>, tensor<f32>) -> tensor<128xf32> return %0 : tensor<128xf32> } + + func.func public @test_reduce_add_2to0dim(%arg0: tensor<128x10xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: "ttnn.sum" + // CHECK-NOT: dim_arg + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10xf32, + // CHECK-SAME: ->
tensor<1x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [1 : i32] + // CHECK-SAME: tensor<1x1xf32, + // CHECK-SAME: -> tensor<1xf32, + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [0, 1] : (tensor<128x10xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } + + func.func public @test_reduce_add_1to0dim(%arg0: tensor<128xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: "ttnn.sum" + // CHECK-NOT: dim_arg + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128xf32, + // CHECK-SAME: -> tensor<1xf32, + // CHECK-NOT: "ttnn.reshape" + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.add across dimensions = [0] : (tensor<128xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } } diff --git a/test/ttmlir/Silicon/StableHLO/reduce_maximum_op.mlir b/test/ttmlir/Silicon/StableHLO/reduce_maximum_op.mlir index 57318948e5..8ee57fd52f 100644 --- a/test/ttmlir/Silicon/StableHLO/reduce_maximum_op.mlir +++ b/test/ttmlir/Silicon/StableHLO/reduce_maximum_op.mlir @@ -1,22 +1,108 @@ // REQUIRES: stablehlo // RUN: rm -rf %t.ttnn // RUN: rm -rf %t.mlir -// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | \ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" > %t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir // RUN: FileCheck --input-file=%t.mlir %s +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn // UNSUPPORTED: true -// error: keepdim=False is not supported +// These tests are currently failing until a fix for this issue is uplifted +// with a new version of Metal: https://github.com/tenstorrent/tt-metal/issues/16104 +// TODO(mrakita): Enable and edit these tests after the Metal issue is fixed.
+// Tracked by: https://github.com/tenstorrent/tt-mlir/issues/1640 module @jit_reduce_maximum attributes {} { - func.func public @test_reduce_maximum(%arg0: tensor<128x10xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { - // CHECK-LABEL: func.func public @test_reduce_maximum - // CHECK: ttnn.max - // CHECK-SAME: dim_arg = [1 : i32], - // CHECK-SAME: keep_dim = false} + func.func public @test_reduce_maximum_4to0dim(%arg0: tensor<128x10x32x4xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: "ttnn.max" + // CHECK-NOT: dim_arg + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10x32x4xf32, + // CHECK-SAME: -> tensor<1x1x1x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [1 : i32] + // CHECK-SAME: tensor<1x1x1x1xf32, + // CHECK-SAME: -> tensor<1xf32, + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [0, 1, 2, 3] : (tensor<128x10x32x4xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } + + func.func public @test_reduce_maximum_3to2dim(%arg0: tensor<128x10x4xf32>, %cst_0: tensor<f32>) -> tensor<128x4xf32> { + // CHECK: "ttnn.max" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10x4xf32, + // CHECK-SAME: -> tensor<128x1x4xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [128 : i32, 4 : i32] + // CHECK-SAME: tensor<128x1x4xf32, + // CHECK-SAME: -> tensor<128x4xf32, + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [1] : (tensor<128x10x4xf32>, tensor<f32>) -> tensor<128x4xf32> + return %0 : tensor<128x4xf32> + } + + func.func public @test_reduce_maximum_3to1dim(%arg0: tensor<128x10x4xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { + // CHECK: "ttnn.max" + // CHECK-SAME: dim_arg = [1 : i32, 2 : i32] + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10x4xf32, + // CHECK-SAME: -> tensor<128x1x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [128 : i32] + // CHECK-SAME: tensor<128x1x1xf32, + // CHECK-SAME: -> tensor<128xf32, + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [1, 2] : (tensor<128x10x4xf32>, tensor<f32>) -> tensor<128xf32> + return %0 : tensor<128xf32> + } + + func.func public @test_reduce_maximum_3to0dim(%arg0: tensor<128x10x4xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: "ttnn.max" + // CHECK-NOT: dim_arg + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10x4xf32, + // CHECK-SAME: -> tensor<1x1x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [1 : i32] + // CHECK-SAME: tensor<1x1x1xf32, + // CHECK-SAME: -> tensor<1xf32, + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [0, 1, 2] : (tensor<128x10x4xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } + + func.func public @test_reduce_maximum_2to1dim(%arg0: tensor<128x10xf32>, %cst_0: tensor<f32>) -> tensor<128xf32> { + // CHECK: "ttnn.max" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = true // CHECK-SAME: tensor<128x10xf32, + // CHECK-SAME: -> tensor<128x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [128 : i32] + // CHECK-SAME: tensor<128x1xf32, + // CHECK-SAME: -> tensor<128xf32, %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [1] : (tensor<128x10xf32>, tensor<f32>) -> tensor<128xf32> return %0 : tensor<128xf32> } + + func.func public @test_reduce_maximum_2to0dim(%arg0: tensor<128x10xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: "ttnn.max" + // CHECK-NOT: dim_arg + // CHECK-SAME: keep_dim = true + //
CHECK-SAME: tensor<128x10xf32, + // CHECK-SAME: -> tensor<1x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [1 : i32] + // CHECK-SAME: tensor<1x1xf32, + // CHECK-SAME: -> tensor<1xf32, + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [0, 1] : (tensor<128x10xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } + + func.func public @test_reduce_maximum_1to0dim(%arg0: tensor<128xf32>, %cst_0: tensor<f32>) -> tensor<f32> { + // CHECK: "ttnn.max" + // CHECK-NOT: dim_arg + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128xf32, + // CHECK-SAME: -> tensor<1xf32, + // CHECK-NOT: "ttnn.reshape" + %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.maximum across dimensions = [0] : (tensor<128xf32>, tensor<f32>) -> tensor<f32> + return %0 : tensor<f32> + } } diff --git a/test/ttmlir/Silicon/TTNN/simple_max.mlir b/test/ttmlir/Silicon/TTNN/simple_max.mlir new file mode 100644 index 0000000000..8ec3bdc591 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/simple_max.mlir @@ -0,0 +1,39 @@ +// RUN: rm -rf %t.ttnn +// RUN: rm -rf %t.mlir +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// UNSUPPORTED: true +// These tests are currently failing until a fix for this issue is uplifted +// with a new version of Metal: https://github.com/tenstorrent/tt-metal/issues/16104 +// TODO(mrakita): Enable and edit these tests after the Metal issue is fixed. +// Tracked by: https://github.com/tenstorrent/tt-mlir/issues/1640 + +module { + func.func public @reduce_not_keep_dim(%arg0: tensor<128x10xf32>) -> tensor<128xf32> { + %0 = tensor.empty() : tensor<128xf32> + // CHECK: "ttnn.max" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10xf32, + // CHECK-SAME: -> tensor<128x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [128 : i32] + // CHECK-SAME: tensor<128x1xf32, + // CHECK-SAME: -> tensor<128xf32, + %1 = "ttir.max"(%arg0, %0) <{dim_arg = [1 : i32], keep_dim = false}> : (tensor<128x10xf32>, tensor<128xf32>) -> tensor<128xf32> + return %1 : tensor<128xf32> + } + + func.func public @reduce_keep_dim(%arg0: tensor<128x10xf32>) -> tensor<128x1xf32> { + %0 = tensor.empty() : tensor<128x1xf32> + // CHECK: "ttnn.max" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10xf32, + // CHECK-SAME: -> tensor<128x1xf32, + // CHECK-NOT: "ttnn.reshape" + %1 = "ttir.max"(%arg0, %0) <{dim_arg = [1 : i32], keep_dim = true}> : (tensor<128x10xf32>, tensor<128x1xf32>) -> tensor<128x1xf32> + return %1 : tensor<128x1xf32> + } +} diff --git a/test/ttmlir/Silicon/TTNN/simple_mean.mlir b/test/ttmlir/Silicon/TTNN/simple_mean.mlir index 0a3250936b..476dcd9aba 100644 --- a/test/ttmlir/Silicon/TTNN/simple_mean.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_mean.mlir @@ -1,12 +1,39 @@ +// RUN: rm -rf %t.ttnn +// RUN: rm -rf %t.mlir // RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn // UNSUPPORTED: true +// These tests are currently failing until a fix for this issue is uplifted +// with a new version of Metal: https://github.com/tenstorrent/tt-metal/issues/16104 +// TODO(mrakita): Enable and edit these tests after the Metal issue is fixed.
+// Tracked by: https://github.com/tenstorrent/tt-mlir/issues/1640 + module { - func.func @forward(%arg0: tensor<512x1024xbf16>) -> tensor<512x32xbf16> { - %0 = tensor.empty() : tensor<512x32xbf16> - // CHECK: %[[C:.*]] = "ttnn.mean"[[C:.*]] - %1 = "ttir.mean"(%arg0, %0) <{dim_arg = [-1: i32], keep_dim = true}> : (tensor<512x1024xbf16>, tensor<512x32xbf16>) -> tensor<512x32xbf16> - return %1 : tensor<512x32xbf16> + func.func public @reduce_not_keep_dim(%arg0: tensor<128x10xf32>) -> tensor<128xf32> { + %0 = tensor.empty() : tensor<128xf32> + // CHECK: "ttnn.mean" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10xf32, + // CHECK-SAME: -> tensor<128x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [128 : i32] + // CHECK-SAME: tensor<128x1xf32, + // CHECK-SAME: -> tensor<128xf32, + %1 = "ttir.mean"(%arg0, %0) <{dim_arg = [1 : i32], keep_dim = false}> : (tensor<128x10xf32>, tensor<128xf32>) -> tensor<128xf32> + return %1 : tensor<128xf32> + } + + func.func public @reduce_keep_dim(%arg0: tensor<128x10xf32>) -> tensor<128x1xf32> { + %0 = tensor.empty() : tensor<128x1xf32> + // CHECK: "ttnn.mean" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10xf32, + // CHECK-SAME: -> tensor<128x1xf32, + // CHECK-NOT: "ttnn.reshape" + %1 = "ttir.mean"(%arg0, %0) <{dim_arg = [1 : i32], keep_dim = true}> : (tensor<128x10xf32>, tensor<128x1xf32>) -> tensor<128x1xf32> + return %1 : tensor<128x1xf32> } } diff --git a/test/ttmlir/Silicon/TTNN/simple_sum.mlir b/test/ttmlir/Silicon/TTNN/simple_sum.mlir new file mode 100644 index 0000000000..cb1904a34e --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/simple_sum.mlir @@ -0,0 +1,39 @@ +// RUN: rm -rf %t.ttnn +// RUN: rm -rf %t.mlir +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// UNSUPPORTED: true +// These tests are currently failing until a fix for this issue is uplifted +// with a new version of Metal: https://github.com/tenstorrent/tt-metal/issues/16104 +// TODO(mrakita): Enable and edit these tests after the Metal issue is fixed. +// Tracked by: https://github.com/tenstorrent/tt-mlir/issues/1640 + +module { + func.func public @reduce_not_keep_dim(%arg0: tensor<128x10xf32>) -> tensor<128xf32> { + %0 = tensor.empty() : tensor<128xf32> + // CHECK: "ttnn.sum" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10xf32, + // CHECK-SAME: -> tensor<128x1xf32, + // CHECK: "ttnn.reshape" + // CHECK-SAME: shape = [128 : i32] + // CHECK-SAME: tensor<128x1xf32, + // CHECK-SAME: -> tensor<128xf32, + %1 = "ttir.sum"(%arg0, %0) <{dim_arg = [1 : i32], keep_dim = false}> : (tensor<128x10xf32>, tensor<128xf32>) -> tensor<128xf32> + return %1 : tensor<128xf32> + } + + func.func public @reduce_keep_dim(%arg0: tensor<128x10xf32>) -> tensor<128x1xf32> { + %0 = tensor.empty() : tensor<128x1xf32> + // CHECK: "ttnn.sum" + // CHECK-SAME: dim_arg = [1 : i32] + // CHECK-SAME: keep_dim = true + // CHECK-SAME: tensor<128x10xf32, + // CHECK-SAME: -> tensor<128x1xf32, + // CHECK-NOT: "ttnn.reshape" + %1 = "ttir.sum"(%arg0, %0) <{dim_arg = [1 : i32], keep_dim = true}> : (tensor<128x10xf32>, tensor<128x1xf32>) -> tensor<128x1xf32> + return %1 : tensor<128x1xf32> + } +}