diff --git a/mlir/docs/Dialects/MemRef.md b/mlir/docs/Dialects/MemRef.md
index 2e2adaf7afa954..1d926fc3852396 100644
--- a/mlir/docs/Dialects/MemRef.md
+++ b/mlir/docs/Dialects/MemRef.md
@@ -11,66 +11,3 @@ before adding or changing any operation in this dialect.**
 
 [include "Dialects/MemRefOps.md"]
 
-### 'dma_start' operation
-
-Syntax:
-
-```
-operation ::= `memref.dma_start` ssa-use`[`ssa-use-list`]` `,`
-               ssa-use`[`ssa-use-list`]` `,` ssa-use `,`
-               ssa-use`[`ssa-use-list`]` (`,` ssa-use `,` ssa-use)?
-              `:` memref-type `,` memref-type `,` memref-type
-```
-
-Starts a non-blocking DMA operation that transfers data from a source memref to
-a destination memref. The operands include the source and destination memref's
-each followed by its indices, size of the data transfer in terms of the number
-of elements (of the elemental type of the memref), a tag memref with its
-indices, and optionally two additional arguments corresponding to the stride (in
-terms of number of elements) and the number of elements to transfer per stride.
-The tag location is used by a dma_wait operation to check for completion. The
-indices of the source memref, destination memref, and the tag memref have the
-same restrictions as any load/store operation in an affine context (whenever DMA
-operations appear in an affine context). See
-[restrictions on dimensions and symbols](Affine.md/#restrictions-on-dimensions-and-symbols)
-in affine contexts. This allows powerful static analysis and transformations in
-the presence of such DMAs including rescheduling, pipelining / overlap with
-computation, and checking for matching start/end operations. The source and
-destination memref need not be of the same dimensionality, but need to have the
-same elemental type.
-
-For example, a `memref.dma_start` operation that transfers 32 vector elements
-from a memref `%src` at location `[%i, %j]` to memref `%dst` at `[%k, %l]` would
-be specified as shown below.
-
-Example:
-
-```mlir
-%size = arith.constant 32 : index
-%tag = memref.alloc() : memref<1 x i32, affine_map<(d0) -> (d0)>, 4>
-%idx = arith.constant 0 : index
-memref.dma_start %src[%i, %j], %dst[%k, %l], %size, %tag[%idx] :
-     memref<40 x 8 x vector<16xf32>, affine_map<(d0, d1) -> (d0, d1)>, 0>,
-     memref<2 x 4 x vector<16xf32>, affine_map<(d0, d1) -> (d0, d1)>, 2>,
-     memref<1 x i32>, affine_map<(d0) -> (d0)>, 4>
-```
-
-### 'dma_wait' operation
-
-Syntax:
-
-```
-operation ::= `memref.dma_wait` ssa-use`[`ssa-use-list`]` `,` ssa-use `:` memref-type
-```
-
-Blocks until the completion of a DMA operation associated with the tag element
-specified with a tag memref and its indices. The operands include the tag memref
-followed by its indices and the number of elements associated with the DMA being
-waited on. The indices of the tag memref have the same restrictions as
-load/store indices.
-
-Example:
-
-```mlir
-memref.dma_wait %tag[%idx], %size : memref<1 x i32, affine_map<(d0) -> (d0)>, 4>
-```
diff --git a/mlir/docs/Dialects/Transform.md b/mlir/docs/Dialects/Transform.md
index 0a59d4b32c7eb6..a24099c5ba9c9c 100644
--- a/mlir/docs/Dialects/Transform.md
+++ b/mlir/docs/Dialects/Transform.md
@@ -423,6 +423,10 @@ ops rather than having the methods directly act on the payload IR.
 
 [include "Dialects/BufferizationTransformOps.md"]
 
+## Func Transform Operations
+
+[include "Dialects/FuncTransformOps.md"]
+
 ## GPU Transform Operations
 
 [include "Dialects/GPUTransformOps.md"]
diff --git a/mlir/docs/PassManagement.md b/mlir/docs/PassManagement.md
index 9a7cfd1f9bebc3..95a38207b7f854 100644
--- a/mlir/docs/PassManagement.md
+++ b/mlir/docs/PassManagement.md
@@ -458,7 +458,7 @@ program has been run through the passes. This provides several benefits:
 In some situations it may be useful to run a pass pipeline within another pass,
 to allow configuring or filtering based on some invariants of the current
 operation being operated on. For example, the
-[Inliner Pass](Passes.md/#-inline-inline-function-calls) may want to run
+[Inliner Pass](Passes.md/#-inline) may want to run
 intraprocedural simplification passes while it is inlining to produce a better
 cost model, and provide more optimal inlining. To enable this, passes may run an
 arbitrary `OpPassManager` on the current operation being operated on or any
diff --git a/mlir/docs/Rationale/RationaleLinalgDialect.md b/mlir/docs/Rationale/RationaleLinalgDialect.md
index 237a8e3b6bda6c..7b5137ede3ae74 100644
--- a/mlir/docs/Rationale/RationaleLinalgDialect.md
+++ b/mlir/docs/Rationale/RationaleLinalgDialect.md
@@ -384,7 +384,7 @@ Affine dialects in particular, Linalg takes the following decisions.
   multi-for loops with induction variables independent of each other (referred
   to as hyper-rectangular iteration domains in the literature) such as the
   proposed
-  [affine.parallel]((https://llvm.discourse.group/t/rfc-add-affine-parallel/350)
+  [affine.parallel](https://llvm.discourse.group/t/rfc-add-affine-parallel/350)
   are sufficient in the majority of cases.
 - **Declarative Tiling**: the *tiling* transformation is ubiquitous in HPC code
   generation. It can be seen as a decomposition of either the iteration space or
diff --git a/mlir/docs/Traits/_index.md b/mlir/docs/Traits/_index.md
index 6a9c650aca96b7..3fa24ec77107f0 100644
--- a/mlir/docs/Traits/_index.md
+++ b/mlir/docs/Traits/_index.md
@@ -40,7 +40,7 @@ Operation traits may also provide a `verifyTrait` or `verifyRegionTrait` hook
 that is called when verifying the concrete operation. The difference between
 these two is that whether the verifier needs to access the regions, if so, the
 operations in the regions will be verified before the verification of this
-trait. The [verification order](DefiningDialects/Operations.md/#verification-ordering)
+trait. The [verification order](../DefiningDialects/Operations.md/#verification-ordering)
 determines when a verifier will be invoked.
 
 ```c++
@@ -155,7 +155,7 @@ class MyType : public Type::TypeBase<MyType, ..., MyTrait, MyParametricTrait<10>
 
 ### Attaching Operation Traits in ODS
 
-To use an operation trait in the [ODS](DefiningDialects/Operations.md) framework, we need to
+To use an operation trait in the [ODS](../DefiningDialects/Operations.md) framework, we need to
 provide a definition of the trait class. This can be done using the
 `NativeOpTrait` and `ParamNativeOpTrait` classes. `ParamNativeOpTrait` provides
 a mechanism in which to specify arguments to a parametric trait class with an
@@ -177,7 +177,7 @@ These can then be used in the `traits` list of an op definition:
 def OpWithInferTypeInterfaceOp : Op<...[MyTrait, MyParametricTrait<10>]> { ... }
 ```
 
-See the documentation on [operation definitions](DefiningDialects/Operations.md) for more
+See the documentation on [operation definitions](../DefiningDialects/Operations.md) for more
 details.
 
 ## Using a Trait
@@ -186,7 +186,7 @@ Traits may be used to provide additional methods, static fields, or other
 information directly on the concrete object. `Traits` internally become `Base`
 classes of the concrete operation, so all of these are directly accessible. To
 expose this information opaquely to transformations and analyses,
-[`interfaces`](Interfaces.md) may be used.
+[`interfaces`](../Interfaces.md) may be used.
 
 To query if a specific object contains a specific trait, the `hasTrait<>` method
 may be used. This takes as a template parameter the trait class, which is the
@@ -231,7 +231,7 @@ This trait is carried by region holding operations that define a new scope for
 automatic allocation. Such allocations are automatically freed when control is
 transferred back from the regions of such operations. As an example, allocations
 performed by
-[`memref.alloca`](Dialects/MemRef.md/#memrefalloca-mlirmemrefallocaop) are
+[`memref.alloca`](../Dialects/MemRef.md/#memrefalloca-memrefallocaop) are
 automatically freed when control leaves the region of its closest surrounding op
 that has the trait AutomaticAllocationScope.
 
@@ -241,7 +241,8 @@ that has the trait AutomaticAllocationScope.
 
 This trait adds the property that the operation is known to have
 [broadcast-compatible](https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-operands and that its result type is compatible with the inferred broadcast shape. See [The `Broadcastable` Trait](Traits/Broadcastable.md) for details.
+operands and that its result type is compatible with the inferred broadcast shape.
+See [The `Broadcastable` Trait](Broadcastable.md) for details.
 
 ### Commutative
 
@@ -290,7 +291,7 @@ foo.region_op {
 ```
 
 This trait is an important structural property of the IR, and enables operations
-to have [passes](PassManagement.md) scheduled under them.
+to have [passes](../PassManagement.md) scheduled under them.
 
 ### MemRefsNormalizable
 
@@ -302,8 +303,7 @@ type where those references can be 'normalized'. In cases where an associated
 operations can be modified so that the `MemRef` has an identity layout
 specification. This can be implemented by associating the operation with its own
 index expression that can express the equivalent of the memory-layout
-specification of the MemRef type. See [the -normalize-memrefs pass].
-(https://mlir.llvm.org/docs/Passes/#-normalize-memrefs-normalize-memrefs)
+specification of the MemRef type. See [the -normalize-memrefs pass](../Passes.md/#-normalize-memrefs).
 
 ### Single Block Region
 
@@ -325,18 +325,18 @@ that the single block must terminate with `TerminatorOpType`.
 *   `OpTrait::SymbolTable` -- `SymbolTable`
 
 This trait is used for operations that define a
-[`SymbolTable`](SymbolsAndSymbolTables.md#symbol-table).
+[`SymbolTable`](../SymbolsAndSymbolTables.md/#symbol-table).
 
 ### Terminator
 
 *   `OpTrait::IsTerminator` -- `Terminator`
 
 This trait provides verification and functionality for operations that are known
-to be [terminators](LangRef.md#terminator-operations).
+to be [terminators](../LangRef.md/#control-flow-and-ssacfg-regions).
 
 *   `OpTrait::NoTerminator` -- `NoTerminator`
 
 This trait removes the requirement on regions held by an operation to have
-[terminator operations](LangRef.md#terminator-operations) at the end of a block.
+[terminator operations](../LangRef.md/#control-flow-and-ssacfg-regions) at the end of a block.
 This requires that these regions have a single block. An example of operation
 using this trait is the top-level `ModuleOp`.
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/MemRef/IR/CMakeLists.txt
index be40d7bca66693..b7b12d49f9224d 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/MemRef/IR/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_mlir_dialect(MemRefOps memref)
-add_mlir_doc(MemRefOps MemRefOps Dialects/ -gen-dialect-doc -dialect=memref)
+add_mlir_doc(MemRefOps MemRefOps Dialects/ -gen-op-doc)
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index 6b0ccbe37e89e9..8fa41f4e4b659f 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -646,6 +646,15 @@ def MemRef_DimOp : MemRef_Op<"dim", [
 def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
   let summary = "non-blocking DMA operation that starts a transfer";
   let description = [{
+    Syntax:
+
+    ```
+    operation ::= `memref.dma_start` ssa-use`[`ssa-use-list`]` `,`
+                   ssa-use`[`ssa-use-list`]` `,` ssa-use `,`
+                   ssa-use`[`ssa-use-list`]` (`,` ssa-use `,` ssa-use)?
+                  `:` memref-type `,` memref-type `,` memref-type
+    ```
+
     DmaStartOp starts a non-blocking DMA operation that transfers data from a
     source memref to a destination memref. The source and destination memref
     need not be of the same dimensionality, but need to have the same elemental
@@ -686,9 +695,9 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
               %num_elt_per_stride :
     ```
 
-    TODO: add additional operands to allow source and destination striding, and
+    * TODO: add additional operands to allow source and destination striding, and
     multiple stride levels.
-    TODO: Consider replacing src/dst memref indices with view memrefs.
+    * TODO: Consider replacing src/dst memref indices with view memrefs.
   }];
   let arguments = (ins Variadic<AnyType>:$operands);
 
@@ -839,8 +848,7 @@ def MemRef_DmaWaitOp : MemRef_Op<"dma_wait"> {
                        Variadic<Index>:$tagIndices,
                        Index:$numElements);
   let assemblyFormat = [{
-    $tagMemRef `[` $tagIndices `]` `,` $numElements attr-dict `:`
-    type($tagMemRef)
+    $tagMemRef `[` $tagIndices `]` `,` $numElements attr-dict `:` type($tagMemRef)
   }];
   let extraClassDeclaration = [{
     /// Returns the rank (number of indices) of the tag memref.