From 10447f847c4a325ff0127c308ea9a422a553d832 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ZHU=20Yuhao=20=E6=9C=B1=E5=AE=87=E6=B5=A9?=
 <dr.yuhao.zhu@outlook.com>
Date: Thu, 13 Feb 2025 22:54:46 +0100
Subject: [PATCH 01/11] Add iterator over any axis

---
 numojo/core/ndarray.mojo           | 147 +++++++++++++++++++++++++++++
 tests/core/test_array_methods.mojo |  26 ++++-
 2 files changed, 171 insertions(+), 2 deletions(-)

diff --git a/numojo/core/ndarray.mojo b/numojo/core/ndarray.mojo
index 1adb505a..8c158f6e 100644
--- a/numojo/core/ndarray.mojo
+++ b/numojo/core/ndarray.mojo
@@ -3995,10 +3995,157 @@ struct _NDArrayIter[
             return self.index
 
 
+@value
+struct _NDAxisIter[
+    is_mutable: Bool, //,
+    origin: Origin[is_mutable],
+    dtype: DType,
+    forward: Bool = True,
+]():
+    """
+    An iterator yielding 1-d array according to the axis.
+
+    It can be used when a function reduces the dimension of the array by axis.
+
+    Parameters:
+        is_mutable: Whether the iterator is mutable.
+        origin: The lifetime of the underlying NDArray data.
+        dtype: The data type of the item.
+        forward: The iteration direction. `False` is backwards.
+
+    Example:
+    ```
+    [[[ 0,  1,  2,  3],
+      [ 4,  5,  6,  7],
+      [ 8,  9, 10, 11]],
+     [[12, 13, 14, 15],
+      [16, 17, 18, 19],
+      [20, 21, 22, 23]]]
+    ```
+    The above array is of shape (2,3,3). Itering by `axis=0` returns:
+    ```
+    [0, 12], [1, 13], [2, 14], [3, 15]
+    [4, 16], [5, 17], [6, 18], [7, 19]
+    [8, 20], [9, 21], [10, 22], [11, 23]
+    ```
+    Itering by `axis=1` returns:
+    ```
+    [0, 4, 8], [1, 5, 9], [2, 6, 10], [3, 7, 11]
+    [12, 16, 20], [13, 17, 21], [14, 18, 22], [15, 19, 23]
+    ```
+    """
+
+    var ptr: UnsafePointer[Scalar[dtype]]
+    var axis: Int
+    var length: Int
+    var ndim: Int
+    var shape: NDArrayShape
+    var strides: NDArrayStrides
+    """Strides of array or view. It is not necessarily compatible with shape."""
+    var strides_by_axis: NDArrayStrides
+    """Strides by axis according to shape of view."""
+    var offset: Int
+    var size_of_res: Int
+    """Size of the result 1-d array."""
+
+    fn __init__(
+        out self,
+        ptr: UnsafePointer[Scalar[dtype]],
+        axis: Int,
+        length: Int,
+        ndim: Int,
+        shape: NDArrayShape,
+        strides: NDArrayStrides,
+    ) raises:
+        """
+        Initialize the iterator.
+
+        Args:
+            ptr: Pointer to the data buffer.
+            axis: Axis.
+            length: Length of the axis.
+            ndim: Number of dimensions.
+            shape: Shape of the array.
+            strides: Strides of array or view. It is not necessarily compatible with shape.
+        """
+        if axis < 0 or axis >= ndim:
+            raise Error("Axis must be in the range of [0, ndim).")
+
+        self.size_of_res = shape[axis]
+        self.offset = 0 if forward else length - self.size_of_res
+        self.ptr = ptr
+        self.axis = axis
+        self.length = length
+        self.ndim = ndim
+        self.shape = shape
+        self.strides = strides
+        self.strides_by_axis = NDArrayStrides(ndim=self.ndim, initialized=False)
+        var temp = 1
+        (self.strides_by_axis._buf + axis).init_pointee_copy(temp)
+        temp *= shape[axis]
+        for i in range(self.ndim - 1, -1, -1):
+            if i != axis:
+                (self.strides_by_axis._buf + i).init_pointee_copy(temp)
+                temp *= shape[i]
+
+    fn __has_next__(self) -> Bool:
+        @parameter
+        if forward:
+            return self.offset < self.length
+        else:
+            return self.offset > 0 - self.size_of_res
+
+    fn __iter__(self) -> Self:
+        return self
+
+    fn __len__(self) -> Int:
+        @parameter
+        if forward:
+            return (self.length - self.offset) // self.size_of_res
+        else:
+            return self.offset // self.size_of_res + 1
+
+    fn __next__(mut self) raises -> NDArray[dtype]:
+        var res = NDArray[dtype](Shape(self.size_of_res))
+        var current_offset = self.offset
+
+        @parameter
+        if forward:
+            self.offset += self.size_of_res
+        else:
+            self.offset -= self.size_of_res
+
+        var remainder = current_offset
+        var item = Item(ndim=self.ndim, initialized=True)
+
+        for i in range(self.axis):
+            item[i], remainder = divmod(remainder, self.strides_by_axis[i])
+
+        for i in range(self.axis + 1, self.ndim):
+            item[i], remainder = divmod(remainder, self.strides_by_axis[i])
+
+        item[self.axis], remainder = divmod(
+            remainder, self.strides_by_axis[self.axis]
+        )
+
+        for j in range(self.size_of_res):
+            (res._buf.ptr + j).init_pointee_copy(
+                self.ptr[_get_offset(item, self.strides)]
+            )
+            item[self.axis] += 1
+
+        return res^
+
+
 @value
 struct _NDIter[
     is_mutable: Bool, //, origin: Origin[is_mutable], dtype: DType
 ]():
+    # TODO: Combine into `_NDAxisIter` with `axis=ndim-1`.
+    """
+    An iterator yielding the array elements according to the order.
+    """
+
     var ptr: UnsafePointer[Scalar[dtype]]
     var length: Int
     var ndim: Int
diff --git a/tests/core/test_array_methods.mojo b/tests/core/test_array_methods.mojo
index 6bc42307..361cfb45 100644
--- a/tests/core/test_array_methods.mojo
+++ b/tests/core/test_array_methods.mojo
@@ -1,5 +1,4 @@
-import numojo as nm
-from numojo import *
+from numojo.prelude import *
 from testing.testing import assert_true, assert_almost_equal, assert_equal
 from utils_for_test import check, check_is_close
 
@@ -58,3 +57,26 @@ def test_constructors():
         arr5.shape[2] == 5,
         "NDArray constructor with NDArrayShape: shape element 2",
     )
+
+
+def test_iterator():
+    var a = nm.arange[i8](24).reshape(Shape(2, 3, 4))
+    var a_iter = nm.core.ndarray._NDAxisIter[
+        origin = __origin_of(a), dtype=i8, forward=False
+    ](
+        ptr=a._buf.ptr,
+        axis=0,
+        length=a.size,
+        ndim=a.ndim,
+        shape=a.shape,
+        strides=a.strides,
+    )
+    var b = a_iter.__next__() == nm.array[i8]("[11, 23]")
+    assert_true(
+        b.item(0) == True,
+        "`_NDAxisIter` breaks",
+    )
+    assert_true(
+        b.item(1) == True,
+        "`_NDAxisIter` breaks",
+    )

From cf76a1c888f6cef2afdb50b4687ebe0dc6ce1690 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ZHU=20Yuhao=20=E6=9C=B1=E5=AE=87=E6=B5=A9?=
 <dr.yuhao.zhu@outlook.com>
Date: Thu, 13 Feb 2025 23:17:31 +0100
Subject: [PATCH 02/11] Add function `iter_by_axis`

---
 numojo/core/ndarray.mojo           | 120 ++++++++++++++++++++++++++---
 tests/core/test_array_methods.mojo |  11 +--
 2 files changed, 110 insertions(+), 21 deletions(-)

diff --git a/numojo/core/ndarray.mojo b/numojo/core/ndarray.mojo
index 8c158f6e..f1ae8426 100644
--- a/numojo/core/ndarray.mojo
+++ b/numojo/core/ndarray.mojo
@@ -3458,6 +3458,104 @@ struct NDArray[dtype: DType = DType.float64](
                     )
             self._buf.ptr.store(_get_offset(indices, self.strides), item)
 
+    fn iter_by_axis[
+        forward: Bool = True
+    ](self, axis: Int) raises -> _NDAxisIter[__origin_of(self), dtype, forward]:
+        """
+        Returns an iterator yielding 1-d array by axis.
+
+        Parameters:
+            forward: If True, iterate from the beginning to the end.
+                If False, iterate from the end to the beginning.
+
+        Args:
+            axis: Axis by which the iteration is performed.
+
+        Returns:
+            An iterator yielding 1-d array by axis.
+
+        Example:
+        ```mojo
+        from numojo.prelude import *
+        var a = nm.arange[i8](24).reshape(Shape(2, 3, 4))
+        print(a)
+        for i in a.iter_by_axis(axis=0):
+            print(String(i))
+        ```
+
+        This prints:
+
+        ```console
+        [[[ 0  1  2  3]
+          [ 4  5  6  7]
+          [ 8  9 10 11]]
+         [[12 13 14 15]
+          [16 17 18 19]
+          [20 21 22 23]]]
+        3D-array  Shape(2,3,4)  Strides(12,4,1)  DType: i8  C-cont: True  F-cont: False  own data: True
+        [ 0 12]
+        [ 1 13]
+        [ 2 14]
+        [ 3 15]
+        [ 4 16]
+        [ 5 17]
+        [ 6 18]
+        [ 7 19]
+        [ 8 20]
+        [ 9 21]
+        [10 22]
+        [11 23]
+        ```
+
+        Another example:
+
+        ```mojo
+        from numojo.prelude import *
+        var a = nm.arange[i8](24).reshape(Shape(2, 3, 4))
+        print(a)
+        for i in a.iter_by_axis(axis=2):
+            print(String(i))
+        ```
+
+        This prints:
+
+        ```console
+        [[[ 0  1  2  3]
+          [ 4  5  6  7]
+          [ 8  9 10 11]]
+         [[12 13 14 15]
+          [16 17 18 19]
+          [20 21 22 23]]]
+        3D-array  Shape(2,3,4)  Strides(12,4,1)  DType: i8  C-cont: True  F-cont: False  own data: True
+        [0 1 2 3]
+        [4 5 6 7]
+        [ 8  9 10 11]
+        [12 13 14 15]
+        [16 17 18 19]
+        [20 21 22 23]
+        ```.
+        """
+
+        var normalized_axis: Int = axis
+        if normalized_axis < 0:
+            normalized_axis += self.ndim
+        if (normalized_axis >= self.ndim) or (normalized_axis < 0):
+            raise Error(
+                String(
+                    "\nError in `NDArray.iter_by_axis()`: "
+                    "Axis ({}) is not in valid range [{}, {})."
+                ).format(axis, -self.ndim, self.ndim)
+            )
+
+        return _NDAxisIter[__origin_of(self), dtype, forward](
+            ptr=self._buf.ptr,
+            axis=normalized_axis,
+            size=self.size,
+            ndim=self.ndim,
+            shape=self.shape,
+            strides=self.strides,
+        )
+
     fn max(self, axis: Int = 0) raises -> Self:
         """
         Max on axis.
@@ -4003,7 +4101,7 @@ struct _NDAxisIter[
     forward: Bool = True,
 ]():
     """
-    An iterator yielding 1-d array according to the axis.
+    An iterator yielding 1-d array by axis.
 
     It can be used when a function reduces the dimension of the array by axis.
 
@@ -4024,20 +4122,20 @@ struct _NDAxisIter[
     ```
     The above array is of shape (2,3,3). Itering by `axis=0` returns:
     ```
-    [0, 12], [1, 13], [2, 14], [3, 15]
-    [4, 16], [5, 17], [6, 18], [7, 19]
+    [0, 12], [1, 13], [2, 14], [3, 15],
+    [4, 16], [5, 17], [6, 18], [7, 19],
     [8, 20], [9, 21], [10, 22], [11, 23]
     ```
     Itering by `axis=1` returns:
     ```
-    [0, 4, 8], [1, 5, 9], [2, 6, 10], [3, 7, 11]
+    [0, 4, 8], [1, 5, 9], [2, 6, 10], [3, 7, 11],
     [12, 16, 20], [13, 17, 21], [14, 18, 22], [15, 19, 23]
     ```
     """
 
     var ptr: UnsafePointer[Scalar[dtype]]
     var axis: Int
-    var length: Int
+    var size: Int
     var ndim: Int
     var shape: NDArrayShape
     var strides: NDArrayStrides
@@ -4052,7 +4150,7 @@ struct _NDAxisIter[
         out self,
         ptr: UnsafePointer[Scalar[dtype]],
         axis: Int,
-        length: Int,
+        size: Int,
         ndim: Int,
         shape: NDArrayShape,
         strides: NDArrayStrides,
@@ -4063,7 +4161,7 @@ struct _NDAxisIter[
         Args:
             ptr: Pointer to the data buffer.
             axis: Axis.
-            length: Length of the axis.
+            size: Size of the axis.
             ndim: Number of dimensions.
             shape: Shape of the array.
             strides: Strides of array or view. It is not necessarily compatible with shape.
@@ -4072,10 +4170,10 @@ struct _NDAxisIter[
             raise Error("Axis must be in the range of [0, ndim).")
 
         self.size_of_res = shape[axis]
-        self.offset = 0 if forward else length - self.size_of_res
+        self.offset = 0 if forward else size - self.size_of_res
         self.ptr = ptr
         self.axis = axis
-        self.length = length
+        self.size = size
         self.ndim = ndim
         self.shape = shape
         self.strides = strides
@@ -4091,7 +4189,7 @@ struct _NDAxisIter[
     fn __has_next__(self) -> Bool:
         @parameter
         if forward:
-            return self.offset < self.length
+            return self.offset < self.size
         else:
             return self.offset > 0 - self.size_of_res
 
@@ -4101,7 +4199,7 @@ struct _NDAxisIter[
     fn __len__(self) -> Int:
         @parameter
         if forward:
-            return (self.length - self.offset) // self.size_of_res
+            return (self.size - self.offset) // self.size_of_res
         else:
             return self.offset // self.size_of_res + 1
 
diff --git a/tests/core/test_array_methods.mojo b/tests/core/test_array_methods.mojo
index 361cfb45..3848617a 100644
--- a/tests/core/test_array_methods.mojo
+++ b/tests/core/test_array_methods.mojo
@@ -61,16 +61,7 @@ def test_constructors():
 
 def test_iterator():
     var a = nm.arange[i8](24).reshape(Shape(2, 3, 4))
-    var a_iter = nm.core.ndarray._NDAxisIter[
-        origin = __origin_of(a), dtype=i8, forward=False
-    ](
-        ptr=a._buf.ptr,
-        axis=0,
-        length=a.size,
-        ndim=a.ndim,
-        shape=a.shape,
-        strides=a.strides,
-    )
+    var a_iter = a.iter_by_axis[forward=False](axis=0)
     var b = a_iter.__next__() == nm.array[i8]("[11, 23]")
     assert_true(
         b.item(0) == True,

From b7f786799c1713dd7c8fd299e200c7ad7f990ac2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ZHU=20Yuhao=20=E6=9C=B1=E5=AE=87=E6=B5=A9?=
 <dr.yuhao.zhu@outlook.com>
Date: Fri, 14 Feb 2025 00:02:10 +0100
Subject: [PATCH 03/11] Add TODO

---
 numojo/core/ndarray.mojo | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/numojo/core/ndarray.mojo b/numojo/core/ndarray.mojo
index f1ae8426..b4c73d10 100644
--- a/numojo/core/ndarray.mojo
+++ b/numojo/core/ndarray.mojo
@@ -4100,8 +4100,15 @@ struct _NDAxisIter[
     dtype: DType,
     forward: Bool = True,
 ]():
+    # TODO:
+    # 1. Use `length` (`index`) instead of `size` (`offset`) for the
+    # length (counter) of the iterator.
+    # 2. Return a view instead of copy if possible (when Bufferable is supported).
+    # 3. Add an argument in `__init__()` to specify the starting offset or index.
     """
     An iterator yielding 1-d array by axis.
+    The yielded array is garanteed to be contiguous on memory,
+    and it is a view of the original array if possible.
 
     It can be used when a function reduces the dimension of the array by axis.
 

From 93e069404a325b739fc6b059b9b190d55ebb1aeb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ZHU=20Yuhao=20=E6=9C=B1=E5=AE=87=E6=B5=A9?=
 <dr.yuhao.zhu@outlook.com>
Date: Fri, 14 Feb 2025 20:08:29 +0100
Subject: [PATCH 04/11] Add apply_func

---
 numojo/core/ndarray.mojo                 |  1 -
 numojo/core/utility.mojo                 | 32 ++++++++++++++++++++++++
 numojo/routines/math/extrema.mojo        |  1 -
 numojo/routines/statistics/averages.mojo | 20 +++++++++++++++
 4 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/numojo/core/ndarray.mojo b/numojo/core/ndarray.mojo
index b4c73d10..0d8bc501 100644
--- a/numojo/core/ndarray.mojo
+++ b/numojo/core/ndarray.mojo
@@ -4246,7 +4246,6 @@ struct _NDAxisIter[
 struct _NDIter[
     is_mutable: Bool, //, origin: Origin[is_mutable], dtype: DType
 ]():
-    # TODO: Combine into `_NDAxisIter` with `axis=ndim-1`.
     """
     An iterator yielding the array elements according to the order.
     """
diff --git a/numojo/core/utility.mojo b/numojo/core/utility.mojo
index c9702816..ac5dca10 100644
--- a/numojo/core/utility.mojo
+++ b/numojo/core/utility.mojo
@@ -308,6 +308,38 @@ fn _traverse_iterative_setter[
             index[d] = 0
 
 
+# ===----------------------------------------------------------------------=== #
+# Apply a function to NDArray by axis
+# ===----------------------------------------------------------------------=== #
+
+
+fn apply_func_on_array_with_dim_reduction[
+    dtype: DType,
+    func: fn[dtype_func: DType] (NDArray[dtype_func]) -> Scalar[dtype_func],
+](a: NDArray[dtype], axis: Int) raises -> NDArray[dtype]:
+    var res = NDArray[dtype](a.shape._pop(axis=axis))
+    var offset = 0
+    for i in a.iter_by_axis(axis=axis):
+        (res._buf.ptr + offset).init_pointee_copy(func[dtype](i))
+        offset += 1
+    return res^
+
+
+fn apply_func_on_array_with_dim_reduction[
+    dtype: DType, //,
+    returned_dtype: DType,
+    func: fn[dtype_func: DType, //, returned_dtype_func: DType] (
+        NDArray[dtype_func]
+    ) raises -> Scalar[returned_dtype_func],
+](a: NDArray[dtype], axis: Int) raises -> NDArray[returned_dtype]:
+    var res = NDArray[returned_dtype](a.shape._pop(axis=axis))
+    var offset = 0
+    for i in a.iter_by_axis(axis=axis):
+        (res._buf.ptr + offset).init_pointee_copy(func[returned_dtype](i))
+        offset += 1
+    return res^
+
+
 # ===----------------------------------------------------------------------=== #
 # NDArray conversions
 # ===----------------------------------------------------------------------=== #
diff --git a/numojo/routines/math/extrema.mojo b/numojo/routines/math/extrema.mojo
index b9a2dcfa..512ea443 100644
--- a/numojo/routines/math/extrema.mojo
+++ b/numojo/routines/math/extrema.mojo
@@ -51,7 +51,6 @@ fn max[
             slices.append(Slice(0, shape[i]))
         else:
             slices.append(Slice(0, 0))
-    print(result_shape.__str__())
 
     slices[axis] = Slice(0, 1)
 
diff --git a/numojo/routines/statistics/averages.mojo b/numojo/routines/statistics/averages.mojo
index 2b7b17b1..e750dc08 100644
--- a/numojo/routines/statistics/averages.mojo
+++ b/numojo/routines/statistics/averages.mojo
@@ -26,6 +26,26 @@ import numojo.routines.math.misc as misc
 from numojo.routines.sorting import sort
 
 
+fn mean_1d[
+    dtype: DType, //, returned_dtype: DType = DType.float64
+](a: NDArray[dtype]) raises -> Scalar[returned_dtype]:
+    """
+    Calculate the arithmetic average of all items in the array.
+
+    Parameters:
+        dtype: The element type.
+        returned_dtype: The returned data type, defaulting to float64.
+
+    Args:
+        a: NDArray.
+
+    Returns:
+        A scalar defaulting to float64.
+
+    """
+    return sum(a).cast[returned_dtype]() / a.size
+
+
 fn mean[
     dtype: DType, //, returned_dtype: DType = DType.float64
 ](a: NDArray[dtype]) raises -> Scalar[returned_dtype]:

From fd34e61ad2f6b51193f7a99d22ecc0485d3e7b7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ZHU=20Yuhao=20=E6=9C=B1=E5=AE=87=E6=B5=A9?=
 <dr.yuhao.zhu@outlook.com>
Date: Fri, 14 Feb 2025 21:27:48 +0100
Subject: [PATCH 05/11] Add `ith()` for NDAxisIter

---
 numojo/core/ndarray.mojo | 76 +++++++++++++++++++++++++++++++---------
 1 file changed, 59 insertions(+), 17 deletions(-)

diff --git a/numojo/core/ndarray.mojo b/numojo/core/ndarray.mojo
index 0d8bc501..88f8fd33 100644
--- a/numojo/core/ndarray.mojo
+++ b/numojo/core/ndarray.mojo
@@ -4101,10 +4101,10 @@ struct _NDAxisIter[
     forward: Bool = True,
 ]():
     # TODO:
-    # 1. Use `length` (`index`) instead of `size` (`offset`) for the
-    # length (counter) of the iterator.
-    # 2. Return a view instead of copy if possible (when Bufferable is supported).
-    # 3. Add an argument in `__init__()` to specify the starting offset or index.
+    # - Use `length` (`index`) instead of `size` (`offset`) for the
+    #   length (counter) of the iterator.
+    # - Return a view instead of copy if possible (when Bufferable is supported).
+    # - Add an argument in `__init__()` to specify the starting offset or index.
     """
     An iterator yielding 1-d array by axis.
     The yielded array is garanteed to be contiguous on memory,
@@ -4142,6 +4142,7 @@ struct _NDAxisIter[
 
     var ptr: UnsafePointer[Scalar[dtype]]
     var axis: Int
+    var length: Int
     var size: Int
     var ndim: Int
     var shape: NDArrayShape
@@ -4149,7 +4150,8 @@ struct _NDAxisIter[
     """Strides of array or view. It is not necessarily compatible with shape."""
     var strides_by_axis: NDArrayStrides
     """Strides by axis according to shape of view."""
-    var offset: Int
+    var index: Int
+    """Status counter."""
     var size_of_res: Int
     """Size of the result 1-d array."""
 
@@ -4177,9 +4179,9 @@ struct _NDAxisIter[
             raise Error("Axis must be in the range of [0, ndim).")
 
         self.size_of_res = shape[axis]
-        self.offset = 0 if forward else size - self.size_of_res
         self.ptr = ptr
         self.axis = axis
+        self.length = size // self.size_of_res
         self.size = size
         self.ndim = ndim
         self.shape = shape
@@ -4192,13 +4194,14 @@ struct _NDAxisIter[
             if i != axis:
                 (self.strides_by_axis._buf + i).init_pointee_copy(temp)
                 temp *= shape[i]
+        self.index = 0 if forward else self.length - 1
 
     fn __has_next__(self) -> Bool:
         @parameter
         if forward:
-            return self.offset < self.size
+            return self.index < self.length
         else:
-            return self.offset > 0 - self.size_of_res
+            return self.index >= 0
 
     fn __iter__(self) -> Self:
         return self
@@ -4206,29 +4209,26 @@ struct _NDAxisIter[
     fn __len__(self) -> Int:
         @parameter
         if forward:
-            return (self.size - self.offset) // self.size_of_res
+            return self.length - self.index
         else:
-            return self.offset // self.size_of_res + 1
+            return self.index
 
     fn __next__(mut self) raises -> NDArray[dtype]:
         var res = NDArray[dtype](Shape(self.size_of_res))
-        var current_offset = self.offset
+        var current_index = self.index
 
         @parameter
         if forward:
-            self.offset += self.size_of_res
+            self.index += 1
         else:
-            self.offset -= self.size_of_res
+            self.index -= 1
 
-        var remainder = current_offset
+        var remainder = current_index * self.size_of_res
         var item = Item(ndim=self.ndim, initialized=True)
-
         for i in range(self.axis):
             item[i], remainder = divmod(remainder, self.strides_by_axis[i])
-
         for i in range(self.axis + 1, self.ndim):
             item[i], remainder = divmod(remainder, self.strides_by_axis[i])
-
         item[self.axis], remainder = divmod(
             remainder, self.strides_by_axis[self.axis]
         )
@@ -4241,6 +4241,48 @@ struct _NDAxisIter[
 
         return res^
 
+    fn ith(
+        self, index: Int
+    ) raises -> Tuple[NDArray[DType.index], NDArray[dtype]]:
+        """
+        Gets the i-th item of the iterator, including the offsets and elements.
+
+        Args:
+            index: The index of the item. It must be non-negative.
+
+        Returns:
+            Offsets and elements of the i-th item.
+        """
+        var offsets = NDArray[DType.index](Shape(self.size_of_res))
+        var elements = NDArray[dtype](Shape(self.size_of_res))
+
+        if (index >= self.length) or (index < 0):
+            raise Error(
+                String(
+                    "\nError in `NDAxisIter.ith()`: "
+                    "Index ({}) must be in the range of [0, {})"
+                ).format(index, self.length)
+            )
+
+        var remainder = index * self.size_of_res
+        var item = Item(ndim=self.ndim, initialized=True)
+        for i in range(self.axis):
+            item[i], remainder = divmod(remainder, self.strides_by_axis[i])
+        for i in range(self.axis + 1, self.ndim):
+            item[i], remainder = divmod(remainder, self.strides_by_axis[i])
+        item[self.axis], remainder = divmod(
+            remainder, self.strides_by_axis[self.axis]
+        )
+
+        var offset = 0
+        for j in range(self.size_of_res):
+            offset = _get_offset(item, self.strides)
+            (offsets._buf.ptr + j).init_pointee_copy(offset)
+            (elements._buf.ptr + j).init_pointee_copy(self.ptr[offset])
+            item[self.axis] += 1
+
+        return Tuple(offsets, elements)
+
 
 @value
 struct _NDIter[

From dde24e9da860eed5b27d237b03c597fefdee2d35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ZHU=20Yuhao=20=E6=9C=B1=E5=AE=87=E6=B5=A9?=
 <dr.yuhao.zhu@outlook.com>
Date: Fri, 14 Feb 2025 22:56:29 +0100
Subject: [PATCH 06/11] Add `apply_func_on_array_without_dim_reduction` and
 `transfer_offset`

---
 numojo/core/ndarray.mojo     |  15 +----
 numojo/core/utility.mojo     | 118 ++++++++++++++++++++++++++++++++++-
 numojo/routines/sorting.mojo |  39 ++++++++++--
 3 files changed, 152 insertions(+), 20 deletions(-)

diff --git a/numojo/core/ndarray.mojo b/numojo/core/ndarray.mojo
index 88f8fd33..df9e647d 100644
--- a/numojo/core/ndarray.mojo
+++ b/numojo/core/ndarray.mojo
@@ -30,6 +30,7 @@ from numojo.core.ndstrides import NDArrayStrides
 from numojo.core.own_data import OwnData
 from numojo.core.utility import (
     _get_offset,
+    _transfer_offset,
     _traverse_iterative,
     _traverse_iterative_setter,
     to_numpy,
@@ -3294,19 +3295,7 @@ struct NDArray[dtype: DType = DType.float64](
             )
 
         if self.flags.F_CONTIGUOUS:
-            # column-major should be converted to row-major
-            # The following code can be taken out as a function that
-            # convert any index to coordinates according to the order
-            var c_stride = NDArrayStrides(shape=self.shape)
-            var c_coordinates = List[Int]()
-            var idx: Int = index
-            for i in range(c_stride.ndim):
-                var coordinate = idx // c_stride[i]
-                idx = idx - c_stride[i] * coordinate
-                c_coordinates.append(coordinate)
-
-            # Get the value by coordinates and the strides
-            return (self._buf.ptr + _get_offset(c_coordinates, self.strides))[]
+            return (self._buf.ptr + _transfer_offset(index, self.strides))[]
 
         else:
             return (self._buf.ptr + index)[]
diff --git a/numojo/core/utility.mojo b/numojo/core/utility.mojo
index ac5dca10..59397b88 100644
--- a/numojo/core/utility.mojo
+++ b/numojo/core/utility.mojo
@@ -152,6 +152,26 @@ fn _get_offset(indices: Tuple[Int, Int], strides: Tuple[Int, Int]) -> Int:
     return indices[0] * strides[0] + indices[1] * strides[1]
 
 
+fn _transfer_offset(offset: Int, strides: NDArrayStrides) raises -> Int:
+    """
+    Transfers the offset between C-contiguous and F-continuous memory layout.
+
+    Args:
+        offset: The offset of the array.
+        strides: The strides of the array.
+
+    Returns:
+        The offset of the array of a different memory layout.
+    """
+
+    var remainder = offset
+    var indices = Item(ndim=len(strides), initialized=False)
+    for i in range(len(strides)):
+        indices[i], remainder = divmod(remainder, strides[i])
+
+    return _get_offset(indices, strides._flip())
+
+
 # ===----------------------------------------------------------------------=== #
 # Functions to traverse a multi-dimensional array
 # ===----------------------------------------------------------------------=== #
@@ -315,8 +335,25 @@ fn _traverse_iterative_setter[
 
 fn apply_func_on_array_with_dim_reduction[
     dtype: DType,
-    func: fn[dtype_func: DType] (NDArray[dtype_func]) -> Scalar[dtype_func],
+    func: fn[dtype_func: DType] (NDArray[dtype_func]) raises -> Scalar[
+        dtype_func
+    ],
 ](a: NDArray[dtype], axis: Int) raises -> NDArray[dtype]:
+    """
+    Applies a function to a NDArray by axis and reduce that dimension.
+
+    Parameters:
+        dtype: The data type of the input NDArray elements.
+        func: The function to apply to the NDArray.
+
+    Args:
+        a: The NDArray to apply the function to.
+        axis: The axis to apply the function to.
+
+    Returns:
+        The NDArray with the function applied to the input NDArray by axis.
+    """
+
     var res = NDArray[dtype](a.shape._pop(axis=axis))
     var offset = 0
     for i in a.iter_by_axis(axis=axis):
@@ -332,6 +369,25 @@ fn apply_func_on_array_with_dim_reduction[
         NDArray[dtype_func]
     ) raises -> Scalar[returned_dtype_func],
 ](a: NDArray[dtype], axis: Int) raises -> NDArray[returned_dtype]:
+    """
+    Applies a function to a NDArray by axis and reduce that dimension.
+    The target data type of the returned NDArray is different from the input
+    NDArray.
+    This is a function overload.
+
+    Parameters:
+        dtype: The data type of the input NDArray elements.
+        returned_dtype: The data type of the output NDArray elements.
+        func: The function to apply to the NDArray.
+
+    Args:
+        a: The NDArray to apply the function to.
+        axis: The axis to apply the function to.
+
+    Returns:
+        The NDArray with the function applied to the input NDArray by axis.
+    """
+
     var res = NDArray[returned_dtype](a.shape._pop(axis=axis))
     var offset = 0
     for i in a.iter_by_axis(axis=axis):
@@ -340,6 +396,66 @@ fn apply_func_on_array_with_dim_reduction[
     return res^
 
 
+fn apply_func_on_array_without_dim_reduction[
+    dtype: DType,
+    func: fn[dtype_func: DType] (NDArray[dtype_func]) raises -> NDArray[
+        dtype_func
+    ],
+](a: NDArray[dtype], axis: Int) raises -> NDArray[dtype]:
+    """
+    Applies a function to a NDArray by axis without reducing that dimension.
+    The resulting array will have the same shape as the input array.
+
+    Parameters:
+        dtype: The data type of the input NDArray elements.
+        func: The function to apply to the NDArray.
+
+    Args:
+        a: The NDArray to apply the function to.
+        axis: The axis to apply the function to.
+
+    Returns:
+        The NDArray with the function applied to the input NDArray by axis.
+    """
+
+    # The iterator along the axis
+    var iterator = a.iter_by_axis(axis=axis)
+    # The final output array will have the same shape as the input array
+    var res = NDArray[dtype](a.shape)
+
+    if a.flags.C_CONTIGUOUS and (axis == a.ndim - 1):
+        print("The memory layout is contiguous and the axis is the last one")
+        # The memory layout is contiguous
+        var offset = 0
+        for elements in iterator:
+            var res_along_axis: NDArray[dtype] = func[dtype](elements)
+            memcpy(
+                res._buf.ptr + offset,
+                res_along_axis._buf.ptr,
+                res_along_axis.size,
+            )
+            offset += elements.size
+
+    else:
+        # The memory layout is not contiguous
+        for i in range(a.size // a.shape[axis]):
+            # The offsets of the input array in each iteration
+            var offsets: NDArray[DType.index]
+            # The elements of the input array in each iteration
+            var elements: NDArray[dtype]
+            # The array after applied the function
+            offsets, elements = iterator.ith(i)
+
+            var res_along_axis: NDArray[dtype] = func[dtype](elements)
+
+            for j in range(a.shape[axis]):
+                (res._buf.ptr + Int(offsets[j])).init_pointee_copy(
+                    (res_along_axis._buf.ptr + j)[]
+                )
+
+    return res^
+
+
 # ===----------------------------------------------------------------------=== #
 # NDArray conversions
 # ===----------------------------------------------------------------------=== #
diff --git a/numojo/routines/sorting.mojo b/numojo/routines/sorting.mojo
index 8cb5a4d7..6ac8f0d9 100644
--- a/numojo/routines/sorting.mojo
+++ b/numojo/routines/sorting.mojo
@@ -289,7 +289,34 @@ fn _sort_inplace[
         I = transpose(I, axes=transposed_axes)
 
 
-fn sort[dtype: DType](owned A: NDArray[dtype]) raises -> NDArray[dtype]:
+fn sort_1d[dtype: DType](a: NDArray[dtype]) raises -> NDArray[dtype]:
+    """
+    Sort 1-d array using quick sort method.
+    It is not guaranteed to be unstable.
+
+    Raises:
+        Error: If the input array is not 1-d.
+
+    Parameters:
+        dtype: The input element type.
+
+    Args:
+        a: An 1-d array.
+    """
+
+    if a.ndim != 1:
+        raise Error(
+            String(
+                "\nError in `sort_1d`: Input array must be 1-d, but got {}-d"
+            ).format(a.ndim)
+        )
+    res = a
+    var _I = NDArray[DType.index](a.shape)
+    _sort_inplace(res, _I, axis=0)
+    return res^
+
+
+fn sort[dtype: DType](a: NDArray[dtype]) raises -> NDArray[dtype]:
     """
     Sort NDArray using quick sort method.
     It is not guaranteed to be unstable.
@@ -300,13 +327,13 @@ fn sort[dtype: DType](owned A: NDArray[dtype]) raises -> NDArray[dtype]:
         dtype: The input element type.
 
     Args:
-        A: NDArray.
+        a: NDArray.
     """
 
-    A = ravel(A)
-    var _I = NDArray[DType.index](A.shape)
-    _sort_inplace(A, _I, axis=0)
-    return A^
+    var res = ravel(a)
+    var _I = NDArray[DType.index](a.shape)
+    _sort_inplace(res, _I, axis=0)
+    return res^
 
 
 fn sort[

From e24a9c4f791c7b2f5712ef3da823f977119a01ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ZHU=20Yuhao=20=E6=9C=B1=E5=AE=87=E6=B5=A9?=
 <dr.yuhao.zhu@outlook.com>
Date: Fri, 14 Feb 2025 23:38:59 +0100
Subject: [PATCH 07/11] Fix bugs for F-order

---
 numojo/core/ndarray.mojo | 23 ++++++++++++-----------
 numojo/core/utility.mojo |  9 ++++-----
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/numojo/core/ndarray.mojo b/numojo/core/ndarray.mojo
index df9e647d..c77066a3 100644
--- a/numojo/core/ndarray.mojo
+++ b/numojo/core/ndarray.mojo
@@ -4090,10 +4090,7 @@ struct _NDAxisIter[
     forward: Bool = True,
 ]():
     # TODO:
-    # - Use `length` (`index`) instead of `size` (`offset`) for the
-    #   length (counter) of the iterator.
     # - Return a view instead of copy if possible (when Bufferable is supported).
-    # - Add an argument in `__init__()` to specify the starting offset or index.
     """
     An iterator yielding 1-d array by axis.
     The yielded array is garanteed to be contiguous on memory,
@@ -4234,15 +4231,16 @@ struct _NDAxisIter[
         self, index: Int
     ) raises -> Tuple[NDArray[DType.index], NDArray[dtype]]:
         """
-        Gets the i-th item of the iterator, including the offsets and elements.
+        Gets the i-th 1-d array of the iterator and its indices
+        (C-order coordinates).
 
         Args:
             index: The index of the item. It must be non-negative.
 
         Returns:
-            Offsets and elements of the i-th item.
+            Coordinates and elements of the i-th 1-d array of the iterator.
         """
-        var offsets = NDArray[DType.index](Shape(self.size_of_res))
+        var indices = NDArray[DType.index](Shape(self.size_of_res))
         var elements = NDArray[dtype](Shape(self.size_of_res))
 
         if (index >= self.length) or (index < 0):
@@ -4263,14 +4261,17 @@ struct _NDAxisIter[
             remainder, self.strides_by_axis[self.axis]
         )
 
-        var offset = 0
+        var new_strides = NDArrayStrides(self.shape, order="C")
         for j in range(self.size_of_res):
-            offset = _get_offset(item, self.strides)
-            (offsets._buf.ptr + j).init_pointee_copy(offset)
-            (elements._buf.ptr + j).init_pointee_copy(self.ptr[offset])
+            (indices._buf.ptr + j).init_pointee_copy(
+                _get_offset(item, new_strides)
+            )
+            (elements._buf.ptr + j).init_pointee_copy(
+                self.ptr[_get_offset(item, self.strides)]
+            )
             item[self.axis] += 1
 
-        return Tuple(offsets, elements)
+        return Tuple(indices, elements)
 
 
 @value
diff --git a/numojo/core/utility.mojo b/numojo/core/utility.mojo
index 59397b88..10e6b771 100644
--- a/numojo/core/utility.mojo
+++ b/numojo/core/utility.mojo
@@ -424,7 +424,6 @@ fn apply_func_on_array_without_dim_reduction[
     var res = NDArray[dtype](a.shape)
 
     if a.flags.C_CONTIGUOUS and (axis == a.ndim - 1):
-        print("The memory layout is contiguous and the axis is the last one")
         # The memory layout is contiguous
         var offset = 0
         for elements in iterator:
@@ -439,17 +438,17 @@ fn apply_func_on_array_without_dim_reduction[
     else:
         # The memory layout is not contiguous
         for i in range(a.size // a.shape[axis]):
-            # The offsets of the input array in each iteration
-            var offsets: NDArray[DType.index]
+            # The indices of the input array in each iteration
+            var indices: NDArray[DType.index]
             # The elements of the input array in each iteration
             var elements: NDArray[dtype]
             # The array after applied the function
-            offsets, elements = iterator.ith(i)
+            indices, elements = iterator.ith(i)
 
             var res_along_axis: NDArray[dtype] = func[dtype](elements)
 
             for j in range(a.shape[axis]):
-                (res._buf.ptr + Int(offsets[j])).init_pointee_copy(
+                (res._buf.ptr + Int(indices[j])).init_pointee_copy(
                     (res_along_axis._buf.ptr + j)[]
                 )
 

From cf3ed0208cf82ef66a1a945b2d66fac8140af953 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ZHU=20Yuhao=20=E6=9C=B1=E5=AE=87=E6=B5=A9?=
 <dr.yuhao.zhu@outlook.com>
Date: Sat, 15 Feb 2025 00:46:49 +0100
Subject: [PATCH 08/11] Support parallelize

---
 numojo/core/ndarray.mojo | 72 ++++++++++++++++++++++++++++++------
 numojo/core/utility.mojo | 80 +++++++++++++++++++++++++---------------
 2 files changed, 112 insertions(+), 40 deletions(-)

diff --git a/numojo/core/ndarray.mojo b/numojo/core/ndarray.mojo
index c77066a3..40cadc60 100644
--- a/numojo/core/ndarray.mojo
+++ b/numojo/core/ndarray.mojo
@@ -4215,19 +4215,72 @@ struct _NDAxisIter[
             item[i], remainder = divmod(remainder, self.strides_by_axis[i])
         for i in range(self.axis + 1, self.ndim):
             item[i], remainder = divmod(remainder, self.strides_by_axis[i])
-        item[self.axis], remainder = divmod(
-            remainder, self.strides_by_axis[self.axis]
-        )
 
-        for j in range(self.size_of_res):
-            (res._buf.ptr + j).init_pointee_copy(
-                self.ptr[_get_offset(item, self.strides)]
+        if (self.axis == self.ndim - 1) & (
+            (self.shape[self.axis] == 1) or (self.strides[self.axis] == 1)
+        ):
+            # The memory layout is C-contiguous
+            memcpy(
+                res._buf.ptr,
+                self.ptr + _get_offset(item, self.strides),
+                self.size_of_res,
             )
-            item[self.axis] += 1
+
+        else:
+            for j in range(self.size_of_res):
+                (res._buf.ptr + j).init_pointee_copy(
+                    self.ptr[_get_offset(item, self.strides)]
+                )
+                item[self.axis] += 1
 
         return res^
 
-    fn ith(
+    fn ith(self, index: Int) raises -> NDArray[dtype]:
+        """
+        Gets the i-th 1-d array of the iterator.
+
+        Args:
+            index: The index of the item. It must be non-negative.
+
+        Returns:
+            Coordinates and elements of the i-th 1-d array of the iterator.
+        """
+        var elements = NDArray[dtype](Shape(self.size_of_res))
+
+        if (index >= self.length) or (index < 0):
+            raise Error(
+                String(
+                    "\nError in `NDAxisIter.ith()`: "
+                    "Index ({}) must be in the range of [0, {})"
+                ).format(index, self.length)
+            )
+
+        var remainder = index * self.size_of_res
+        var item = Item(ndim=self.ndim, initialized=True)
+        for i in range(self.axis):
+            item[i], remainder = divmod(remainder, self.strides_by_axis[i])
+        for i in range(self.axis + 1, self.ndim):
+            item[i], remainder = divmod(remainder, self.strides_by_axis[i])
+
+        if ((self.axis == self.ndim - 1) or (self.axis == 0)) & (
+            (self.shape[self.axis] == 1) or (self.strides[self.axis] == 1)
+        ):
+            # The memory layout is C-contiguous or F-contiguous
+            memcpy(
+                elements._buf.ptr,
+                self.ptr + _get_offset(item, self.strides),
+                self.size_of_res,
+            )
+        else:
+            for j in range(self.size_of_res):
+                (elements._buf.ptr + j).init_pointee_copy(
+                    self.ptr[_get_offset(item, self.strides)]
+                )
+                item[self.axis] += 1
+
+        return elements
+
+    fn ith_with_indices(
         self, index: Int
     ) raises -> Tuple[NDArray[DType.index], NDArray[dtype]]:
         """
@@ -4257,9 +4310,6 @@ struct _NDAxisIter[
             item[i], remainder = divmod(remainder, self.strides_by_axis[i])
         for i in range(self.axis + 1, self.ndim):
             item[i], remainder = divmod(remainder, self.strides_by_axis[i])
-        item[self.axis], remainder = divmod(
-            remainder, self.strides_by_axis[self.axis]
-        )
 
         var new_strides = NDArrayStrides(self.shape, order="C")
         for j in range(self.size_of_res):
diff --git a/numojo/core/utility.mojo b/numojo/core/utility.mojo
index 10e6b771..cf6b11bb 100644
--- a/numojo/core/utility.mojo
+++ b/numojo/core/utility.mojo
@@ -6,7 +6,7 @@ Implements N-DIMENSIONAL ARRAY UTILITY FUNCTIONS
 # Last updated: 2024-10-14
 # ===----------------------------------------------------------------------=== #
 
-from algorithm.functional import vectorize
+from algorithm.functional import vectorize, parallelize
 from collections import Dict
 from memory import UnsafePointer, memcpy
 from python import Python, PythonObject
@@ -389,10 +389,20 @@ fn apply_func_on_array_with_dim_reduction[
     """
 
     var res = NDArray[returned_dtype](a.shape._pop(axis=axis))
-    var offset = 0
-    for i in a.iter_by_axis(axis=axis):
-        (res._buf.ptr + offset).init_pointee_copy(func[returned_dtype](i))
-        offset += 1
+    # The iterator along the axis
+    var iterator = a.iter_by_axis(axis=axis)
+
+    @parameter
+    fn parallelized_func(i: Int):
+        try:
+            (res._buf.ptr + i).init_pointee_copy(
+                func[returned_dtype](iterator.ith(i))
+            )
+        except e:
+            print("Error in parallelized_func", e)
+
+    parallelize[parallelized_func](a.size // a.shape[axis])
+
     return res^
 
 
@@ -424,33 +434,45 @@ fn apply_func_on_array_without_dim_reduction[
     var res = NDArray[dtype](a.shape)
 
     if a.flags.C_CONTIGUOUS and (axis == a.ndim - 1):
-        # The memory layout is contiguous
-        var offset = 0
-        for elements in iterator:
-            var res_along_axis: NDArray[dtype] = func[dtype](elements)
-            memcpy(
-                res._buf.ptr + offset,
-                res_along_axis._buf.ptr,
-                res_along_axis.size,
-            )
-            offset += elements.size
+        # The memory layout is C-contiguous
+        var iterator = a.iter_by_axis(axis=axis)
+
+        @parameter
+        fn parallelized_func_c(i: Int):
+            try:
+                var elements: NDArray[dtype] = func[dtype](iterator.ith(i))
+                memcpy(
+                    res._buf.ptr + i * elements.size,
+                    elements._buf.ptr,
+                    elements.size,
+                )
+            except e:
+                print("Error in parallelized_func", e)
+
+        parallelize[parallelized_func_c](a.size // a.shape[axis])
 
     else:
         # The memory layout is not contiguous
-        for i in range(a.size // a.shape[axis]):
-            # The indices of the input array in each iteration
-            var indices: NDArray[DType.index]
-            # The elements of the input array in each iteration
-            var elements: NDArray[dtype]
-            # The array after applied the function
-            indices, elements = iterator.ith(i)
-
-            var res_along_axis: NDArray[dtype] = func[dtype](elements)
-
-            for j in range(a.shape[axis]):
-                (res._buf.ptr + Int(indices[j])).init_pointee_copy(
-                    (res_along_axis._buf.ptr + j)[]
-                )
+        @parameter
+        fn parallelized_func(i: Int):
+            try:
+                # The indices of the input array in each iteration
+                var indices: NDArray[DType.index]
+                # The elements of the input array in each iteration
+                var elements: NDArray[dtype]
+                # The array after applied the function
+                indices, elements = iterator.ith_with_indices(i)
+
+                var res_along_axis: NDArray[dtype] = func[dtype](elements)
+
+                for j in range(a.shape[axis]):
+                    (res._buf.ptr + Int(indices[j])).init_pointee_copy(
+                        (res_along_axis._buf.ptr + j)[]
+                    )
+            except e:
+                print("Error in parallelized_func", e)
+
+        parallelize[parallelized_func](a.size // a.shape[axis])
 
     return res^
 

From f9ef9b3f6405af4df58f64dd9441c242c1d70142 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ZHU=20Yuhao=20=E6=9C=B1=E5=AE=87=E6=B5=A9?=
 <dr.yuhao.zhu@outlook.com>
Date: Sat, 15 Feb 2025 02:05:22 +0100
Subject: [PATCH 09/11] Implement the universal axis functions on stats
 functions and on `sort`

---
 numojo/core/ndarray.mojo                 |  40 ++++-
 numojo/routines/sorting.mojo             |  29 ++--
 numojo/routines/statistics/averages.mojo | 190 +++++++++++++++++++----
 tests/routines/test_statistics.mojo      |  54 +++++--
 4 files changed, 249 insertions(+), 64 deletions(-)

diff --git a/numojo/core/ndarray.mojo b/numojo/core/ndarray.mojo
index 40cadc60..0336c92f 100644
--- a/numojo/core/ndarray.mojo
+++ b/numojo/core/ndarray.mojo
@@ -60,14 +60,14 @@ from numojo.routines.statistics.averages import mean
 #
 # TODO: Generalize mdot, rdot to take any IxJx...xKxL and LxMx...xNxP matrix and
 #       matmul it into IxJx..xKxMx...xNxP array.
-# TODO: Add vectorization for _get_offset.
+# TODO: Consider whether we should add vectorization for _get_offset.
 # TODO: Create NDArrayView that points to the buffer of the raw array.
 #       This requires enhancement of functionalities of traits from Mojo's side.
 #       The data buffer can implement an ArrayData trait (RawData or RefData)
 #       RawData type is just a wrapper of `UnsafePointer`.
 #       RefData type has an extra property `indices`: getitem(i) -> A[I[i]].
 # TODO: Rename some variables or methods that should not be exposed to users.
-# TODO: Remove 0-d array. Raise errors if operations result in 0-d array.
+# TODO: Special checks for 0d array (numojo scalar).
 # ===----------------------------------------------------------------------===#
 
 
@@ -3639,6 +3639,17 @@ struct NDArray[dtype: DType = DType.float64](
 
         return result
 
+    fn mean[
+        returned_dtype: DType = DType.float64
+    ](self) raises -> Scalar[returned_dtype]:
+        """
+        Mean of a array.
+
+        Returns:
+            The mean of the array.
+        """
+        return mean[returned_dtype](self)
+
     fn mean[
         returned_dtype: DType = DType.float64
     ](self: Self, axis: Int) raises -> NDArray[returned_dtype]:
@@ -3654,14 +3665,31 @@ struct NDArray[dtype: DType = DType.float64](
         """
         return mean[returned_dtype](self, axis)
 
-    fn mean(self) raises -> Scalar[dtype]:
+    fn median[
+        returned_dtype: DType = DType.float64
+    ](self) raises -> Scalar[returned_dtype]:
         """
-        Mean of a array.
+        Median of a array.
 
         Returns:
-            The mean of the array as a SIMD Value of `dtype`.
+            The median of the array.
+        """
+        return median[returned_dtype](self)
+
+    fn median[
+        returned_dtype: DType = DType.float64
+    ](self: Self, axis: Int) raises -> NDArray[returned_dtype]:
+        """
+        Median of array elements over a given axis.
+
+        Args:
+            axis: The axis along which the median is performed.
+
+        Returns:
+            An NDArray.
+
         """
-        return mean[dtype](self)
+        return median[returned_dtype](self, axis)
 
     # fn nonzero(self):
     #     pass
diff --git a/numojo/routines/sorting.mojo b/numojo/routines/sorting.mojo
index 6ac8f0d9..07a95cd9 100644
--- a/numojo/routines/sorting.mojo
+++ b/numojo/routines/sorting.mojo
@@ -10,6 +10,7 @@ from numojo.core.ndarray import NDArray
 from numojo.core.ndshape import NDArrayShape
 import numojo.core.matrix as matrix
 from numojo.core.matrix import Matrix
+import numojo.core.utility as utility
 from numojo.routines.manipulation import ravel, transpose
 
 """
@@ -330,15 +331,15 @@ fn sort[dtype: DType](a: NDArray[dtype]) raises -> NDArray[dtype]:
         a: NDArray.
     """
 
-    var res = ravel(a)
-    var _I = NDArray[DType.index](a.shape)
-    _sort_inplace(res, _I, axis=0)
-    return res^
+    if a.ndim == 1:
+        return sort_1d(a)
+    else:
+        return sort_1d(ravel(a))
 
 
 fn sort[
     dtype: DType
-](owned A: NDArray[dtype], owned axis: Int) raises -> NDArray[dtype]:
+](owned a: NDArray[dtype], axis: Int) raises -> NDArray[dtype]:
     """
     Sort NDArray along the given axis using quick sort method.
     It is not guaranteed to be unstable.
@@ -349,14 +350,24 @@ fn sort[
         dtype: The input element type.
 
     Args:
-        A: NDArray to sort.
+        a: NDArray to sort.
         axis: The axis along which the array is sorted.
 
     """
 
-    var _I = NDArray[DType.index](A.shape)
-    _sort_inplace(A, _I, axis)
-    return A^
+    var normalized_axis = axis
+    if axis < 0:
+        normalized_axis += a.ndim
+    if (normalized_axis < 0) or (normalized_axis >= a.ndim):
+        raise Error(
+            String("Error in `mean`: Axis {} not in bound [-{}, {})").format(
+                axis, a.ndim, a.ndim
+            )
+        )
+
+    return utility.apply_func_on_array_without_dim_reduction[func=sort_1d](
+        a, axis=normalized_axis
+    )
 
 
 fn sort[dtype: DType](A: Matrix[dtype]) raises -> Matrix[dtype]:
diff --git a/numojo/routines/statistics/averages.mojo b/numojo/routines/statistics/averages.mojo
index e750dc08..0b7720b0 100644
--- a/numojo/routines/statistics/averages.mojo
+++ b/numojo/routines/statistics/averages.mojo
@@ -17,7 +17,7 @@ import math as mt
 from numojo.core.ndarray import NDArray
 import numojo.core.matrix as matrix
 from numojo.core.matrix import Matrix
-from numojo.core.utility import bool_to_numeric
+import numojo.core.utility as utility
 from numojo.routines.logic.comparison import greater, less
 from numojo.routines.manipulation import broadcast_to, _broadcast_back_to
 from numojo.routines.math.arithmetic import add
@@ -30,19 +30,28 @@ fn mean_1d[
     dtype: DType, //, returned_dtype: DType = DType.float64
 ](a: NDArray[dtype]) raises -> Scalar[returned_dtype]:
     """
-    Calculate the arithmetic average of all items in the array.
+    Calculate the arithmetic average of all items in the 1-d array.
+
+    Raises:
+        Error: If the array is not 1-d.
 
     Parameters:
         dtype: The element type.
         returned_dtype: The returned data type, defaulting to float64.
 
     Args:
-        a: NDArray.
+        a: A 1-d array.
 
     Returns:
         A scalar defaulting to float64.
-
     """
+    if a.ndim != 1:
+        raise Error(
+            String(
+                "\nError in `sort_1d`: Input array must be 1-d, but got {}-d"
+            ).format(a.ndim)
+        )
+
     return sum(a).cast[returned_dtype]() / a.size
 
 
@@ -85,17 +94,18 @@ fn mean[
     """
 
     var normalized_axis = axis
-
     if axis < 0:
         normalized_axis += a.ndim
-
     if (normalized_axis < 0) or (normalized_axis >= a.ndim):
-        raise Error(String("Axis {} out of bounds!").format(axis))
+        raise Error(
+            String("Error in `mean`: Axis {} not in bound [-{}, {})").format(
+                axis, a.ndim, a.ndim
+            )
+        )
 
-    return (
-        sum(a, axis=normalized_axis).astype[returned_dtype]()
-        / a.shape[normalized_axis]
-    )
+    return utility.apply_func_on_array_with_dim_reduction[
+        returned_dtype=returned_dtype, func=mean_1d
+    ](a=a, axis=normalized_axis)
 
 
 fn mean[
@@ -144,25 +154,120 @@ fn mean[
         raise Error(String("The axis can either be 1 or 0!"))
 
 
-fn mode[dtype: DType](array: NDArray[dtype]) raises -> Scalar[dtype]:
-    """Mode of all items of an array.
+fn median_1d[
+    dtype: DType, //, returned_dtype: DType = DType.float64
+](a: NDArray[dtype]) raises -> Scalar[returned_dtype]:
+    """
+    Median value of all items of a 1-d array.
+
+    Parameters:
+         dtype: The element type.
+         returned_dtype: The returned data type, defaulting to float64.
+
+    Args:
+        a: A 1-d array.
+
+    Returns:
+        The median of all of the member values of array as a SIMD Value of `dtype`.
+    """
+
+    if a.ndim != 1:
+        raise Error(
+            String(
+                "\nError in `sort_1d`: Input array must be 1-d, but got {}-d"
+            ).format(a.ndim)
+        )
+
+    var sorted_array = sort(a)
+
+    if a.size % 2 == 1:
+        return sorted_array.item(a.size // 2).cast[returned_dtype]()
+    else:
+        return (
+            sorted_array.item(a.size // 2 - 1) + sorted_array.item(a.size // 2)
+        ).cast[returned_dtype]() / 2
+
+
+fn median[
+    dtype: DType, //, returned_dtype: DType = DType.float64
+](a: NDArray[dtype]) raises -> Scalar[returned_dtype]:
+    """
+    Median value of all items of an array.
+
+    Parameters:
+         dtype: The element type.
+         returned_dtype: The returned data type, defaulting to float64.
+
+    Args:
+        a: A 1-d array.
+
+    Returns:
+        The median of all of the member values of array as a SIMD Value of `dtype`.
+    """
+
+    return median_1d[returned_dtype](ravel(a))
+
+
+fn median[
+    dtype: DType, //, returned_dtype: DType = DType.float64
+](a: NDArray[dtype], axis: Int) raises -> NDArray[returned_dtype]:
+    """
+    Returns median of the array elements along the given axis.
+
+    Parameters:
+         dtype: The element type.
+         returned_dtype: The returned data type, defaulting to float64.
+
+    Args:
+        a: An array.
+        axis: The axis along which the median is performed.
+
+    Returns:
+        Median of the array elements along the given axis.
+    """
+    var normalized_axis = axis
+    if axis < 0:
+        normalized_axis += a.ndim
+    if (normalized_axis < 0) or (normalized_axis >= a.ndim):
+        raise Error(
+            String("Error in `mean`: Axis {} not in bound [-{}, {})").format(
+                axis, a.ndim, a.ndim
+            )
+        )
+    return utility.apply_func_on_array_with_dim_reduction[
+        returned_dtype=returned_dtype, func=median_1d
+    ](a=a, axis=normalized_axis)
+
+
+fn mode_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[dtype]:
+    """Mode of all items of a 1d array.
+
+    Raises:
+        Error: If the array is not 1-d.
 
     Parameters:
         dtype: The element type.
 
     Args:
-        array: An NDArray.
+        a: An NDArray.
 
     Returns:
         The mode of all of the member values of array as a SIMD Value of `dtype`.
     """
 
-    var sorted_array: NDArray[dtype] = sort(array)
+    if a.ndim != 1:
+        raise Error(
+            String(
+                "\nError in `sort_1d`: Input array must be 1-d, but got {}-d"
+            ).format(a.ndim)
+        )
+
+    var sorted_array: NDArray[dtype] = sort(a)
     var max_count = 0
     var mode_value = sorted_array.item(0)
     var current_count = 1
 
-    for i in range(1, array.num_elements()):
+    for i in range(1, a.num_elements()):
         if sorted_array[i] == sorted_array[i - 1]:
             current_count += 1
         else:
@@ -172,36 +277,55 @@ fn mode[dtype: DType](array: NDArray[dtype]) raises -> Scalar[dtype]:
             current_count = 1
 
     if current_count > max_count:
-        mode_value = sorted_array.item(array.num_elements() - 1)
+        mode_value = sorted_array.item(a.num_elements() - 1)
 
     return mode_value
 
 
-fn median[
-    dtype: DType, //, returned_dtype: DType = DType.float64
-](array: NDArray[dtype]) raises -> Scalar[returned_dtype]:
-    """
-    Median value of all items of an array.
+fn mode[dtype: DType](array: NDArray[dtype]) raises -> Scalar[dtype]:
+    """Mode of all items of an array.
 
     Parameters:
-         dtype: The element type.
-         returned_dtype: The returned data type, defaulting to float64.
+        dtype: The element type.
 
     Args:
         array: An NDArray.
 
     Returns:
-        The median of all of the member values of array as a SIMD Value of `dtype`.
+        The mode of all of the member values of array as a SIMD Value of `dtype`.
     """
-    var sorted_array = sort(array)
 
-    if array.size % 2 == 1:
-        return sorted_array.item(array.size // 2).cast[returned_dtype]()
-    else:
-        return (
-            sorted_array.item(array.size // 2 - 1)
-            + sorted_array.item(array.size // 2)
-        ).cast[returned_dtype]() / 2
+    return mode_1d(ravel(array))
+
+
+fn mode[dtype: DType](a: NDArray[dtype], axis: Int) raises -> NDArray[dtype]:
+    """
+    Returns mode of the array elements along the given axis.
+
+    Parameters:
+        dtype: The element type.
+
+    Args:
+        a: An NDArray.
+        axis: The axis along which the mode is performed.
+
+    Returns:
+        Mode of the array elements along the given axis.
+    """
+
+    var normalized_axis = axis
+    if axis < 0:
+        normalized_axis += a.ndim
+    if (normalized_axis < 0) or (normalized_axis >= a.ndim):
+        raise Error(
+            String("Error in `mean`: Axis {} not in bound [-{}, {})").format(
+                axis, a.ndim, a.ndim
+            )
+        )
+
+    return utility.apply_func_on_array_with_dim_reduction[func=mode_1d](
+        a=a, axis=normalized_axis
+    )
 
 
 fn std[
diff --git a/tests/routines/test_statistics.mojo b/tests/routines/test_statistics.mojo
index 097533ed..2497278b 100644
--- a/tests/routines/test_statistics.mojo
+++ b/tests/routines/test_statistics.mojo
@@ -12,43 +12,65 @@ from utils_for_test import check, check_is_close
 
 def test_mean_median_var_std():
     var np = Python.import_module("numpy")
+    var sp = Python.import_module("scipy")
     var A = nm.random.randn(3, 4, 5)
     var Anp = A.to_numpy()
 
     assert_true(
-        np.all(np.isclose(nm.mean(A), np.mean(Anp), atol=0.1)),
+        np.all(np.isclose(nm.mean(A), np.mean(Anp), atol=0.001)),
         "`mean` is broken",
     )
-    for i in range(3):
+    for axis in range(3):
         check_is_close(
-            nm.mean(A, i),
-            np.mean(Anp, i),
-            String("`mean` is broken for {}-dimension".format(i)),
+            nm.mean(A, axis=axis),
+            np.mean(Anp, axis=axis),
+            String("`mean` is broken for axis {}".format(axis)),
         )
 
     assert_true(
-        np.all(np.isclose(nm.median(A), np.median(Anp), atol=0.1)),
+        np.all(np.isclose(nm.median(A), np.median(Anp), atol=0.001)),
         "`median` is broken",
     )
+    for axis in range(3):
+        check_is_close(
+            nm.median(A, axis),
+            np.median(Anp, axis),
+            String("`median` is broken for axis {}".format(axis)),
+        )
+
+    assert_true(
+        np.all(
+            np.isclose(
+                nm.mode(A), sp.stats.mode(Anp, axis=None).mode, atol=0.001
+            )
+        ),
+        "`mode` is broken",
+    )
+    for axis in range(3):
+        check_is_close(
+            nm.mode(A, axis),
+            sp.stats.mode(Anp, axis).mode,
+            String("`mode` is broken for axis {}".format(axis)),
+        )
 
     assert_true(
-        np.all(np.isclose(nm.variance(A), np.`var`(Anp), atol=0.1)),
+        np.all(np.isclose(nm.variance(A), np.`var`(Anp), atol=0.001)),
         "`variance` is broken",
     )
-    for i in range(3):
+    for axis in range(3):
         check_is_close(
-            nm.variance(A, i),
-            np.`var`(Anp, i),
-            String("`variance` is broken for {}-dimension".format(i)),
+            nm.variance(A, axis),
+            np.`var`(Anp, axis),
+            String("`variance` is broken for axis {}".format(axis)),
         )
 
     assert_true(
-        np.all(np.isclose(nm.std(A), np.std(Anp), atol=0.1)),
+        np.all(np.isclose(nm.std(A), np.std(Anp), atol=0.001)),
         "`std` is broken",
     )
-    for i in range(3):
+    for axis in range(3):
         check_is_close(
-            nm.std(A, i),
-            np.std(Anp, i),
-            String("`std` is broken for {}-dimension".format(i)),
+            nm.std(A, axis),
+            np.std(Anp, axis),
+            String("`std` is broken for axis {}".format(axis)),
         )

From 690d0eb9708a28ea36c9d1942b3688e4450c31fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ZHU=20Yuhao=20=E6=9C=B1=E5=AE=87=E6=B5=A9?=
 <dr.yuhao.zhu@outlook.com>
Date: Sat, 15 Feb 2025 13:22:31 +0100
Subject: [PATCH 10/11] Add more docs and explanations

---
 numojo/core/ndarray.mojo     | 4403 +++++++++++++++++-----------------
 numojo/core/utility.mojo     |   71 +-
 numojo/routines/sorting.mojo |    9 +
 3 files changed, 2271 insertions(+), 2212 deletions(-)

diff --git a/numojo/core/ndarray.mojo b/numojo/core/ndarray.mojo
index 0336c92f..0437f018 100644
--- a/numojo/core/ndarray.mojo
+++ b/numojo/core/ndarray.mojo
@@ -7,6 +7,31 @@
 """
 Implements basic object methods for working with N-Dimensional Array.
 """
+# ===----------------------------------------------------------------------===#
+# SECTIONS OF THE FILE:
+#
+# NDArray
+# 1. Life cycle methods.
+# 2. Indexing and slicing (get and set dunders and relevant methods).
+# 3. Operator dunders.
+# 4. IO, trait, and iterator dunders.
+# 5. Other methods (Sorted alphabetically).
+#
+# _NDArrayIter
+# _NDAxisIter
+# _NDIter
+#
+# TODO: Generalize mdot, rdot to take any IxJx...xKxL and LxMx...xNxP matrix and
+#       matmul it into IxJx..xKxMx...xNxP array.
+# TODO: Consider whether we should add vectorization for _get_offset.
+# TODO: Create NDArrayView that points to the buffer of the raw array.
+#       This requires enhancement of functionalities of traits from Mojo's side.
+#       The data buffer can implement an ArrayData trait (RawData or RefData)
+#       RawData type is just a wrapper of `UnsafePointer`.
+#       RefData type has an extra property `indices`: getitem(i) -> A[I[i]].
+# TODO: Rename some variables or methods that should not be exposed to users.
+# TODO: Special checks for 0d array (numojo scalar).
+# ===----------------------------------------------------------------------===#
 
 from algorithm import parallelize, vectorize
 import builtin.math as builtin_math
@@ -55,21 +80,6 @@ from numojo.routines.math.sums import sum, cumsum
 import numojo.routines.sorting as sorting
 from numojo.routines.statistics.averages import mean
 
-# ===----------------------------------------------------------------------===#
-# NDArray
-#
-# TODO: Generalize mdot, rdot to take any IxJx...xKxL and LxMx...xNxP matrix and
-#       matmul it into IxJx..xKxMx...xNxP array.
-# TODO: Consider whether we should add vectorization for _get_offset.
-# TODO: Create NDArrayView that points to the buffer of the raw array.
-#       This requires enhancement of functionalities of traits from Mojo's side.
-#       The data buffer can implement an ArrayData trait (RawData or RefData)
-#       RawData type is just a wrapper of `UnsafePointer`.
-#       RefData type has an extra property `indices`: getitem(i) -> A[I[i]].
-# TODO: Rename some variables or methods that should not be exposed to users.
-# TODO: Special checks for 0d array (numojo scalar).
-# ===----------------------------------------------------------------------===#
-
 
 struct NDArray[dtype: DType = DType.float64](
     Stringable, Representable, CollectionElement, Sized, Writable
@@ -301,95 +311,216 @@ struct NDArray[dtype: DType = DType.float64](
     # Getter and setter dunders and other methods
     # ===-------------------------------------------------------------------===#
 
-    fn _setitem(self, *indices: Int, val: Scalar[dtype]):
+    # ===-------------------------------------------------------------------===#
+    # Getter dunders and other getter methods
+    #
+    # 1. Basic Indexing Operations
+    # fn _getitem(self, *indices: Int) -> Scalar[dtype]                         # Direct unsafe getter
+    # fn __getitem__(self) raises -> SIMD[dtype, 1]                             # Get 0d array value
+    # fn __getitem__(self, index: Item) raises -> SIMD[dtype, 1]                # Get by coordinate list
+    #
+    # 2. Single Index Slicing
+    # fn __getitem__(self, idx: Int) raises -> Self                             # Get by single index
+    #
+    # 3. Multi-dimensional Slicing
+    # fn __getitem__(self, *slices: Slice) raises -> Self                       # Get by variable slices
+    # fn __getitem__(self, slice_list: List[Slice]) raises -> Self              # Get by list of slices
+    # fn __getitem__(self, *slices: Variant[Slice, Int]) raises -> Self         # Get by mix of slices/ints
+    #
+    # 4. Advanced Indexing
+    # fn __getitem__(self, indices: NDArray[DType.index]) raises -> Self        # Get by index array
+    # fn __getitem__(self, indices: List[Int]) raises -> Self                   # Get by list of indices
+    # fn __getitem__(self, mask: NDArray[DType.bool]) raises -> Self            # Get by boolean mask
+    # fn __getitem__(self, mask: List[Bool]) raises -> Self                     # Get by boolean list
+    #
+    # 5. Low-level Access
+    # fn item(self, owned index: Int) raises -> Scalar[dtype]                   # Get item by linear index
+    # fn item(self, *index: Int) raises -> Scalar[dtype]                        # Get item by coordinates
+    # fn load(self, owned index: Int) raises -> Scalar[dtype]                   # Load with bounds check
+    # fn load[width: Int](self, index: Int) raises -> SIMD[dtype, width]        # Load SIMD value
+    # fn load[width: Int](self, *indices: Int) raises -> SIMD[dtype, width]     # Load SIMD at coordinates
+    # ===-------------------------------------------------------------------===#
+
+    fn _getitem(self, *indices: Int) -> Scalar[dtype]:
         """
         (UNSAFE! for internal use only.)
-        Set item at indices and bypass all boundary checks.
+        Get item at indices and bypass all boundary checks.
 
         Args:
-            indices: Indices to set the value.
-            val: Value to set.
+            indices: Indices to get the value.
 
         Example:
         ```mojo
         import numojo
         var A = numojo.ones(numojo.Shape(2,3,4))
-        A._setitem(1,2,3, val=10)
+        print(A._getitem(1,2,3))
         ```
         """
         var index_of_buffer: Int = 0
         for i in range(self.ndim):
             index_of_buffer += indices[i] * self.strides._buf[i]
-        self._buf.ptr[index_of_buffer] = val
+        return self._buf.ptr[index_of_buffer]
 
-    fn __setitem__(mut self, idx: Int, val: Self) raises:
+    fn __getitem__(self) raises -> SIMD[dtype, 1]:
         """
-        Set a slice of array with given array.
+        Gets the value of the 0darray.
+
+        Raises:
+            Error: If the array is not 0-d.
+
+        Returns:
+            The value of the 0darray.
+        """
+        if self.ndim != 0:
+            raise Error(
+                "Error in `numojo.NDArray.__getitem__()`: \n"
+                "Cannot get value without index.\n"
+            )
+        return self._buf.ptr[]
+
+    fn __getitem__(self, index: Item) raises -> SIMD[dtype, 1]:
+        """
+        Get the value at the index list.
 
         Args:
-            idx: Index to set.
-            val: Value to set.
+            index: Index list.
+
+        Returns:
+            The value at the index list.
+        """
+        if index.__len__() != self.ndim:
+            var message = String(
+                "Error: Length of `index` do not match the number of"
+                " dimensions!\n"
+                "Length of indices is {}.\n"
+                "The number of dimensions is {}."
+            ).format(index.__len__(), self.ndim)
+            raise Error(message)
+
+        for i in range(index.__len__()):
+            if index[i] >= self.shape[i]:
+                var message = String(
+                    "Error: `index` exceeds the size!\n"
+                    "For {}-the mension:\n"
+                    "The index is {}.\n"
+                    "The size of the dimensions is {}"
+                ).format(i, index[i], self.shape[i])
+                raise Error(message)
+        var idx: Int = _get_offset(index, self.strides)
+        return self._buf.ptr.load[width=1](idx)
+
+    fn __getitem__(self, idx: Int) raises -> Self:
+        """
+        Retrieve a slice of the array corresponding to the index at the first dimension.
+
+        Args:
+            idx: Index to get the slice.
+
+        Returns:
+            A slice of the array.
+
+        Raises:
+            Error: If the array is 0-d.
 
         Example:
-        ```mojo
-        import numojo as nm
-        var A = nm.random.rand[nm.i16](3, 2)
-        var B = nm.random.rand[nm.i16](3)
-        A[1:4] = B
-        ```
+            `arr[1]` returns the second row of the array.
         """
 
-        var normalized_index = idx
-        if normalized_index < 0:
-            normalized_index = self.shape[0] + idx
-        if normalized_index >= self.shape[0]:
-            raise Error("Index out of bounds")
+        var slice_list = List[Slice]()
+        slice_list.append(Slice(idx, idx + 1, 1))
 
         # If the ndim is 0, then it is a numojo scalar (0darray).
-        # Not allow to set value to 0darray.
-        if self.ndim == 0 or val.ndim == 0:
+        if self.ndim == 0:
             raise Error(
-                "Error in `numojo.NDArray.__setitem__("
-                "self, idx: Int, val: Self)`: \n"
-                "Cannot set values to a 0-d array.\n"
+                "Error in `numojo.NDArray.__getitem__(self, idx: Int)`: \n"
+                "Cannot slice a 0-d array.\n"
             )
 
-        var slice_list = List[Slice]()
-        if idx >= self.shape[0]:
-            var message = String(
-                "Error: Slice value exceeds the array shape!\n"
-                "The {}-th dimension is of size {}.\n"
-                "The slice goes from {} to {}"
-            ).format(
-                0,
-                self.shape[0],
-                idx,
-                idx + 1,
-            )
-            raise Error(message)
-        slice_list.append(Slice(idx, idx + 1, 1))
-        if self.ndim > 1:
+        var narr: Self
+
+        # If the ndim is 1
+        if self.ndim == 1:
+            narr = creation._0darray[dtype](self._buf.ptr[idx])
+
+        else:
             for i in range(1, self.ndim):
                 var size_at_dim: Int = self.shape[i]
                 slice_list.append(Slice(0, size_at_dim, 1))
 
-        var n_slices: Int = len(slice_list)
+            narr = self.__getitem__(slice_list)
+
+        return narr
+
+    fn __getitem__(self, owned *slices: Slice) raises -> Self:
+        """
+        Retrieve slices of an array from variadic slices.
+
+        Args:
+            slices: Variadic slices.
+
+        Returns:
+            A slice of the array.
+
+        Example:
+            `arr[1:3, 2:4]` returns the corresponding sliced array (2 x 2).
+        """
+
+        var n_slices: Int = slices.__len__()
+        if n_slices > self.ndim:
+            raise Error("Error: No of slices exceed the array dimensions.")
+        var slice_list: List[Slice] = List[Slice]()
+        for i in range(len(slices)):
+            slice_list.append(slices[i])
+
+        if n_slices < self.ndim:
+            for i in range(n_slices, self.ndim):
+                slice_list.append(Slice(0, self.shape[i], 1))
+
+        var narr: Self = self[slice_list]
+        return narr
+
+    fn __getitem__(self, owned slice_list: List[Slice]) raises -> Self:
+        """
+        Retrieve slices of an array from list of slices.
+
+        Args:
+            slice_list: List of slices.
+
+        Returns:
+            A slice of the array.
+
+        Example:
+            `arr[1:3, 2:4]` returns the corresponding sliced array (2 x 2).
+        """
+
+        var n_slices: Int = slice_list.__len__()
+        if n_slices > self.ndim or n_slices < self.ndim:
+            raise Error("Error: No of slices do not match shape")
+
         var ndims: Int = 0
-        var count: Int = 0
         var spec: List[Int] = List[Int]()
-        for i in range(n_slices):
-            if slice_list[i].step is None:
-                raise Error(String("Step of slice is None."))
-            var slice_len: Int = (
-                (slice_list[i].end.value() - slice_list[i].start.value())
-                / slice_list[i].step.or_else(1)
-            ).__int__()
+        var count: Int = 0
+
+        var slices: List[Slice] = self._adjust_slice(slice_list)
+        for i in range(slices.__len__()):
+            if (
+                slices[i].start.value() >= self.shape[i]
+                or slices[i].end.value() > self.shape[i]
+            ):
+                raise Error("Error: Slice value exceeds the array shape")
+            var slice_len: Int = len(
+                range(
+                    slices[i].start.value(),
+                    slices[i].end.value(),
+                    slices[i].step.or_else(1),
+                )
+            )
             spec.append(slice_len)
             if slice_len != 1:
                 ndims += 1
             else:
                 count += 1
-        if count == slice_list.__len__():
+        if count == slices.__len__():
             ndims = 1
 
         var nshape: List[Int] = List[Int]()
@@ -405,272 +536,242 @@ struct NDArray[dtype: DType = DType.float64](
                 j += 1
             if j >= self.ndim:
                 break
-            var slice_len: Int = (
-                (slice_list[j].end.value() - slice_list[j].start.value())
-                / slice_list[j].step.or_else(1)
-            ).__int__()
+            var slice_len: Int = len(
+                range(
+                    slices[j].start.value(),
+                    slices[j].end.value(),
+                    slices[j].step.or_else(1),
+                )
+            )
             nshape.append(slice_len)
             nnum_elements *= slice_len
-            ncoefficients.append(
-                self.strides[j] * slice_list[j].step.or_else(1)
-            )
+            ncoefficients.append(self.strides[j] * slices[j].step.value())
             j += 1
 
-        # TODO: We can remove this check after we have support for broadcasting
-        for i in range(ndims):
-            if nshape[i] != val.shape[i]:
-                var message = String(
-                    "Error: Shape mismatch!\n"
-                    "Cannot set the array values with given array.\n"
-                    "The {}-th dimension of the array is of shape {}.\n"
-                    "The {}-th dimension of the value is of shape {}."
-                ).format(nshape[i], val.shape[i])
-                raise Error(message)
+        if count == slices.__len__():
+            nshape.append(1)
+            nnum_elements = 1
+            ncoefficients.append(1)
 
         var noffset: Int = 0
+
         if self.flags.C_CONTIGUOUS:
             noffset = 0
             for i in range(ndims):
                 var temp_stride: Int = 1
-                for j in range(i + 1, ndims):
+                for j in range(i + 1, ndims):  # temp
                     temp_stride *= nshape[j]
                 nstrides.append(temp_stride)
-            for i in range(slice_list.__len__()):
-                noffset += slice_list[i].start.value() * self.strides[i]
+            for i in range(slices.__len__()):
+                noffset += slices[i].start.value() * self.strides[i]
+
         elif self.flags.F_CONTIGUOUS:
             noffset = 0
             nstrides.append(1)
             for i in range(0, ndims - 1):
                 nstrides.append(nstrides[i] * nshape[i])
-            for i in range(slice_list.__len__()):
-                noffset += slice_list[i].start.value() * self.strides[i]
+            for i in range(slices.__len__()):
+                noffset += slices[i].start.value() * self.strides[i]
+
+        var narr = Self(
+            offset=noffset,
+            shape=nshape,
+            strides=nstrides,
+        )
 
         var index = List[Int]()
         for _ in range(ndims):
             index.append(0)
 
-        _traverse_iterative_setter[dtype](
-            val, self, nshape, ncoefficients, nstrides, noffset, index
+        _traverse_iterative[dtype](
+            self, narr, nshape, ncoefficients, nstrides, noffset, index, 0
         )
 
-    fn __setitem__(mut self, index: Item, val: Scalar[dtype]) raises:
-        """
-        Set the value at the index list.
+        return narr
 
-        Args:
-            index: Index list.
-            val: Value to set.
+    fn __getitem__(self, owned *slices: Variant[Slice, Int]) raises -> Self:
         """
-        if index.__len__() != self.ndim:
-            var message = String(
-                "Error: Length of `index` does not match the number of"
-                " dimensions!\n"
-                "Length of indices is {}.\n"
-                "The array dimension is {}."
-            ).format(index.__len__(), self.ndim)
-            raise Error(message)
-        for i in range(index.__len__()):
-            if index[i] >= self.shape[i]:
-                var message = String(
-                    "Error: `index` exceeds the size!\n"
-                    "For {}-th dimension:\n"
-                    "The index value is {}.\n"
-                    "The size of the corresponding dimension is {}"
-                ).format(i, index[i], self.shape[i])
-                raise Error(message)
-        var idx: Int = _get_offset(index, self.strides)
-        self._buf.ptr.store(idx, val)
+        Get items by a series of either slices or integers.
 
-    # only works if array is called as array.__setitem__(), mojo compiler doesn't parse it implicitly
-    fn __setitem__(
-        mut self, mask: NDArray[DType.bool], value: Scalar[dtype]
-    ) raises:
-        """
-        Set the value of the array at the indices where the mask is true.
+        A decrease of dimensions may or may not happen when `__getitem__` is
+        called on an ndarray. An ndarray of X-D array can become Y-D array after
+        `__getitem__` where `Y <= X`.
 
-        Args:
-            mask: Boolean mask array.
-            value: Value to set.
-        """
-        if (
-            mask.shape != self.shape
-        ):  # this behavious could be removed potentially
-            raise Error("Mask and array must have the same shape")
+        Whether the dimension decerases or not depends on:
+        1. What types of arguments are passed into `__getitem__`.
+        2. The number of arguments that are passed in `__getitem__`.
 
-        for i in range(mask.size):
-            if mask._buf.ptr.load[width=1](i):
-                self._buf.ptr.store(i, value)
+        PRINCIPAL: The number of dimensions to be decreased is determined by
+        the number of `Int` passed in `__getitem__`.
 
-    fn __setitem__(mut self, *slices: Slice, val: Self) raises:
-        """
-        Retrieve slices of an array from variadic slices.
+        For example, `A` is a 10x10x10 ndarray (3-D). Then,
 
-        Args:
-            slices: Variadic slices.
-            val: Value to set.
+        - `A[1, 2, 3]` leads to a 0-D array (scalar), since there are 3 integers.
+        - `A[1, 2]` leads to a 1-D array (vector), since there are 2 integers,
+        so the dimension decreases by 2.
+        - `A[1]` leads to a 2-D array (matrix), since there is 1 integer, so the
+        dimension decreases by 1.
 
-        Example:
-            `arr[1:3, 2:4]` returns the corresponding sliced array (2 x 2).
-        """
-        var slice_list: List[Slice] = List[Slice]()
-        for i in range(slices.__len__()):
-            slice_list.append(slices[i])
-        self.__setitem__(slices=slice_list, val=val)
+        The number of dimensions will not decrease when Slice is passed in
+        `__getitem__` or no argument is passed in for a certain dimension
+        (it is an implicit slide and a slide of all items will be used).
 
-    fn __setitem__(mut self, slices: List[Slice], val: Self) raises:
-        """
-        Sets the slices of an array from list of slices and array.
+        Take the same example `A` with 10x10x10 in shape. Then,
 
-        Args:
-            slices: List of slices.
-            val: Value to set.
+        - `A[1:4, 2:5, 3:6]`, leads to a 3-D array (no decrease in dimension),
+        since there are 3 slices.
+        - `A[2:8]`, leads to a 3-D array (no decrease in dimension), since there
+        are 1 explicit slice and 2 implicit slices.
 
-        Example:
-        ```console
-        >>> var a = nm.arange[i8](16).reshape(Shape(4, 4))
-        print(a)
-        [[      0       1       2       3       ]
-         [      4       5       6       7       ]
-         [      8       9       10      11      ]
-         [      12      13      14      15      ]]
-        2-D array  Shape: [4, 4]  DType: int8  C-cont: True  F-cont: False  own data: True
-        >>> a[2:4, 2:4] = a[0:2, 0:2]
-        print(a)
-        [[      0       1       2       3       ]
-         [      4       5       6       7       ]
-         [      8       9       0       1       ]
-         [      12      13      4       5       ]]
-        2-D array  Shape: [4, 4]  DType: int8  C-cont: True  F-cont: False  own data: True
-        ```
-        """
-        var n_slices: Int = len(slices)
-        var ndims: Int = 0
-        var count: Int = 0
-        var spec: List[Int] = List[Int]()
-        var slice_list: List[Slice] = self._adjust_slice(slices)
-        for i in range(n_slices):
-            if (
-                slice_list[i].start.value() >= self.shape[i]
-                or slice_list[i].end.value() > self.shape[i]
-            ):
-                var message = String(
-                    "Error: Slice value exceeds the array shape!\n"
-                    "The {}-th dimension is of size {}.\n"
-                    "The slice goes from {} to {}"
-                ).format(
-                    i,
-                    self.shape[i],
-                    slice_list[i].start.value(),
-                    slice_list[i].end.value(),
-                )
-                raise Error(message)
-            # if slice_list[i].step is None:
-            #     raise Error(String("Step of slice is None."))
-            var slice_len: Int = (
-                (slice_list[i].end.value() - slice_list[i].start.value())
-                / slice_list[i].step.or_else(1)
-            ).__int__()
-            spec.append(slice_len)
-            if slice_len != 1:
-                ndims += 1
-            else:
-                count += 1
-        if count == slice_list.__len__():
-            ndims = 1
+        When there is a mixture of int and slices passed into `__getitem__`,
+        the number of integers will be the number of dimensions to be decreased.
+        Example,
 
-        var nshape: List[Int] = List[Int]()
-        var ncoefficients: List[Int] = List[Int]()
-        var nstrides: List[Int] = List[Int]()
-        var nnum_elements: Int = 1
+        - `A[1:4, 2, 2]`, leads to a 1-D array (vector), since there are 2
+        integers, so the dimension decreases by 2.
 
-        var j: Int = 0
-        count = 0
-        for _ in range(ndims):
-            while spec[j] == 1:
-                count += 1
-                j += 1
-            if j >= self.ndim:
-                break
-            var slice_len: Int = (
-                (slice_list[j].end.value() - slice_list[j].start.value())
-                / slice_list[j].step.or_else(1)
-            ).__int__()
-            nshape.append(slice_len)
-            nnum_elements *= slice_len
-            ncoefficients.append(
-                self.strides[j] * slice_list[j].step.or_else(1)
-            )
-            j += 1
+        Note that, even though a slice contains one row, it does not reduce the
+        dimensions. Example,
 
-        # TODO: We can remove this check after we have support for broadcasting
-        for i in range(ndims):
-            if nshape[i] != val.shape[i]:
-                var message = String(
-                    "Error: Shape mismatch!\n"
-                    "For {}-th dimension: \n"
-                    "The size of the array is {}.\n"
-                    "The size of the input value is {}."
-                ).format(i, nshape[i], val.shape[i])
-                raise Error(message)
+        - `A[1:2, 2:3, 3:4]`, leads to a 3-D array (no decrease in dimension),
+        since there are 3 slices.
 
-        var noffset: Int = 0
-        if self.flags.C_CONTIGUOUS:
-            noffset = 0
-            for i in range(ndims):
-                var temp_stride: Int = 1
-                for j in range(i + 1, ndims):  # temp
-                    temp_stride *= nshape[j]
-                nstrides.append(temp_stride)
-            for i in range(slice_list.__len__()):
-                noffset += slice_list[i].start.value() * self.strides[i]
-        elif self.flags.F_CONTIGUOUS:
-            noffset = 0
-            nstrides.append(1)
-            for i in range(0, ndims - 1):
-                nstrides.append(nstrides[i] * nshape[i])
-            for i in range(slice_list.__len__()):
-                noffset += slice_list[i].start.value() * self.strides[i]
+        Note that, when the number of integers equals to the number of
+        dimensions, the final outcome is an 0-D array instead of a number.
+        The user has to upack the 0-D array with the method`A.item(0)` to get the
+        corresponding number.
+        This behavior is different from numpy where the latter returns a number.
 
-        var index = List[Int]()
-        for _ in range(ndims):
-            index.append(0)
+        More examples for 1-D, 2-D, and 3-D arrays.
 
-        _traverse_iterative_setter[dtype](
-            val, self, nshape, ncoefficients, nstrides, noffset, index
-        )
+        ```console
+        A is a matrix
+        [[      -128    -95     65      -11     ]
+        [      8       -72     -116    45      ]
+        [      45      111     -30     4       ]
+        [      84      -120    -115    7       ]]
+        2-D array  Shape: [4, 4]  DType: int8
 
-    fn __setitem__(mut self, *slices: Variant[Slice, Int], val: Self) raises:
-        """
-        Get items by a series of either slices or integers.
+        A[0]
+        [       -128    -95     65      -11     ]
+        1-D array  Shape: [4]  DType: int8
 
-        Args:
-            slices: Variadic slices or integers.
-            val: Value to set.
+        A[0, 1]
+        -95
+        0-D array  Shape: [0]  DType: int8
 
-        Example:
-        ```console
-        >>> var a = nm.arange[i8](16).reshape(Shape(4, 4))
-        print(a)
-        [[      0       1       2       3       ]
-         [      4       5       6       7       ]
-         [      8       9       10      11      ]
-         [      12      13      14      15      ]]
-        2-D array  Shape: [4, 4]  DType: int8  C-cont: True  F-cont: False  own data: True
-        >>> a[0, Slice(2, 4)] = a[3, Slice(0, 2)]
-        print(a)
-        [[      0       1       12      13      ]
-         [      4       5       6       7       ]
-         [      8       9       10      11      ]
-         [      12      13      14      15      ]]
-        2-D array  Shape: [4, 4]  DType: int8  C-cont: True  F-cont: False  own data: True
+        A[Slice(1,3)]
+        [[      8       -72     -116    45      ]
+        [      45      111     -30     4       ]]
+        2-D array  Shape: [2, 4]  DType: int8
+
+        A[1, Slice(2,4)]
+        [       -116    45      ]
+        1-D array  Shape: [2]  DType: int8
+
+        A[Slice(1,3), Slice(1,3)]
+        [[      -72     -116    ]
+        [      111     -30     ]]
+        2-D array  Shape: [2, 2]  DType: int8
+
+        A.item(0,1) as Scalar
+        -95
+
+        ==============================
+        A is a vector
+        [       43      -127    -30     -111    ]
+        1-D array  Shape: [4]  DType: int8
+
+        A[0]
+        43
+        0-D array  Shape: [0]  DType: int8
+
+        A[Slice(1,3)]
+        [       -127    -30     ]
+        1-D array  Shape: [2]  DType: int8
+
+        A.item(0) as Scalar
+        43
+
+        ==============================
+        A is a 3darray
+        [[[     -22     47      22      110     ]
+        [     88      6       -105    39      ]
+        [     -22     51      105     67      ]
+        [     -61     -116    60      -44     ]]
+        [[     33      65      125     -35     ]
+        [     -65     123     57      64      ]
+        [     38      -110    33      98      ]
+        [     -59     -17     68      -6      ]]
+        [[     -68     -58     -37     -86     ]
+        [     -4      101     104     -113    ]
+        [     103     1       4       -47     ]
+        [     124     -2      -60     -105    ]]
+        [[     114     -110    0       -30     ]
+        [     -58     105     7       -10     ]
+        [     112     -116    66      69      ]
+        [     83      -96     -124    48      ]]]
+        3-D array  Shape: [4, 4, 4]  DType: int8
+
+        A[0]
+        [[      -22     47      22      110     ]
+        [      88      6       -105    39      ]
+        [      -22     51      105     67      ]
+        [      -61     -116    60      -44     ]]
+        2-D array  Shape: [4, 4]  DType: int8
+
+        A[0, 1]
+        [       88      6       -105    39      ]
+        1-D array  Shape: [4]  DType: int8
+
+        A[0, 1, 2]
+        -105
+        0-D array  Shape: [0]  DType: int8
+
+        A[Slice(1,3)]
+        [[[     33      65      125     -35     ]
+        [     -65     123     57      64      ]
+        [     38      -110    33      98      ]
+        [     -59     -17     68      -6      ]]
+        [[     -68     -58     -37     -86     ]
+        [     -4      101     104     -113    ]
+        [     103     1       4       -47     ]
+        [     124     -2      -60     -105    ]]]
+        3-D array  Shape: [2, 4, 4]  DType: int8
+
+        A[1, Slice(2,4)]
+        [[      38      -110    33      98      ]
+        [      -59     -17     68      -6      ]]
+        2-D array  Shape: [2, 4]  DType: int8
+
+        A[Slice(1,3), Slice(1,3), 2]
+        [[      57      33      ]
+        [      104     4       ]]
+        2-D array  Shape: [2, 2]  DType: int8
+
+        A.item(0,1,2) as Scalar
+        -105
         ```
+
+        Args:
+            slices: A series of either Slice or Int.
+
+        Returns:
+            An ndarray with a smaller or equal dimension of the original one.
         """
+
         var n_slices: Int = slices.__len__()
         if n_slices > self.ndim:
-            raise Error("Error: No of slices greater than rank of array")
+            raise Error(
+                String(
+                    "Error: number of slices {} \n"
+                    "is greater than number of dimension of array {}!"
+                ).format(n_slices, self.ndim)
+            )
         var slice_list: List[Slice] = List[Slice]()
 
-        var count_int = 0
+        var count_int = 0  # Count the number of Int in the argument
+
         for i in range(len(slices)):
             if slices[i].isa[Slice]():
                 slice_list.append(slices[i]._get_ptr[Slice]()[0])
@@ -684,927 +785,1291 @@ struct NDArray[dtype: DType = DType.float64](
                 var size_at_dim: Int = self.shape[i]
                 slice_list.append(Slice(0, size_at_dim, 1))
 
-        self.__setitem__(slices=slice_list, val=val)
+        var narr: Self = self.__getitem__(slice_list)
 
-    # TODO: fix this setter, add bound checks. Not sure about it's use case.
-    fn __setitem__(self, index: NDArray[DType.index], val: NDArray) raises:
+        # Number of ints equals to nidm, it returns a 0-D array.
+        if count_int == self.ndim:
+            narr = creation._0darray[dtype](narr._buf.ptr[])
+
+        return narr
+
+    fn __getitem__(self, indices: NDArray[DType.index]) raises -> Self:
         """
-        Returns the items of the array from an array of indices.
+        Get items from 0-th dimension of an ndarray of indices.
+        If the original array is of shape (i,j,k) and
+        the indices array is of shape (l,m,n), then the output array
+        will be of shape (l,m,n,j,k).
 
         Args:
-            index: Array of indices.
-            val: Value to set.
+            indices: Array of indices.
+
+        Returns:
+            NDArray with items from the array of indices.
 
         Example:
         ```console
-        > var X = nm.NDArray[nm.i8](3,random=True)
-        > print(X)
-        [       32      21      53      ]
-        1-D array  Shape: [3]  DType: int8
-        > print(X.argsort())
-        [       1       0       2       ]
-        1-D array  Shape: [3]  DType: index
-        > print(X[X.argsort()])
-        [       21      32      53      ]
-        1-D array  Shape: [3]  DType: int8
-        ```
-        """
-
-        for i in range(len(index)):
-            self.store(Int(index.load(i)), rebind[Scalar[dtype]](val.load(i)))
+        >>>var a = nm.arange[i8](6)
+        >>>print(a)
+        [       0       1       2       3       4       5       ]
+        1-D array  Shape: [6]  DType: int8  C-cont: True  F-cont: True  own data: True
+        >>>print(a[nm.array[isize]("[4, 2, 5, 1, 0, 2]")])
+        [       4       2       5       1       0       2       ]
+        1-D array  Shape: [6]  DType: int8  C-cont: True  F-cont: True  own data: True
 
-    fn __setitem__(
-        mut self, mask: NDArray[DType.bool], val: NDArray[dtype]
-    ) raises:
+        var b = nm.arange[i8](12).reshape(Shape(2, 2, 3))
+        print(b)
+        [[[     0       1       2       ]
+          [     3       4       5       ]]
+         [[     6       7       8       ]
+          [     9       10      11      ]]]
+        3-D array  Shape: [2, 2, 3]  DType: int8  C-cont: True  F-cont: False  own data: True
+        print(b[nm.array[isize]("[2, 0, 1]")])
+        [[[     0       0       0       ]
+          [     0       67      95      ]]
+         [[     0       1       2       ]
+          [     3       4       5       ]]
+         [[     6       7       8       ]
+          [     9       10      11      ]]]
+        3-D array  Shape: [3, 2, 3]  DType: int8  C-cont: True  F-cont: False  own data: True
+        ```.
         """
-        Set the value of the array at the indices where the mask is true.
 
-        Args:
-            mask: Boolean mask array.
-            val: Value to set.
+        # Get the shape of resulted array
+        var shape = NDArrayShape.join(indices.shape, self.shape._pop(0))
 
-        Example:
-        ```
-        var A = numojo.core.NDArray[numojo.i16](6, random=True)
-        var mask = A > 0
-        print(A)
-        print(mask)
-        A[mask] = 0
-        print(A)
-        ```
-        """
-        if (
-            mask.shape != self.shape
-        ):  # this behavious could be removed potentially
-            var message = String(
-                "Shape of mask does not match the shape of array."
-            )
-            raise Error(message)
+        var result = NDArray[dtype](shape)
+        var size_per_item = self.size // self.shape[0]
 
-        for i in range(mask.size):
-            if mask._buf.ptr.load(i):
-                self._buf.ptr.store(i, val._buf.ptr.load(i))
+        # Fill in the values
+        for i in range(indices.size):
+            if indices.item(i) >= self.shape[0]:
+                raise Error(
+                    String(
+                        "index {} with value {} is out of boundary [0, {})"
+                    ).format(i, indices.item(i), self.shape[0])
+                )
+            memcpy(
+                result._buf.ptr + i * size_per_item,
+                self._buf.ptr + indices.item(i) * size_per_item,
+                size_per_item,
+            )
 
-    # ===-------------------------------------------------------------------===#
-    # Getter dunders and other getter methods
-    #
-    # INDEXING: to get a scalar from array.
-    # fn _getitem(self, *indices: Int) -> Scalar[dtype]
-    # fn __getitem__(self, index: Item) raises -> SIMD[dtype, 1]
-    #
-    # SLICING: to get a slice of array.
-    # fn __getitem__(self, idx: Int) raises -> Self
-    # fn __getitem__(self, *slices: Slice) raises -> Self
-    # fn __getitem__(self, slice_list: List[Slice]) raises -> Self
-    # fn __getitem__(self, *slices: Variant[Slice, Int]) raises -> Self
-    #
-    # SLICING: to get a slice of array from index or mask.
-    # fn __getitem__(self, index: List[Int]) raises -> Self
-    # fn __getitem__(self, index: NDArray[index]) raises -> Self
-    # fn __getitem__(self, mask: List[Bool]) raises -> Self
-    # fn __getitem__(self, mask: NDArray[bool]) raises -> Self
-    # ===-------------------------------------------------------------------===#
+        return result
 
-    fn _getitem(self, *indices: Int) -> Scalar[dtype]:
+    fn __getitem__(self, indices: List[Int]) raises -> Self:
+        # TODO: Use trait IntLike when it is supported by Mojo.
         """
-        (UNSAFE! for internal use only.)
-        Get item at indices and bypass all boundary checks.
+        Get items from 0-th dimension of an array. It is an overload of
+        `__getitem__(self, indices: NDArray[DType.index]) raises -> Self`.
 
         Args:
-            indices: Indices to get the value.
+            indices: A list of Int.
 
-        Example:
-        ```mojo
-        import numojo
-        var A = numojo.ones(numojo.Shape(2,3,4))
-        print(A._getitem(1,2,3))
-        ```
-        """
-        var index_of_buffer: Int = 0
-        for i in range(self.ndim):
-            index_of_buffer += indices[i] * self.strides._buf[i]
-        return self._buf.ptr[index_of_buffer]
+        Returns:
+            NDArray with items from the list of indices.
 
-    fn __getitem__(self) raises -> SIMD[dtype, 1]:
+        Example:
+        ```console
+        >>>var a = nm.arange[i8](6)
+        >>>print(a)
+        [       0       1       2       3       4       5       ]
+        1-D array  Shape: [6]  DType: int8  C-cont: True  F-cont: True  own data: True
+        >>>print(a[List[Int](4, 2, 5, 1, 0, 2)])
+        [       4       2       5       1       0       2       ]
+        1-D array  Shape: [6]  DType: int8  C-cont: True  F-cont: True  own data: True
+
+        var b = nm.arange[i8](12).reshape(Shape(2, 2, 3))
+        print(b)
+        [[[     0       1       2       ]
+          [     3       4       5       ]]
+         [[     6       7       8       ]
+          [     9       10      11      ]]]
+        3-D array  Shape: [2, 2, 3]  DType: int8  C-cont: True  F-cont: False  own data: True
+        print(b[List[Int](2, 0, 1)])
+        [[[     0       0       0       ]
+          [     0       67      95      ]]
+         [[     0       1       2       ]
+          [     3       4       5       ]]
+         [[     6       7       8       ]
+          [     9       10      11      ]]]
+        3-D array  Shape: [3, 2, 3]  DType: int8  C-cont: True  F-cont: False  own data: True
+        ```.
         """
-        Gets the value of the 0darray.
 
-        Raises:
-            Error: If the array is not 0-d.
+        var indices_array = NDArray[DType.index](shape=Shape(len(indices)))
+        for i in range(len(indices)):
+            (indices_array._buf.ptr + i).init_pointee_copy(indices[i])
 
-        Returns:
-            The value of the 0darray.
-        """
-        if self.ndim != 0:
-            raise Error(
-                "Error in `numojo.NDArray.__getitem__()`: \n"
-                "Cannot get value without index.\n"
-            )
-        return self._buf.ptr[]
+        return self[indices_array]
 
-    fn __getitem__(self, index: Item) raises -> SIMD[dtype, 1]:
+    fn __getitem__(self, mask: NDArray[DType.bool]) raises -> Self:
+        # TODO: Extend the mask into multiple dimensions.
         """
-        Get the value at the index list.
+        Get item from an array according to a mask array.
+        If array shape is equal to mask shape, it returns a flattened array of
+        the values where mask is True.
+        If array shape is not equal to mask shape, it returns items from the
+        0-th dimension of the array where mask is True.
 
         Args:
-            index: Index list.
+            mask: NDArray with Dtype.bool.
 
         Returns:
-            The value at the index list.
-        """
-        if index.__len__() != self.ndim:
-            var message = String(
-                "Error: Length of `index` do not match the number of"
-                " dimensions!\n"
-                "Length of indices is {}.\n"
-                "The number of dimensions is {}."
-            ).format(index.__len__(), self.ndim)
-            raise Error(message)
+            NDArray with items from the mask.
 
-        for i in range(index.__len__()):
-            if index[i] >= self.shape[i]:
-                var message = String(
-                    "Error: `index` exceeds the size!\n"
-                    "For {}-the mension:\n"
-                    "The index is {}.\n"
-                    "The size of the dimensions is {}"
-                ).format(i, index[i], self.shape[i])
-                raise Error(message)
-        var idx: Int = _get_offset(index, self.strides)
-        return self._buf.ptr.load[width=1](idx)
+        Example:
+        ```console
+        >>>var a = nm.arange[i8](6)
+        >>>print(a)
+        [       0       1       2       3       4       5       ]
+        1-D array  Shape: [6]  DType: int8  C-cont: True  F-cont: True  own data: True
+        >>>print(a[nm.array[boolean]("[1,0,1,1,0,1]")])
+        [       0       2       3       5       ]
+        1-D array  Shape: [4]  DType: int8  C-cont: True  F-cont: True  own data: True
 
-    fn __getitem__(self, idx: Int) raises -> Self:
+        var b = nm.arange[i8](12).reshape(Shape(2, 2, 3))
+        print(b)
+        [[[     0       1       2       ]
+          [     3       4       5       ]]
+         [[     6       7       8       ]
+          [     9       10      11      ]]]
+        3-D array  Shape: [2, 2, 3]  DType: int8  C-cont: True  F-cont: False  own data: True
+        >>>print(b[nm.array[boolean]("[0,1]")])
+        [[[     6       7       8       ]
+          [     9       10      11      ]]]
+        3-D array  Shape: [1, 2, 3]  DType: int8  C-cont: True  F-cont: True  own data: True
+        ```.
         """
-        Retrieve a slice of the array corresponding to the index at the first dimension.
 
-        Args:
-            idx: Index to get the slice.
+        # CASE 1:
+        # if array shape is equal to mask shape,
+        # return a flattened array of the values where mask is True
+        if mask.shape == self.shape:
+            var len_of_result = 0
 
-        Returns:
-            A slice of the array.
+            # Count number of True
+            for i in range(mask.size):
+                if mask.item(i):
+                    len_of_result += 1
 
-        Raises:
-            Error: If the array is 0-d.
+            # Change the first number of the ndshape
+            var result = NDArray[dtype](shape=NDArrayShape(len_of_result))
 
-        Example:
-            `arr[1]` returns the second row of the array.
-        """
+            # Fill in the values
+            var offset = 0
+            for i in range(mask.size):
+                if mask.item(i):
+                    (result._buf.ptr + offset).init_pointee_copy(
+                        self._buf.ptr[i]
+                    )
+                    offset += 1
 
-        var slice_list = List[Slice]()
-        slice_list.append(Slice(idx, idx + 1, 1))
+            return result
 
-        # If the ndim is 0, then it is a numojo scalar (0darray).
-        if self.ndim == 0:
+        # CASE 2:
+        # if array shape is not equal to mask shape,
+        # return items from the 0-th dimension of the array where mask is True
+        if mask.ndim > 1:
+            raise Error(String("Currently we only support 1-d mask array."))
+
+        if mask.shape[0] != self.shape[0]:
             raise Error(
-                "Error in `numojo.NDArray.__getitem__(self, idx: Int)`: \n"
-                "Cannot slice a 0-d array.\n"
+                String(
+                    "Shape 0 of mask ({}) does not match that of array ({})."
+                ).format(mask.shape[0], self.shape[0])
             )
 
-        var narr: Self
+        var len_of_result = 0
 
-        # If the ndim is 1
-        if self.ndim == 1:
-            narr = creation._0darray[dtype](self._buf.ptr[idx])
+        # Count number of True
+        for i in range(mask.size):
+            if mask.item(i):
+                len_of_result += 1
 
-        else:
-            for i in range(1, self.ndim):
-                var size_at_dim: Int = self.shape[i]
-                slice_list.append(Slice(0, size_at_dim, 1))
+        # Change the first number of the ndshape
+        var shape = self.shape
+        shape._buf[0] = len_of_result
 
-            narr = self.__getitem__(slice_list)
+        var result = NDArray[dtype](shape)
+        var size_per_item = self.size // self.shape[0]
 
-        return narr
+        # Fill in the values
+        var offset = 0
+        for i in range(mask.size):
+            if mask.item(i):
+                memcpy(
+                    result._buf.ptr + offset * size_per_item,
+                    self._buf.ptr + i * size_per_item,
+                    size_per_item,
+                )
+                offset += 1
 
-    fn __getitem__(self, owned *slices: Slice) raises -> Self:
+        return result
+
+    fn __getitem__(self, mask: List[Bool]) raises -> Self:
         """
-        Retrieve slices of an array from variadic slices.
+        Get items from 0-th dimension of an array according to mask.
+        __getitem__(self, mask: NDArray[DType.bool]) raises -> Self.
 
         Args:
-            slices: Variadic slices.
+            mask: A list of boolean values.
 
         Returns:
-            A slice of the array.
+            NDArray with items from the mask.
 
         Example:
-            `arr[1:3, 2:4]` returns the corresponding sliced array (2 x 2).
-        """
+        ```console
+        >>>var a = nm.arange[i8](6)
+        >>>print(a)
+        [       0       1       2       3       4       5       ]
+        1-D array  Shape: [6]  DType: int8  C-cont: True  F-cont: True  own data: True
+        >>>print(a[List[Bool](True, False, True, True, False, True)])
+        [       0       2       3       5       ]
+        1-D array  Shape: [4]  DType: int8  C-cont: True  F-cont: True  own data: True
 
-        var n_slices: Int = slices.__len__()
-        if n_slices > self.ndim:
-            raise Error("Error: No of slices exceed the array dimensions.")
-        var slice_list: List[Slice] = List[Slice]()
-        for i in range(len(slices)):
-            slice_list.append(slices[i])
+        var b = nm.arange[i8](12).reshape(Shape(2, 2, 3))
+        print(b)
+        [[[     0       1       2       ]
+          [     3       4       5       ]]
+         [[     6       7       8       ]
+          [     9       10      11      ]]]
+        3-D array  Shape: [2, 2, 3]  DType: int8  C-cont: True  F-cont: False  own data: True
+        >>>print(b[List[Bool](False, True)])
+        [[[     6       7       8       ]
+          [     9       10      11      ]]]
+        3-D array  Shape: [1, 2, 3]  DType: int8  C-cont: True  F-cont: True  own data: True
+        ```.
+        """
 
-        if n_slices < self.ndim:
-            for i in range(n_slices, self.ndim):
-                slice_list.append(Slice(0, self.shape[i], 1))
+        var mask_array = NDArray[DType.bool](shape=Shape(len(mask)))
+        for i in range(len(mask)):
+            (mask_array._buf.ptr + i).init_pointee_copy(mask[i])
 
-        var narr: Self = self[slice_list]
-        return narr
+        return self[mask_array]
 
-    fn __getitem__(self, owned slice_list: List[Slice]) raises -> Self:
+    fn item(
+        self, owned index: Int
+    ) raises -> ref [self._buf.ptr.origin, self._buf.ptr.address_space] Scalar[
+        dtype
+    ]:
         """
-        Retrieve slices of an array from list of slices.
+        Return the scalar at the coordinates.
+        If one index is given, get the i-th item of the array (not buffer).
+        It first scans over the first row, even it is a colume-major array.
+        If more than one index is given, the length of the indices must match
+        the number of dimensions of the array.
+        If the ndim is 0 (0darray), get the value as a mojo scalar.
 
         Args:
-            slice_list: List of slices.
+            index: Index of item, counted in row-major way.
 
         Returns:
-            A slice of the array.
-
-        Example:
-            `arr[1:3, 2:4]` returns the corresponding sliced array (2 x 2).
-        """
-
-        var n_slices: Int = slice_list.__len__()
-        if n_slices > self.ndim or n_slices < self.ndim:
-            raise Error("Error: No of slices do not match shape")
+            A scalar matching the dtype of the array.
 
-        var ndims: Int = 0
-        var spec: List[Int] = List[Int]()
-        var count: Int = 0
+        Raises:
+            Error if array is 0darray (numojo scalar).
+            Error if index is equal or larger than array size.
 
-        var slices: List[Slice] = self._adjust_slice(slice_list)
-        for i in range(slices.__len__()):
-            if (
-                slices[i].start.value() >= self.shape[i]
-                or slices[i].end.value() > self.shape[i]
-            ):
-                raise Error("Error: Slice value exceeds the array shape")
-            var slice_len: Int = len(
-                range(
-                    slices[i].start.value(),
-                    slices[i].end.value(),
-                    slices[i].step.or_else(1),
-                )
-            )
-            spec.append(slice_len)
-            if slice_len != 1:
-                ndims += 1
-            else:
-                count += 1
-        if count == slices.__len__():
-            ndims = 1
-
-        var nshape: List[Int] = List[Int]()
-        var ncoefficients: List[Int] = List[Int]()
-        var nstrides: List[Int] = List[Int]()
-        var nnum_elements: Int = 1
+        Example:
+        ```console
+        >>> var A = nm.random.randn[nm.f16](2, 2, 2)
+        >>> A = A.reshape(A.shape, order="F")
+        >>> print(A)
+        [[[     0.2446289       0.5419922       ]
+          [     0.09643555      -0.90722656     ]]
+         [[     1.1806641       0.24389648      ]
+          [     0.5234375       1.0390625       ]]]
+        3-D array  Shape: [2, 2, 2]  DType: float16  order: F
+        >>> for i in range(A.size):
+        ...     print(A.item(i))
+        0.2446289
+        0.5419922
+        0.09643555
+        -0.90722656
+        1.1806641
+        0.24389648
+        0.5234375
+        1.0390625
+        >>> print(A.item(0, 1, 1))
+        -0.90722656
+        ```.
+        """
 
-        var j: Int = 0
-        count = 0
-        for _ in range(ndims):
-            while spec[j] == 1:
-                count += 1
-                j += 1
-            if j >= self.ndim:
-                break
-            var slice_len: Int = len(
-                range(
-                    slices[j].start.value(),
-                    slices[j].end.value(),
-                    slices[j].step.or_else(1),
+        # For 0darray, raise error
+        if self.ndim == 0:
+            raise Error(
+                String(
+                    "\nError in `numojo.NDArray.item(index: Int)`: "
+                    "Cannot index a 0darray (numojo scalar). "
+                    "Use `a.item()` without arguments."
                 )
             )
-            nshape.append(slice_len)
-            nnum_elements *= slice_len
-            ncoefficients.append(self.strides[j] * slices[j].step.value())
-            j += 1
 
-        if count == slices.__len__():
-            nshape.append(1)
-            nnum_elements = 1
-            ncoefficients.append(1)
+        if index < 0:
+            index += self.size
 
-        var noffset: Int = 0
+        if (index < 0) or (index >= self.size):
+            raise Error(
+                String(
+                    "\nError in `numojo.NDArray.item(index: Int)`:"
+                    "`index` exceeds array size ({})"
+                ).format(self.size)
+            )
 
-        if self.flags.C_CONTIGUOUS:
-            noffset = 0
-            for i in range(ndims):
-                var temp_stride: Int = 1
-                for j in range(i + 1, ndims):  # temp
-                    temp_stride *= nshape[j]
-                nstrides.append(temp_stride)
-            for i in range(slices.__len__()):
-                noffset += slices[i].start.value() * self.strides[i]
+        if self.flags.F_CONTIGUOUS:
+            return (self._buf.ptr + _transfer_offset(index, self.strides))[]
 
-        elif self.flags.F_CONTIGUOUS:
-            noffset = 0
-            nstrides.append(1)
-            for i in range(0, ndims - 1):
-                nstrides.append(nstrides[i] * nshape[i])
-            for i in range(slices.__len__()):
-                noffset += slices[i].start.value() * self.strides[i]
+        else:
+            return (self._buf.ptr + index)[]
 
-        var narr = Self(
-            offset=noffset,
-            shape=nshape,
-            strides=nstrides,
-        )
+    fn item(
+        self, *index: Int
+    ) raises -> ref [self._buf.ptr.origin, self._buf.ptr.address_space] Scalar[
+        dtype
+    ]:
+        """
+        Return the scalar at the coordinates.
+        If one index is given, get the i-th item of the array (not buffer).
+        It first scans over the first row, even it is a colume-major array.
+        If more than one index is given, the length of the indices must match
+        the number of dimensions of the array.
+        For 0darray (numojo scalar), return the scalar value.
 
-        var index = List[Int]()
-        for _ in range(ndims):
-            index.append(0)
+        Args:
+            index: The coordinates of the item.
 
-        _traverse_iterative[dtype](
-            self, narr, nshape, ncoefficients, nstrides, noffset, index, 0
-        )
+        Returns:
+            A scalar matching the dtype of the array.
 
-        return narr
+        Raises:
+            Index is equal or larger than size of dimension.
 
-    fn __getitem__(self, owned *slices: Variant[Slice, Int]) raises -> Self:
+        Example:
+        ```
+        >>> var A = nm.random.randn[nm.f16](2, 2, 2)
+        >>> A = A.reshape(A.shape, order="F")
+        >>> print(A)
+        [[[     0.2446289       0.5419922       ]
+          [     0.09643555      -0.90722656     ]]
+         [[     1.1806641       0.24389648      ]
+          [     0.5234375       1.0390625       ]]]
+        3-D array  Shape: [2, 2, 2]  DType: float16  order: F
+        >>> print(A.item(0, 1, 1))
+        -0.90722656
+        ```.
         """
-        Get items by a series of either slices or integers.
 
-        A decrease of dimensions may or may not happen when `__getitem__` is
-        called on an ndarray. An ndarray of X-D array can become Y-D array after
-        `__getitem__` where `Y <= X`.
+        if len(index) != self.ndim:
+            raise Error(
+                String(
+                    "\nError in `numojo.NDArray.item(*index: Int)`:"
+                    "Number of indices ({}) do not match ndim ({})"
+                ).format(len(index), self.ndim)
+            )
 
-        Whether the dimension decerases or not depends on:
-        1. What types of arguments are passed into `__getitem__`.
-        2. The number of arguments that are passed in `__getitem__`.
+        # For 0darray, return the scalar value.
+        if self.ndim == 0:
+            return self._buf.ptr[]
 
-        PRINCIPAL: The number of dimensions to be decreased is determined by
-        the number of `Int` passed in `__getitem__`.
+        var list_index = List[Int]()
+        for i in range(len(index)):
+            if index[i] < 0:
+                list_index.append(index[i] + self.shape[i])
+            else:
+                list_index.append(index[i])
+            if (list_index[i] < 0) or (list_index[i] >= self.shape[i]):
+                raise Error(
+                    String("{}-th index exceeds shape size {}").format(
+                        i, self.shape[i]
+                    )
+                )
+        return (self._buf.ptr + _get_offset(index, self.strides))[]
 
-        For example, `A` is a 10x10x10 ndarray (3-D). Then,
+    fn load(self, owned index: Int) raises -> Scalar[dtype]:
+        """
+        Safely retrieve i-th item from the underlying buffer.
 
-        - `A[1, 2, 3]` leads to a 0-D array (scalar), since there are 3 integers.
-        - `A[1, 2]` leads to a 1-D array (vector), since there are 2 integers,
-        so the dimension decreases by 2.
-        - `A[1]` leads to a 2-D array (matrix), since there is 1 integer, so the
-        dimension decreases by 1.
+        `A.load(i)` differs from `A._buf.ptr[i]` due to boundary check.
 
-        The number of dimensions will not decrease when Slice is passed in
-        `__getitem__` or no argument is passed in for a certain dimension
-        (it is an implicit slide and a slide of all items will be used).
+        Args:
+            index: Index of the item.
 
-        Take the same example `A` with 10x10x10 in shape. Then,
+        Returns:
+            The value at the index.
 
-        - `A[1:4, 2:5, 3:6]`, leads to a 3-D array (no decrease in dimension),
-        since there are 3 slices.
-        - `A[2:8]`, leads to a 3-D array (no decrease in dimension), since there
-        are 1 explicit slice and 2 implicit slices.
+        Example:
+        ```console
+        > array.load(15)
+        ```
+        returns the item of index 15 from the array's data buffer.
 
-        When there is a mixture of int and slices passed into `__getitem__`,
-        the number of integers will be the number of dimensions to be decreased.
-        Example,
+        Note that it does not checked against C-order or F-order.
+        ```console
+        > # A is a 3x3 matrix, F-order (column-major)
+        > A.load(3)  # Row 0, Col 1
+        > A.item(3)  # Row 1, Col 0
+        ```.
+        """
 
-        - `A[1:4, 2, 2]`, leads to a 1-D array (vector), since there are 2
-        integers, so the dimension decreases by 2.
+        if index < 0:
+            index += self.size
 
-        Note that, even though a slice contains one row, it does not reduce the
-        dimensions. Example,
+        if (index >= self.size) or (index < 0):
+            raise Error(
+                String("Invalid index: index out of bound [0, {}).").format(
+                    self.size
+                )
+            )
 
-        - `A[1:2, 2:3, 3:4]`, leads to a 3-D array (no decrease in dimension),
-        since there are 3 slices.
+        return self._buf.ptr[index]
 
-        Note that, when the number of integers equals to the number of
-        dimensions, the final outcome is an 0-D array instead of a number.
-        The user has to upack the 0-D array with the method`A.item(0)` to get the
-        corresponding number.
-        This behavior is different from numpy where the latter returns a number.
+    fn load[width: Int = 1](self, index: Int) raises -> SIMD[dtype, width]:
+        """
+        Safely loads a SIMD element of size `width` at `index`
+        from the underlying buffer.
 
-        More examples for 1-D, 2-D, and 3-D arrays.
+        To bypass boundary checks, use `self._buf.ptr.load` directly.
 
-        ```console
-        A is a matrix
-        [[      -128    -95     65      -11     ]
-        [      8       -72     -116    45      ]
-        [      45      111     -30     4       ]
-        [      84      -120    -115    7       ]]
-        2-D array  Shape: [4, 4]  DType: int8
+        Args:
+            index: Index of the item.
 
-        A[0]
-        [       -128    -95     65      -11     ]
-        1-D array  Shape: [4]  DType: int8
-
-        A[0, 1]
-        -95
-        0-D array  Shape: [0]  DType: int8
-
-        A[Slice(1,3)]
-        [[      8       -72     -116    45      ]
-        [      45      111     -30     4       ]]
-        2-D array  Shape: [2, 4]  DType: int8
-
-        A[1, Slice(2,4)]
-        [       -116    45      ]
-        1-D array  Shape: [2]  DType: int8
-
-        A[Slice(1,3), Slice(1,3)]
-        [[      -72     -116    ]
-        [      111     -30     ]]
-        2-D array  Shape: [2, 2]  DType: int8
-
-        A.item(0,1) as Scalar
-        -95
-
-        ==============================
-        A is a vector
-        [       43      -127    -30     -111    ]
-        1-D array  Shape: [4]  DType: int8
-
-        A[0]
-        43
-        0-D array  Shape: [0]  DType: int8
-
-        A[Slice(1,3)]
-        [       -127    -30     ]
-        1-D array  Shape: [2]  DType: int8
-
-        A.item(0) as Scalar
-        43
-
-        ==============================
-        A is a 3darray
-        [[[     -22     47      22      110     ]
-        [     88      6       -105    39      ]
-        [     -22     51      105     67      ]
-        [     -61     -116    60      -44     ]]
-        [[     33      65      125     -35     ]
-        [     -65     123     57      64      ]
-        [     38      -110    33      98      ]
-        [     -59     -17     68      -6      ]]
-        [[     -68     -58     -37     -86     ]
-        [     -4      101     104     -113    ]
-        [     103     1       4       -47     ]
-        [     124     -2      -60     -105    ]]
-        [[     114     -110    0       -30     ]
-        [     -58     105     7       -10     ]
-        [     112     -116    66      69      ]
-        [     83      -96     -124    48      ]]]
-        3-D array  Shape: [4, 4, 4]  DType: int8
-
-        A[0]
-        [[      -22     47      22      110     ]
-        [      88      6       -105    39      ]
-        [      -22     51      105     67      ]
-        [      -61     -116    60      -44     ]]
-        2-D array  Shape: [4, 4]  DType: int8
-
-        A[0, 1]
-        [       88      6       -105    39      ]
-        1-D array  Shape: [4]  DType: int8
+        Returns:
+            The SIMD element at the index.
 
-        A[0, 1, 2]
-        -105
-        0-D array  Shape: [0]  DType: int8
+        Raises:
+            Index out of boundary.
+        """
 
-        A[Slice(1,3)]
-        [[[     33      65      125     -35     ]
-        [     -65     123     57      64      ]
-        [     38      -110    33      98      ]
-        [     -59     -17     68      -6      ]]
-        [[     -68     -58     -37     -86     ]
-        [     -4      101     104     -113    ]
-        [     103     1       4       -47     ]
-        [     124     -2      -60     -105    ]]]
-        3-D array  Shape: [2, 4, 4]  DType: int8
+        if (index < 0) or (index >= self.size):
+            raise Error(
+                String("Invalid index: index out of bound [0, {}).").format(
+                    self.size
+                )
+            )
 
-        A[1, Slice(2,4)]
-        [[      38      -110    33      98      ]
-        [      -59     -17     68      -6      ]]
-        2-D array  Shape: [2, 4]  DType: int8
+        return self._buf.ptr.load[width=width](index)
 
-        A[Slice(1,3), Slice(1,3), 2]
-        [[      57      33      ]
-        [      104     4       ]]
-        2-D array  Shape: [2, 2]  DType: int8
+    fn load[width: Int = 1](self, *indices: Int) raises -> SIMD[dtype, width]:
+        """
+        Safely loads SIMD element of size `width` at given variadic indices
+        from the underlying buffer.
 
-        A.item(0,1,2) as Scalar
-        -105
-        ```
+        To bypass boundary checks, use `self._buf.ptr.load` directly.
 
         Args:
-            slices: A series of either Slice or Int.
+            indices: Variadic indices.
 
         Returns:
-            An ndarray with a smaller or equal dimension of the original one.
+            The SIMD element at the indices.
+
+        Raises:
+            Index out of boundary.
         """
 
-        var n_slices: Int = slices.__len__()
-        if n_slices > self.ndim:
-            raise Error(
+        if len(indices) != self.ndim:
+            raise (
                 String(
-                    "Error: number of slices {} \n"
-                    "is greater than number of dimension of array {}!"
-                ).format(n_slices, self.ndim)
+                    "Length of indices {} does not match ndim {}".format(
+                        len(indices), self.ndim
+                    )
+                )
             )
-        var slice_list: List[Slice] = List[Slice]()
 
-        var count_int = 0  # Count the number of Int in the argument
+        for i in range(self.ndim):
+            if (indices[i] < 0) or (indices[i] >= self.shape[i]):
+                raise Error(
+                    String(
+                        "Invalid index at {}-th dim: "
+                        "index out of bound [0, {})."
+                    ).format(i, self.shape[i])
+                )
 
-        for i in range(len(slices)):
-            if slices[i].isa[Slice]():
-                slice_list.append(slices[i]._get_ptr[Slice]()[0])
-            elif slices[i].isa[Int]():
-                count_int += 1
-                var int: Int = slices[i]._get_ptr[Int]()[0]
-                slice_list.append(Slice(int, int + 1, 1))
+        var idx: Int = _get_offset(indices, self.strides)
+        return self._buf.ptr.load[width=width](idx)
 
-        if n_slices < self.ndim:
-            for i in range(n_slices, self.ndim):
-                var size_at_dim: Int = self.shape[i]
-                slice_list.append(Slice(0, size_at_dim, 1))
+    # ===-------------------------------------------------------------------===#
+    # Setter dunders and other setter methods
+    #
+    # Basic Setter Methods
+    # fn _setitem(self, *indices: Int, val: Scalar[dtype])                      # Direct unsafe setter
+    # fn __setitem__(mut self, idx: Int, val: Self) raises                      # Set by single index
+    # fn __setitem__(mut self, index: Item, val: Scalar[dtype]) raises          # Set by coordinate list
+    # fn __setitem__(mut self, mask: NDArray[DType.bool], value: Scalar[dtype]) # Set by boolean mask
+
+    # Slice-based Setters
+    # fn __setitem__(mut self, *slices: Slice, val: Self) raises                # Set by variable slices
+    # fn __setitem__(mut self, slices: List[Slice], val: Self) raises           # Set by list of slices
+    # fn __setitem__(mut self, *slices: Variant[Slice, Int], val: Self) raises  # Set by mix of slices/ints
+
+    # Index-based Setters
+    # fn __setitem__(self, indices: NDArray[DType.index], val: NDArray) raises  # Set by index array
+    # fn __setitem__(mut self, mask: NDArray[DType.bool], val: NDArray[dtype])  # Set by boolean mask array
+
+    # Helper Methods
+    # fn itemset(mut self, index: Variant[Int, List[Int]], item: Scalar[dtype]) # Set single item
+    # fn store(self, owned index: Int, val: Scalar[dtype]) raises               # Store with bounds checking
+    # fn store[width: Int](mut self, index: Int, val: SIMD[dtype, width])       # Store SIMD value
+    # fn store[width: Int = 1](mut self, *indices: Int, val: SIMD[dtype, width])# Store SIMD at coordinates
+    # ===-------------------------------------------------------------------===#
 
-        var narr: Self = self.__getitem__(slice_list)
+    fn _setitem(self, *indices: Int, val: Scalar[dtype]):
+        """
+        (UNSAFE! for internal use only.)
+        Set item at indices and bypass all boundary checks.
 
-        # Number of ints equals to nidm, it returns a 0-D array.
-        if count_int == self.ndim:
-            narr = creation._0darray[dtype](narr._buf.ptr[])
+        Args:
+            indices: Indices to set the value.
+            val: Value to set.
 
-        return narr
+        Example:
+        ```mojo
+        import numojo
+        var A = numojo.ones(numojo.Shape(2,3,4))
+        A._setitem(1,2,3, val=10)
+        ```
+        """
+        var index_of_buffer: Int = 0
+        for i in range(self.ndim):
+            index_of_buffer += indices[i] * self.strides._buf[i]
+        self._buf.ptr[index_of_buffer] = val
 
-    fn __getitem__(self, indices: NDArray[DType.index]) raises -> Self:
+    fn __setitem__(mut self, idx: Int, val: Self) raises:
         """
-        Get items from 0-th dimension of an ndarray of indices.
-        If the original array is of shape (i,j,k) and
-        the indices array is of shape (l,m,n), then the output array
-        will be of shape (l,m,n,j,k).
+        Set a slice of array with given array.
 
         Args:
-            indices: Array of indices.
-
-        Returns:
-            NDArray with items from the array of indices.
+            idx: Index to set.
+            val: Value to set.
 
         Example:
-        ```console
-        >>>var a = nm.arange[i8](6)
-        >>>print(a)
-        [       0       1       2       3       4       5       ]
-        1-D array  Shape: [6]  DType: int8  C-cont: True  F-cont: True  own data: True
-        >>>print(a[nm.array[isize]("[4, 2, 5, 1, 0, 2]")])
-        [       4       2       5       1       0       2       ]
-        1-D array  Shape: [6]  DType: int8  C-cont: True  F-cont: True  own data: True
-
-        var b = nm.arange[i8](12).reshape(Shape(2, 2, 3))
-        print(b)
-        [[[     0       1       2       ]
-          [     3       4       5       ]]
-         [[     6       7       8       ]
-          [     9       10      11      ]]]
-        3-D array  Shape: [2, 2, 3]  DType: int8  C-cont: True  F-cont: False  own data: True
-        print(b[nm.array[isize]("[2, 0, 1]")])
-        [[[     0       0       0       ]
-          [     0       67      95      ]]
-         [[     0       1       2       ]
-          [     3       4       5       ]]
-         [[     6       7       8       ]
-          [     9       10      11      ]]]
-        3-D array  Shape: [3, 2, 3]  DType: int8  C-cont: True  F-cont: False  own data: True
-        ```.
+        ```mojo
+        import numojo as nm
+        var A = nm.random.rand[nm.i16](3, 2)
+        var B = nm.random.rand[nm.i16](3)
+        A[1:4] = B
+        ```
         """
 
-        # Get the shape of resulted array
-        var shape = NDArrayShape.join(indices.shape, self.shape._pop(0))
-
-        var result = NDArray[dtype](shape)
-        var size_per_item = self.size // self.shape[0]
+        var normalized_index = idx
+        if normalized_index < 0:
+            normalized_index = self.shape[0] + idx
+        if normalized_index >= self.shape[0]:
+            raise Error("Index out of bounds")
 
-        # Fill in the values
-        for i in range(indices.size):
-            if indices.item(i) >= self.shape[0]:
-                raise Error(
-                    String(
-                        "index {} with value {} is out of boundary [0, {})"
-                    ).format(i, indices.item(i), self.shape[0])
-                )
-            memcpy(
-                result._buf.ptr + i * size_per_item,
-                self._buf.ptr + indices.item(i) * size_per_item,
-                size_per_item,
+        # If the ndim is 0, then it is a numojo scalar (0darray).
+        # Not allow to set value to 0darray.
+        if self.ndim == 0 or val.ndim == 0:
+            raise Error(
+                "Error in `numojo.NDArray.__setitem__("
+                "self, idx: Int, val: Self)`: \n"
+                "Cannot set values to a 0-d array.\n"
             )
 
-        return result
-
-    fn __getitem__(self, indices: List[Int]) raises -> Self:
-        # TODO: Use trait IntLike when it is supported by Mojo.
-        """
-        Get items from 0-th dimension of an array. It is an overload of
-        `__getitem__(self, indices: NDArray[DType.index]) raises -> Self`.
+        var slice_list = List[Slice]()
+        if idx >= self.shape[0]:
+            var message = String(
+                "Error: Slice value exceeds the array shape!\n"
+                "The {}-th dimension is of size {}.\n"
+                "The slice goes from {} to {}"
+            ).format(
+                0,
+                self.shape[0],
+                idx,
+                idx + 1,
+            )
+            raise Error(message)
+        slice_list.append(Slice(idx, idx + 1, 1))
+        if self.ndim > 1:
+            for i in range(1, self.ndim):
+                var size_at_dim: Int = self.shape[i]
+                slice_list.append(Slice(0, size_at_dim, 1))
 
-        Args:
-            indices: A list of Int.
+        var n_slices: Int = len(slice_list)
+        var ndims: Int = 0
+        var count: Int = 0
+        var spec: List[Int] = List[Int]()
+        for i in range(n_slices):
+            if slice_list[i].step is None:
+                raise Error(String("Step of slice is None."))
+            var slice_len: Int = (
+                (slice_list[i].end.value() - slice_list[i].start.value())
+                / slice_list[i].step.or_else(1)
+            ).__int__()
+            spec.append(slice_len)
+            if slice_len != 1:
+                ndims += 1
+            else:
+                count += 1
+        if count == slice_list.__len__():
+            ndims = 1
 
-        Returns:
-            NDArray with items from the list of indices.
+        var nshape: List[Int] = List[Int]()
+        var ncoefficients: List[Int] = List[Int]()
+        var nstrides: List[Int] = List[Int]()
+        var nnum_elements: Int = 1
 
-        Example:
-        ```console
-        >>>var a = nm.arange[i8](6)
-        >>>print(a)
-        [       0       1       2       3       4       5       ]
-        1-D array  Shape: [6]  DType: int8  C-cont: True  F-cont: True  own data: True
-        >>>print(a[List[Int](4, 2, 5, 1, 0, 2)])
-        [       4       2       5       1       0       2       ]
-        1-D array  Shape: [6]  DType: int8  C-cont: True  F-cont: True  own data: True
+        var j: Int = 0
+        count = 0
+        for _ in range(ndims):
+            while spec[j] == 1:
+                count += 1
+                j += 1
+            if j >= self.ndim:
+                break
+            var slice_len: Int = (
+                (slice_list[j].end.value() - slice_list[j].start.value())
+                / slice_list[j].step.or_else(1)
+            ).__int__()
+            nshape.append(slice_len)
+            nnum_elements *= slice_len
+            ncoefficients.append(
+                self.strides[j] * slice_list[j].step.or_else(1)
+            )
+            j += 1
 
-        var b = nm.arange[i8](12).reshape(Shape(2, 2, 3))
-        print(b)
-        [[[     0       1       2       ]
-          [     3       4       5       ]]
-         [[     6       7       8       ]
-          [     9       10      11      ]]]
-        3-D array  Shape: [2, 2, 3]  DType: int8  C-cont: True  F-cont: False  own data: True
-        print(b[List[Int](2, 0, 1)])
-        [[[     0       0       0       ]
-          [     0       67      95      ]]
-         [[     0       1       2       ]
-          [     3       4       5       ]]
-         [[     6       7       8       ]
-          [     9       10      11      ]]]
-        3-D array  Shape: [3, 2, 3]  DType: int8  C-cont: True  F-cont: False  own data: True
-        ```.
-        """
+        # TODO: We can remove this check after we have support for broadcasting
+        for i in range(ndims):
+            if nshape[i] != val.shape[i]:
+                var message = String(
+                    "Error: Shape mismatch!\n"
+                    "Cannot set the array values with given array.\n"
+                    "The {}-th dimension of the array is of shape {}.\n"
+                    "The {}-th dimension of the value is of shape {}."
+                ).format(nshape[i], val.shape[i])
+                raise Error(message)
 
-        var indices_array = NDArray[DType.index](shape=Shape(len(indices)))
-        for i in range(len(indices)):
-            (indices_array._buf.ptr + i).init_pointee_copy(indices[i])
+        var noffset: Int = 0
+        if self.flags.C_CONTIGUOUS:
+            noffset = 0
+            for i in range(ndims):
+                var temp_stride: Int = 1
+                for j in range(i + 1, ndims):
+                    temp_stride *= nshape[j]
+                nstrides.append(temp_stride)
+            for i in range(slice_list.__len__()):
+                noffset += slice_list[i].start.value() * self.strides[i]
+        elif self.flags.F_CONTIGUOUS:
+            noffset = 0
+            nstrides.append(1)
+            for i in range(0, ndims - 1):
+                nstrides.append(nstrides[i] * nshape[i])
+            for i in range(slice_list.__len__()):
+                noffset += slice_list[i].start.value() * self.strides[i]
 
-        return self[indices_array]
+        var index = List[Int]()
+        for _ in range(ndims):
+            index.append(0)
 
-    fn __getitem__(self, mask: NDArray[DType.bool]) raises -> Self:
-        # TODO: Extend the mask into multiple dimensions.
+        _traverse_iterative_setter[dtype](
+            val, self, nshape, ncoefficients, nstrides, noffset, index
+        )
+
+    fn __setitem__(mut self, index: Item, val: Scalar[dtype]) raises:
         """
-        Get item from an array according to a mask array.
-        If array shape is equal to mask shape, it returns a flattened array of
-        the values where mask is True.
-        If array shape is not equal to mask shape, it returns items from the
-        0-th dimension of the array where mask is True.
+        Set the value at the index list.
 
         Args:
-            mask: NDArray with Dtype.bool.
-
-        Returns:
-            NDArray with items from the mask.
-
-        Example:
-        ```console
-        >>>var a = nm.arange[i8](6)
-        >>>print(a)
-        [       0       1       2       3       4       5       ]
-        1-D array  Shape: [6]  DType: int8  C-cont: True  F-cont: True  own data: True
-        >>>print(a[nm.array[boolean]("[1,0,1,1,0,1]")])
-        [       0       2       3       5       ]
-        1-D array  Shape: [4]  DType: int8  C-cont: True  F-cont: True  own data: True
-
-        var b = nm.arange[i8](12).reshape(Shape(2, 2, 3))
-        print(b)
-        [[[     0       1       2       ]
-          [     3       4       5       ]]
-         [[     6       7       8       ]
-          [     9       10      11      ]]]
-        3-D array  Shape: [2, 2, 3]  DType: int8  C-cont: True  F-cont: False  own data: True
-        >>>print(b[nm.array[boolean]("[0,1]")])
-        [[[     6       7       8       ]
-          [     9       10      11      ]]]
-        3-D array  Shape: [1, 2, 3]  DType: int8  C-cont: True  F-cont: True  own data: True
-        ```.
+            index: Index list.
+            val: Value to set.
         """
+        if index.__len__() != self.ndim:
+            var message = String(
+                "Error: Length of `index` does not match the number of"
+                " dimensions!\n"
+                "Length of indices is {}.\n"
+                "The array dimension is {}."
+            ).format(index.__len__(), self.ndim)
+            raise Error(message)
+        for i in range(index.__len__()):
+            if index[i] >= self.shape[i]:
+                var message = String(
+                    "Error: `index` exceeds the size!\n"
+                    "For {}-th dimension:\n"
+                    "The index value is {}.\n"
+                    "The size of the corresponding dimension is {}"
+                ).format(i, index[i], self.shape[i])
+                raise Error(message)
+        var idx: Int = _get_offset(index, self.strides)
+        self._buf.ptr.store(idx, val)
 
-        # CASE 1:
-        # if array shape is equal to mask shape,
-        # return a flattened array of the values where mask is True
-        if mask.shape == self.shape:
-            var len_of_result = 0
+    # only works if array is called as array.__setitem__(), mojo compiler doesn't parse it implicitly
+    fn __setitem__(
+        mut self, mask: NDArray[DType.bool], value: Scalar[dtype]
+    ) raises:
+        """
+        Set the value of the array at the indices where the mask is true.
 
-            # Count number of True
-            for i in range(mask.size):
-                if mask.item(i):
-                    len_of_result += 1
+        Args:
+            mask: Boolean mask array.
+            value: Value to set.
+        """
+        if (
+            mask.shape != self.shape
+        ):  # this behavious could be removed potentially
+            raise Error("Mask and array must have the same shape")
 
-            # Change the first number of the ndshape
-            var result = NDArray[dtype](shape=NDArrayShape(len_of_result))
+        for i in range(mask.size):
+            if mask._buf.ptr.load[width=1](i):
+                self._buf.ptr.store(i, value)
 
-            # Fill in the values
-            var offset = 0
-            for i in range(mask.size):
-                if mask.item(i):
-                    (result._buf.ptr + offset).init_pointee_copy(
-                        self._buf.ptr[i]
-                    )
-                    offset += 1
+    fn __setitem__(mut self, *slices: Slice, val: Self) raises:
+        """
+        Retrieve slices of an array from variadic slices.
 
-            return result
+        Args:
+            slices: Variadic slices.
+            val: Value to set.
 
-        # CASE 2:
-        # if array shape is not equal to mask shape,
-        # return items from the 0-th dimension of the array where mask is True
-        if mask.ndim > 1:
-            raise Error(String("Currently we only support 1-d mask array."))
+        Example:
+            `arr[1:3, 2:4]` returns the corresponding sliced array (2 x 2).
+        """
+        var slice_list: List[Slice] = List[Slice]()
+        for i in range(slices.__len__()):
+            slice_list.append(slices[i])
+        self.__setitem__(slices=slice_list, val=val)
 
-        if mask.shape[0] != self.shape[0]:
-            raise Error(
-                String(
-                    "Shape 0 of mask ({}) does not match that of array ({})."
-                ).format(mask.shape[0], self.shape[0])
-            )
+    fn __setitem__(mut self, slices: List[Slice], val: Self) raises:
+        """
+        Sets the slices of an array from list of slices and array.
 
-        var len_of_result = 0
+        Args:
+            slices: List of slices.
+            val: Value to set.
 
-        # Count number of True
-        for i in range(mask.size):
-            if mask.item(i):
-                len_of_result += 1
+        Example:
+        ```console
+        >>> var a = nm.arange[i8](16).reshape(Shape(4, 4))
+        print(a)
+        [[      0       1       2       3       ]
+         [      4       5       6       7       ]
+         [      8       9       10      11      ]
+         [      12      13      14      15      ]]
+        2-D array  Shape: [4, 4]  DType: int8  C-cont: True  F-cont: False  own data: True
+        >>> a[2:4, 2:4] = a[0:2, 0:2]
+        print(a)
+        [[      0       1       2       3       ]
+         [      4       5       6       7       ]
+         [      8       9       0       1       ]
+         [      12      13      4       5       ]]
+        2-D array  Shape: [4, 4]  DType: int8  C-cont: True  F-cont: False  own data: True
+        ```
+        """
+        var n_slices: Int = len(slices)
+        var ndims: Int = 0
+        var count: Int = 0
+        var spec: List[Int] = List[Int]()
+        var slice_list: List[Slice] = self._adjust_slice(slices)
+        for i in range(n_slices):
+            if (
+                slice_list[i].start.value() >= self.shape[i]
+                or slice_list[i].end.value() > self.shape[i]
+            ):
+                var message = String(
+                    "Error: Slice value exceeds the array shape!\n"
+                    "The {}-th dimension is of size {}.\n"
+                    "The slice goes from {} to {}"
+                ).format(
+                    i,
+                    self.shape[i],
+                    slice_list[i].start.value(),
+                    slice_list[i].end.value(),
+                )
+                raise Error(message)
+            # if slice_list[i].step is None:
+            #     raise Error(String("Step of slice is None."))
+            var slice_len: Int = (
+                (slice_list[i].end.value() - slice_list[i].start.value())
+                / slice_list[i].step.or_else(1)
+            ).__int__()
+            spec.append(slice_len)
+            if slice_len != 1:
+                ndims += 1
+            else:
+                count += 1
+        if count == slice_list.__len__():
+            ndims = 1
 
-        # Change the first number of the ndshape
-        var shape = self.shape
-        shape._buf[0] = len_of_result
+        var nshape: List[Int] = List[Int]()
+        var ncoefficients: List[Int] = List[Int]()
+        var nstrides: List[Int] = List[Int]()
+        var nnum_elements: Int = 1
 
-        var result = NDArray[dtype](shape)
-        var size_per_item = self.size // self.shape[0]
+        var j: Int = 0
+        count = 0
+        for _ in range(ndims):
+            while spec[j] == 1:
+                count += 1
+                j += 1
+            if j >= self.ndim:
+                break
+            var slice_len: Int = (
+                (slice_list[j].end.value() - slice_list[j].start.value())
+                / slice_list[j].step.or_else(1)
+            ).__int__()
+            nshape.append(slice_len)
+            nnum_elements *= slice_len
+            ncoefficients.append(
+                self.strides[j] * slice_list[j].step.or_else(1)
+            )
+            j += 1
 
-        # Fill in the values
-        var offset = 0
-        for i in range(mask.size):
-            if mask.item(i):
-                memcpy(
-                    result._buf.ptr + offset * size_per_item,
-                    self._buf.ptr + i * size_per_item,
-                    size_per_item,
-                )
-                offset += 1
+        # TODO: We can remove this check after we have support for broadcasting
+        for i in range(ndims):
+            if nshape[i] != val.shape[i]:
+                var message = String(
+                    "Error: Shape mismatch!\n"
+                    "For {}-th dimension: \n"
+                    "The size of the array is {}.\n"
+                    "The size of the input value is {}."
+                ).format(i, nshape[i], val.shape[i])
+                raise Error(message)
 
-        return result
+        var noffset: Int = 0
+        if self.flags.C_CONTIGUOUS:
+            noffset = 0
+            for i in range(ndims):
+                var temp_stride: Int = 1
+                for j in range(i + 1, ndims):  # temp
+                    temp_stride *= nshape[j]
+                nstrides.append(temp_stride)
+            for i in range(slice_list.__len__()):
+                noffset += slice_list[i].start.value() * self.strides[i]
+        elif self.flags.F_CONTIGUOUS:
+            noffset = 0
+            nstrides.append(1)
+            for i in range(0, ndims - 1):
+                nstrides.append(nstrides[i] * nshape[i])
+            for i in range(slice_list.__len__()):
+                noffset += slice_list[i].start.value() * self.strides[i]
 
-    fn __getitem__(self, mask: List[Bool]) raises -> Self:
+        var index = List[Int]()
+        for _ in range(ndims):
+            index.append(0)
+
+        _traverse_iterative_setter[dtype](
+            val, self, nshape, ncoefficients, nstrides, noffset, index
+        )
+
+    fn __setitem__(mut self, *slices: Variant[Slice, Int], val: Self) raises:
         """
-        Get items from 0-th dimension of an array according to mask.
-        __getitem__(self, mask: NDArray[DType.bool]) raises -> Self.
+        Get items by a series of either slices or integers.
 
         Args:
-            mask: A list of boolean values.
-
-        Returns:
-            NDArray with items from the mask.
+            slices: Variadic slices or integers.
+            val: Value to set.
 
         Example:
         ```console
-        >>>var a = nm.arange[i8](6)
-        >>>print(a)
-        [       0       1       2       3       4       5       ]
-        1-D array  Shape: [6]  DType: int8  C-cont: True  F-cont: True  own data: True
-        >>>print(a[List[Bool](True, False, True, True, False, True)])
-        [       0       2       3       5       ]
-        1-D array  Shape: [4]  DType: int8  C-cont: True  F-cont: True  own data: True
-
-        var b = nm.arange[i8](12).reshape(Shape(2, 2, 3))
-        print(b)
-        [[[     0       1       2       ]
-          [     3       4       5       ]]
-         [[     6       7       8       ]
-          [     9       10      11      ]]]
-        3-D array  Shape: [2, 2, 3]  DType: int8  C-cont: True  F-cont: False  own data: True
-        >>>print(b[List[Bool](False, True)])
-        [[[     6       7       8       ]
-          [     9       10      11      ]]]
-        3-D array  Shape: [1, 2, 3]  DType: int8  C-cont: True  F-cont: True  own data: True
-        ```.
+        >>> var a = nm.arange[i8](16).reshape(Shape(4, 4))
+        print(a)
+        [[      0       1       2       3       ]
+         [      4       5       6       7       ]
+         [      8       9       10      11      ]
+         [      12      13      14      15      ]]
+        2-D array  Shape: [4, 4]  DType: int8  C-cont: True  F-cont: False  own data: True
+        >>> a[0, Slice(2, 4)] = a[3, Slice(0, 2)]
+        print(a)
+        [[      0       1       12      13      ]
+         [      4       5       6       7       ]
+         [      8       9       10      11      ]
+         [      12      13      14      15      ]]
+        2-D array  Shape: [4, 4]  DType: int8  C-cont: True  F-cont: False  own data: True
+        ```
         """
+        var n_slices: Int = slices.__len__()
+        if n_slices > self.ndim:
+            raise Error("Error: No of slices greater than rank of array")
+        var slice_list: List[Slice] = List[Slice]()
 
-        var mask_array = NDArray[DType.bool](shape=Shape(len(mask)))
-        for i in range(len(mask)):
-            (mask_array._buf.ptr + i).init_pointee_copy(mask[i])
-
-        return self[mask_array]
+        var count_int = 0
+        for i in range(len(slices)):
+            if slices[i].isa[Slice]():
+                slice_list.append(slices[i]._get_ptr[Slice]()[0])
+            elif slices[i].isa[Int]():
+                count_int += 1
+                var int: Int = slices[i]._get_ptr[Int]()[0]
+                slice_list.append(Slice(int, int + 1, 1))
 
-    # ===-------------------------------------------------------------------===#
-    # Operator dunders
-    # ===-------------------------------------------------------------------===#
+        if n_slices < self.ndim:
+            for i in range(n_slices, self.ndim):
+                var size_at_dim: Int = self.shape[i]
+                slice_list.append(Slice(0, size_at_dim, 1))
 
-    # TODO: We should make a version that checks nonzero/not_nan
-    fn __bool__(self) raises -> Bool:
-        """
-        If all true return true.
-        """
-        if self.dtype == DType.bool:
-            if self.all():
-                return True
-            else:
-                return False
-        raise Error(
-            "core:ndarray:NDArray:__bool__: Bool is currently only implemented"
-            " for DType.bool"
-        )
+        self.__setitem__(slices=slice_list, val=val)
 
-    fn __int__(self) raises -> Int:
+    # TODO: fix this setter, add bound checks. Not sure about it's use case.
+    fn __setitem__(self, index: NDArray[DType.index], val: NDArray) raises:
         """
-        Gets `Int` representation of the array.
-
-        Only 0-D arrays or length-1 arrays can be converted to scalars.
+        Returns the items of the array from an array of indices.
 
-        Raises:
-            Error: If the array is not 0-D or length-1.
+        Args:
+            index: Array of indices.
+            val: Value to set.
 
         Example:
         ```console
-        > var A = NDArray[dtype](6, random=True)
-        > print(Int(A))
-
-        Unhandled exception caught during execution: Only 0-D arrays or length-1 arrays can be converted to scalars
-        mojo: error: execution exited with a non-zero result: 1
-
-        > var B = NDArray[dtype](1, 1, random=True)
-        > print(Int(B))
-        14
+        > var X = nm.NDArray[nm.i8](3,random=True)
+        > print(X)
+        [       32      21      53      ]
+        1-D array  Shape: [3]  DType: int8
+        > print(X.argsort())
+        [       1       0       2       ]
+        1-D array  Shape: [3]  DType: index
+        > print(X[X.argsort()])
+        [       21      32      53      ]
+        1-D array  Shape: [3]  DType: int8
         ```
+        """
 
-        Returns:
-            Int representation of the array.
+        for i in range(len(index)):
+            self.store(Int(index.load(i)), rebind[Scalar[dtype]](val.load(i)))
 
+    fn __setitem__(
+        mut self, mask: NDArray[DType.bool], val: NDArray[dtype]
+    ) raises:
         """
-        if (self.size == 1) or (self.ndim == 0):
-            return Int(self._buf.ptr[])
-        else:
-            raise Error(
-                "Error in `numojo.NDArray.__int__(self)`: \n"
-                "Only 0-D arrays (numojo scalar) or length-1 arrays "
-                "can be converted to scalars."
-            )
+        Set the value of the array at the indices where the mask is true.
 
-    fn __float__(self) raises -> Float64:
+        Args:
+            mask: Boolean mask array.
+            val: Value to set.
+
+        Example:
+        ```
+        var A = numojo.core.NDArray[numojo.i16](6, random=True)
+        var mask = A > 0
+        print(A)
+        print(mask)
+        A[mask] = 0
+        print(A)
+        ```
         """
-        Gets `Float64` representation of the array.
+        if (
+            mask.shape != self.shape
+        ):  # this behavious could be removed potentially
+            var message = String(
+                "Shape of mask does not match the shape of array."
+            )
+            raise Error(message)
 
-        Only 0-D arrays or length-1 arrays can be converted to scalars.
+        for i in range(mask.size):
+            if mask._buf.ptr.load(i):
+                self._buf.ptr.store(i, val._buf.ptr.load(i))
 
-        Raises:
-            Error: If the array is not 0-D or length-1.
+    fn itemset(
+        mut self, index: Variant[Int, List[Int]], item: Scalar[dtype]
+    ) raises:
+        """Set the scalar at the coordinates.
 
-        Returns:
-            Float representation of the array.
+        Args:
+            index: The coordinates of the item.
+                Can either be `Int` or `List[Int]`.
+                If `Int` is passed, it is the index of i-th item of the whole array.
+                If `List[Int]` is passed, it is the coordinate of the item.
+            item: The scalar to be set.
+
+        Note:
+            This is similar to `numpy.ndarray.itemset`.
+            The difference is that we takes in `List[Int]`, but numpy takes in a tuple.
+
+        An example goes as follows.
+
+        ```
+        import numojo as nm
 
+        fn main() raises:
+            var A = nm.zeros[nm.i16](3, 3)
+            print(A)
+            A.itemset(5, 256)
+            print(A)
+            A.itemset(List(1,1), 1024)
+            print(A)
+        ```
+        ```console
+        [[      0       0       0       ]
+         [      0       0       0       ]
+         [      0       0       0       ]]
+        2-D array  Shape: [3, 3]  DType: int16
+        [[      0       0       0       ]
+         [      0       0       256     ]
+         [      0       0       0       ]]
+        2-D array  Shape: [3, 3]  DType: int16
+        [[      0       0       0       ]
+         [      0       1024    256     ]
+         [      0       0       0       ]]
+        2-D array  Shape: [3, 3]  DType: int16
+        ```
         """
-        if (self.size == 1) or (self.ndim == 0):
-            return Float64(self._buf.ptr[])
+
+        # If one index is given
+        if index.isa[Int]():
+            var idx = index._get_ptr[Int]()[]
+            if idx < self.size:
+                if self.flags.F_CONTIGUOUS:
+                    # column-major should be converted to row-major
+                    # The following code can be taken out as a function that
+                    # convert any index to coordinates according to the order
+                    var c_stride = NDArrayStrides(shape=self.shape)
+                    var c_coordinates = List[Int]()
+                    for i in range(c_stride.ndim):
+                        var coordinate = idx // c_stride[i]
+                        idx = idx - c_stride[i] * coordinate
+                        c_coordinates.append(coordinate)
+                    self._buf.ptr.store(
+                        _get_offset(c_coordinates, self.strides), item
+                    )
+
+                self._buf.ptr.store(idx, item)
+            else:
+                raise Error(
+                    String(
+                        "Error: Elements of `index` ({}) \n"
+                        "exceed the array size ({})."
+                    ).format(idx, self.size)
+                )
+
         else:
-            raise Error(
-                "Error in `numojo.NDArray.__int__(self)`: \n"
-                "Only 0-D arrays (numojo scalar) or length-1 arrays "
-                "can be converted to scalars."
-            )
+            var indices = index._get_ptr[List[Int]]()[]
+            # If more than one index is given
+            if indices.__len__() != self.ndim:
+                raise Error("Error: Length of Indices do not match the shape")
+            for i in range(indices.__len__()):
+                if indices[i] >= self.shape[i]:
+                    raise Error(
+                        "Error: Elements of `index` exceed the array shape"
+                    )
+            self._buf.ptr.store(_get_offset(indices, self.strides), item)
 
-    fn __pos__(self) raises -> Self:
+    fn store(self, owned index: Int, val: Scalar[dtype]) raises:
         """
-        Unary positve returns self unless boolean type.
+        Safely store a scalar to i-th item of the underlying buffer.
+
+        `A.store(i, a)` differs from `A._buf.ptr[i] = a` due to boundary check.
+
+        Args:
+            index: Index of the item.
+            val: Value to store.
+
+        Raises:
+            Index out of boundary.
+
+        Example:
+        ```console
+        > array.store(15, val = 100)
+        ```
+        sets the item of index 15 of the array's data buffer to 100.
+
+        Note that it does not checked against C-order or F-order.
         """
-        if self.dtype is DType.bool:
+
+        if index < 0:
+            index += self.size
+
+        if (index >= self.size) or (index < 0):
             raise Error(
-                "ndarray:NDArrray:__pos__: pos does not accept bool type arrays"
+                String("Invalid index: index out of bound [0, {}).").format(
+                    self.size
+                )
             )
-        return self
 
-    fn __neg__(self) raises -> Self:
+        self._buf.ptr[index] = val
+
+    fn store[width: Int](mut self, index: Int, val: SIMD[dtype, width]) raises:
         """
-        Unary negative returns self unless boolean type.
+        Safely stores SIMD element of size `width` at `index`
+        of the underlying buffer.
 
-        For bolean use `__invert__`(~)
+        To bypass boundary checks, use `self._buf.ptr.store` directly.
+
+        Args:
+            index: Index of the item.
+            val: Value to store.
+
+        Raises:
+            Index out of boundary.
         """
-        if self.dtype is DType.bool:
+
+        if (index < 0) or (index >= self.size):
             raise Error(
-                "ndarray:NDArrray:__pos__: pos does not accept bool type arrays"
+                String("Invalid index: index out of bound [0, {}).").format(
+                    self.size
+                )
             )
-        return self * Scalar[dtype](-1.0)
 
-    # maybe they don't need conversion with astype.
-    @always_inline("nodebug")
-    fn __eq__[
-        OtherDtype: DType,
-        ResultDType: DType = TypeCoercion.result[dtype, OtherDtype](),
-    ](self, other: NDArray[OtherDtype]) raises -> NDArray[DType.bool]:
+        self._buf.ptr.store(index, val)
+
+    fn store[
+        width: Int = 1
+    ](mut self, *indices: Int, val: SIMD[dtype, width]) raises:
         """
-        Itemwise equivalence.
+        Safely stores SIMD element of size `width` at given variadic indices
+        of the underlying buffer.
+
+        To bypass boundary checks, use `self._buf.ptr.store` directly.
+
+        Args:
+            indices: Variadic indices.
+            val: Value to store.
+
+        Raises:
+            Index out of boundary.
         """
-        return comparison.equal[ResultDType](
-            self.astype[ResultDType](), other.astype[ResultDType]()
-        )
 
-    @always_inline("nodebug")
-    fn __eq__[
-        OtherDtype: DType,
-        ResultDType: DType = TypeCoercion.result[dtype, OtherDtype](),
-    ](self, other: Scalar[OtherDtype]) raises -> NDArray[DType.bool]:
+        if len(indices) != self.ndim:
+            raise (
+                String(
+                    "Length of indices {} does not match ndim {}".format(
+                        len(indices), self.ndim
+                    )
+                )
+            )
+
+        for i in range(self.ndim):
+            if (indices[i] < 0) or (indices[i] >= self.shape[i]):
+                raise Error(
+                    String(
+                        "Invalid index at {}-th dim: "
+                        "index out of bound [0, {})."
+                    ).format(i, self.shape[i])
+                )
+
+        var idx: Int = _get_offset(indices, self.strides)
+        self._buf.ptr.store(idx, val)
+
+    # ===-------------------------------------------------------------------===#
+    # Operator dunders
+    # ===-------------------------------------------------------------------===#
+
+    # TODO: We should make a version that checks nonzero/not_nan
+    fn __bool__(self) raises -> Bool:
         """
-        Itemwise equivalence.
+        If all true return true.
         """
-        return comparison.equal[ResultDType](
-            self.astype[ResultDType](), other.cast[ResultDType]()
+        if self.dtype == DType.bool:
+            if self.all():
+                return True
+            else:
+                return False
+        raise Error(
+            "core:ndarray:NDArray:__bool__: Bool is currently only implemented"
+            " for DType.bool"
         )
 
-    @always_inline("nodebug")
-    fn __eq__(self, other: Self) raises -> NDArray[DType.bool]:
-        """
-        Itemwise equivalence.
+    fn __int__(self) raises -> Int:
         """
-        return comparison.equal[dtype](self, other)
+        Gets `Int` representation of the array.
 
-    @always_inline("nodebug")
-    fn __eq__(self, other: SIMD[dtype, 1]) raises -> NDArray[DType.bool]:
-        """
-        Itemwise equivalence between scalar and Array.
+        Only 0-D arrays or length-1 arrays can be converted to scalars.
+
+        Raises:
+            Error: If the array is not 0-D or length-1.
+
+        Example:
+        ```console
+        > var A = NDArray[dtype](6, random=True)
+        > print(Int(A))
+
+        Unhandled exception caught during execution: Only 0-D arrays or length-1 arrays can be converted to scalars
+        mojo: error: execution exited with a non-zero result: 1
+
+        > var B = NDArray[dtype](1, 1, random=True)
+        > print(Int(B))
+        14
+        ```
+
+        Returns:
+            Int representation of the array.
+
+        """
+        if (self.size == 1) or (self.ndim == 0):
+            return Int(self._buf.ptr[])
+        else:
+            raise Error(
+                "Error in `numojo.NDArray.__int__(self)`: \n"
+                "Only 0-D arrays (numojo scalar) or length-1 arrays "
+                "can be converted to scalars."
+            )
+
+    fn __float__(self) raises -> Float64:
+        """
+        Gets `Float64` representation of the array.
+
+        Only 0-D arrays or length-1 arrays can be converted to scalars.
+
+        Raises:
+            Error: If the array is not 0-D or length-1.
+
+        Returns:
+            Float representation of the array.
+
+        """
+        if (self.size == 1) or (self.ndim == 0):
+            return Float64(self._buf.ptr[])
+        else:
+            raise Error(
+                "Error in `numojo.NDArray.__int__(self)`: \n"
+                "Only 0-D arrays (numojo scalar) or length-1 arrays "
+                "can be converted to scalars."
+            )
+
+    fn __pos__(self) raises -> Self:
+        """
+        Unary positve returns self unless boolean type.
+        """
+        if self.dtype is DType.bool:
+            raise Error(
+                "ndarray:NDArrray:__pos__: pos does not accept bool type arrays"
+            )
+        return self
+
+    fn __neg__(self) raises -> Self:
+        """
+        Unary negative returns self unless boolean type.
+
+        For bolean use `__invert__`(~)
+        """
+        if self.dtype is DType.bool:
+            raise Error(
+                "ndarray:NDArrray:__pos__: pos does not accept bool type arrays"
+            )
+        return self * Scalar[dtype](-1.0)
+
+    # maybe they don't need conversion with astype.
+    @always_inline("nodebug")
+    fn __eq__[
+        OtherDtype: DType,
+        ResultDType: DType = TypeCoercion.result[dtype, OtherDtype](),
+    ](self, other: NDArray[OtherDtype]) raises -> NDArray[DType.bool]:
+        """
+        Itemwise equivalence.
+        """
+        return comparison.equal[ResultDType](
+            self.astype[ResultDType](), other.astype[ResultDType]()
+        )
+
+    @always_inline("nodebug")
+    fn __eq__[
+        OtherDtype: DType,
+        ResultDType: DType = TypeCoercion.result[dtype, OtherDtype](),
+    ](self, other: Scalar[OtherDtype]) raises -> NDArray[DType.bool]:
+        """
+        Itemwise equivalence.
+        """
+        return comparison.equal[ResultDType](
+            self.astype[ResultDType](), other.cast[ResultDType]()
+        )
+
+    @always_inline("nodebug")
+    fn __eq__(self, other: Self) raises -> NDArray[DType.bool]:
+        """
+        Itemwise equivalence.
+        """
+        return comparison.equal[dtype](self, other)
+
+    @always_inline("nodebug")
+    fn __eq__(self, other: SIMD[dtype, 1]) raises -> NDArray[DType.bool]:
+        """
+        Itemwise equivalence between scalar and Array.
         """
         return comparison.equal[dtype](self, other)
 
@@ -1989,988 +2454,636 @@ struct NDArray[dtype: DType = DType.float64](
     fn __imul__(mut self, other: SIMD[dtype, 1]) raises:
         """
         Enables `array *= scalar`.
-        """
-        self = self * other
-
-    fn __imul__(mut self, other: Self) raises:
-        """
-        Enables `array *= array`.
-        """
-        self = self * other
-
-    fn __abs__(self) -> Self:
-        return abs(self)
-
-    fn __invert__(self) raises -> Self:
-        """
-        Element-wise inverse (~ or not), only for bools and integral types.
-        """
-        return bitwise.invert[dtype](self)
-
-    fn __pow__(self, p: Int) -> Self:
-        return self._elementwise_pow(p)
-
-    fn __pow__(self, rhs: Scalar[dtype]) raises -> Self:
-        """Power of items."""
-        var res = self
-        for i in range(self.size):
-            res._buf.ptr[i] = self._buf.ptr[i].__pow__(rhs)
-        return res^
-
-    fn __pow__(self, p: Self) raises -> Self:
-        if self.size != p.size:
-            raise Error(
-                String(
-                    "Both arrays must have same number of elements! \n"
-                    "Self array has {} elements. \n"
-                    "Other array has {} elements"
-                ).format(self.size, p.size)
-            )
-
-        var result = Self(self.shape)
-
-        @parameter
-        fn vectorized_pow[simd_width: Int](index: Int) -> None:
-            result._buf.ptr.store(
-                index,
-                self._buf.ptr.load[width=simd_width](index)
-                ** p._buf.ptr.load[width=simd_width](index),
-            )
-
-        vectorize[vectorized_pow, self.width](self.size)
-        return result
-
-    fn __ipow__(mut self, p: Int):
-        self = self.__pow__(p)
-
-    fn _elementwise_pow(self, p: Int) -> Self:
-        var new_vec = self
-
-        @parameter
-        fn array_scalar_vectorize[simd_width: Int](index: Int) -> None:
-            new_vec._buf.ptr.store(
-                index,
-                builtin_math.pow(
-                    self._buf.ptr.load[width=simd_width](index), p
-                ),
-            )
-
-        vectorize[array_scalar_vectorize, self.width](self.size)
-        return new_vec
-
-    fn __truediv__[
-        OtherDType: DType,
-        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
-    ](self, other: Scalar[OtherDType]) raises -> NDArray[ResultDType]:
-        """
-        Enables `array / scalar`.
-        """
-        return math.div[ResultDType](
-            self.astype[ResultDType](), other.cast[ResultDType]()
-        )
-
-    fn __truediv__[
-        OtherDType: DType,
-        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
-    ](self, other: NDArray[OtherDType]) raises -> NDArray[ResultDType]:
-        """
-        Enables `array / array`.
-        """
-        return math.div[ResultDType](
-            self.astype[ResultDType](), other.astype[ResultDType]()
-        )
-
-    fn __truediv__(self, other: SIMD[dtype, 1]) raises -> Self:
-        """
-        Enables `array / scalar`.
-        """
-        return math.div[dtype](self, other)
-
-    fn __truediv__(self, other: Self) raises -> Self:
-        """
-        Enables `array / array`.
-        """
-        return math.div[dtype](self, other)
-
-    fn __itruediv__(mut self, s: SIMD[dtype, 1]) raises:
-        """
-        Enables `array /= scalar`.
-        """
-        self = self.__truediv__(s)
-
-    fn __itruediv__(mut self, other: Self) raises:
-        """
-        Enables `array /= array`.
-        """
-        self = self.__truediv__(other)
-
-    fn __rtruediv__[
-        OtherDType: DType,
-        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
-    ](self, s: Scalar[OtherDType]) raises -> NDArray[ResultDType]:
-        """
-        Enables `scalar / array`.
-        """
-        return math.div[ResultDType](
-            s.cast[ResultDType](), self.astype[ResultDType]()
-        )
-
-    fn __rtruediv__(self, s: SIMD[dtype, 1]) raises -> Self:
-        """
-        Enables `scalar / array`.
-        """
-        return math.div[dtype](s, self)
-
-    fn __floordiv__[
-        OtherDType: DType,
-        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
-    ](self, other: Scalar[OtherDType]) raises -> NDArray[ResultDType]:
-        """
-        Enables `array // scalar`.
-        """
-        return math.floor_div[ResultDType](
-            self.astype[ResultDType](), other.cast[ResultDType]()
-        )
-
-    fn __floordiv__[
-        OtherDType: DType,
-        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
-    ](self, other: NDArray[OtherDType]) raises -> NDArray[ResultDType]:
-        """
-        Enables `array // array`.
-        """
-        return math.floor_div[ResultDType](
-            self.astype[ResultDType](), other.astype[ResultDType]()
-        )
-
-    fn __floordiv__(self, other: SIMD[dtype, 1]) raises -> Self:
-        """
-        Enables `array // scalar`.
-        """
-        return math.floor_div[dtype](self, other)
-
-    fn __floordiv__(self, other: Self) raises -> Self:
-        """
-        Enables `array // array`.
-        """
-        return math.floor_div[dtype](self, other)
-
-    fn __ifloordiv__(mut self, s: SIMD[dtype, 1]) raises:
-        """
-        Enables `array //= scalar`.
-        """
-        self = self.__floordiv__(s)
-
-    fn __ifloordiv__(mut self, other: Self) raises:
-        """
-        Enables `array //= array`.
-        """
-        self = self.__floordiv__(other)
-
-    fn __rfloordiv__[
-        OtherDType: DType,
-        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
-    ](self, other: Scalar[OtherDType]) raises -> NDArray[ResultDType]:
-        """
-        Enables `scalar // array`.
-        """
-        return math.floor_div[ResultDType](
-            other.cast[ResultDType](), self.astype[ResultDType]()
-        )
-
-    fn __rfloordiv__(self, other: SIMD[dtype, 1]) raises -> Self:
-        """
-        Enables `scalar // array`.
-        """
-        return math.floor_div[dtype](other, self)
-
-    fn __mod__[
-        OtherDType: DType,
-        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
-    ](self, other: Scalar[OtherDType]) raises -> NDArray[ResultDType]:
-        """
-        Enables `array % scalar`.
-        """
-        return math.mod[ResultDType](
-            self.astype[ResultDType](), other.cast[ResultDType]()
-        )
-
-    fn __mod__[
-        OtherDType: DType,
-        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
-    ](self, other: NDArray[OtherDType]) raises -> NDArray[ResultDType]:
-        """
-        Enables `array % array`.
-        """
-        return math.mod[ResultDType](
-            self.astype[ResultDType](), other.astype[ResultDType]()
-        )
-
-    fn __mod__(mut self, other: SIMD[dtype, 1]) raises -> Self:
-        """
-        Enables `array % scalar`.
-        """
-        return math.mod[dtype](self, other)
-
-    fn __mod__(mut self, other: NDArray[dtype]) raises -> Self:
-        """
-        Enables `array % array`.
-        """
-        return math.mod[dtype](self, other)
-
-    fn __imod__(mut self, other: SIMD[dtype, 1]) raises:
-        """
-        Enables `array %= scalar`.
-        """
-        self = math.mod[dtype](self, other)
-
-    fn __imod__(mut self, other: NDArray[dtype]) raises:
-        """
-        Enables `array %= array`.
-        """
-        self = math.mod[dtype](self, other)
-
-    fn __rmod__(mut self, other: SIMD[dtype, 1]) raises -> Self:
-        """
-        Enables `scalar % array`.
-        """
-        return math.mod[dtype](other, self)
-
-    fn __rmod__[
-        OtherDType: DType,
-        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
-    ](self, other: Scalar[OtherDType]) raises -> NDArray[ResultDType]:
-        """
-        Enables `scalar % array`.
-        """
-        return math.mod[ResultDType](
-            other.cast[ResultDType](), self.astype[ResultDType]()
-        )
-
-    # ===-------------------------------------------------------------------===#
-    # IO dunders and other methods
-    # Trait implementations
-    # ===-------------------------------------------------------------------===#
-    fn __str__(self) -> String:
-        """
-        Enables String(array).
-        """
-        var res: String
-        try:
-            res = self._array_to_string(0, 0, GLOBAL_PRINT_OPTIONS)
-        except e:
-            res = String("Cannot convert array to string.\n") + String(e)
-
-        return res
-
-    fn write_to[W: Writer](self, mut writer: W):
-        if self.ndim == 0:
-            # For 0darray (numojo scalar), we can directly write the value
-            writer.write(
-                String(self._buf.ptr[])
-                + String("  (0darray[" + _concise_dtype_str(self.dtype) + "])")
-            )
-        else:
-            try:
-                writer.write(
-                    self._array_to_string(0, 0, GLOBAL_PRINT_OPTIONS)
-                    + "\n"
-                    + String(self.ndim)
-                    + "D-array  Shape"
-                    + String(self.shape)
-                    + "  Strides"
-                    + String(self.strides)
-                    + "  DType: "
-                    + _concise_dtype_str(self.dtype)
-                    + "  C-cont: "
-                    + String(self.flags.C_CONTIGUOUS)
-                    + "  F-cont: "
-                    + String(self.flags.F_CONTIGUOUS)
-                    + "  own data: "
-                    + String(self.flags.OWNDATA)
-                )
-            except e:
-                writer.write("Cannot convert array to string.\n" + String(e))
-
-    fn __repr__(self) -> String:
-        """
-        Compute the "official" string representation of NDArray.
-
-        You can construct the array using this representation.
-
-        An example is:
-        ```console
-        >>>import numojo as nm
-        >>>var b = nm.arange[nm.f32](20).reshape(Shape(4, 5))
-        >>>print(repr(b))
-        numojo.array[f32](
-        '''
-        [[0.0, 1.0, 2.0, 3.0, 4.0]
-         [5.0, 6.0, 7.0, 8.0, 9.0]
-         [10.0, 11.0, 12.0, 13.0, 14.0]
-         [15.0, 16.0, 17.0, 18.0, 19.0]]
-        '''
-        )
-        ```
-        """
-        var result: String
-
-        try:
-            result = (
-                String("numojo.array[")
-                + _concise_dtype_str(self.dtype)
-                + String('](\n"""\n')
-                + self._array_to_string(0, 0, GLOBAL_PRINT_OPTIONS)
-                + '\n"""\n)'
-            )
-        except e:
-            result = "Cannot convert array to string.\n" + String(e)
-
-        return result
-
-    fn __len__(self) -> Int:
-        """
-        Returns length of 0-th dimension.
-        """
-        return self.shape._buf[0]
-
-    fn __iter__(self) raises -> _NDArrayIter[__origin_of(self), dtype]:
-        """
-        Iterate over elements of the NDArray and return sub-arrays as view.
-
-        Returns:
-            An iterator of NDArray elements.
-
-        Example:
-        ```
-        >>> var a = nm.random.arange[nm.i8](2, 3, 4).reshape(nm.Shape(2, 3, 4))
-        >>> for i in a:
-        ...     print(i)
-        [[      0       1       2       3       ]
-         [      4       5       6       7       ]
-         [      8       9       10      11      ]]
-        2-D array  Shape: [3, 4]  DType: int8  C-cont: True  F-cont: False  own data: False
-        [[      12      13      14      15      ]
-         [      16      17      18      19      ]
-         [      20      21      22      23      ]]
-        2-D array  Shape: [3, 4]  DType: int8  C-cont: True  F-cont: False  own data: False
-        ```.
-        """
-
-        return _NDArrayIter[__origin_of(self), dtype](
-            ptr=self._buf.ptr,
-            length=self.shape[0],
-            stride_of_axis=self.strides[0],
-            shape=self.shape._pop(axis=0),
-            strides=self.strides._pop(axis=0),
-        )
-
-    fn __reversed__(
-        self,
-    ) raises -> _NDArrayIter[__origin_of(self), dtype, forward=False]:
-        """Iterate backwards over elements of the NDArray, returning
-        copied value.
-
-        Returns:
-            A reversed iterator of NDArray elements.
-        """
-
-        return _NDArrayIter[__origin_of(self), dtype, forward=False](
-            ptr=self._buf.ptr,
-            length=self.shape[0],
-            stride_of_axis=self.strides[0],
-            shape=self.shape._pop(axis=0),
-            strides=self.strides._pop(axis=0),
-        )
-
-    fn _adjust_slice(self, slice_list: List[Slice]) raises -> List[Slice]:
-        """
-        Adjusts the slice values to lie within 0 and dim.
-
-        Args:
-            slice_list: List of slices.
-
-        Returns:
-            Adjusted list of slices.
-        """
-        var n_slices: Int = slice_list.__len__()
-        var slices = List[Slice]()
-        for i in range(n_slices):
-            if i >= self.ndim:
-                raise Error("Error: Number of slices exceeds array dimensions")
-
-            var start: Int = 0
-            var end: Int = self.shape[i]
-            var step: Int = 1
-            if slice_list[i].start is not None:
-                start = slice_list[i].start.value()
-                if start < 0:
-                    # start += self.shape[i]
-                    raise Error(
-                        "Error: Negative indexing in slices not supported"
-                        " currently"
-                    )
-
-            if slice_list[i].end is not None:
-                end = slice_list[i].end.value()
-                if end < 0:
-                    # end += self.shape[i] + 1
-                    raise Error(
-                        "Error: Negative indexing in slices not supported"
-                        " currently"
-                    )
-            step = slice_list[i].step.or_else(1)
-            if step == 0:
-                raise Error("Error: Slice step cannot be zero")
-
-            slices.append(
-                Slice(
-                    start=Optional(start),
-                    end=Optional(end),
-                    step=Optional(step),
-                )
-            )
-
-        return slices^
-
-    fn _array_to_string(
-        self,
-        dimension: Int,
-        offset: Int,
-        owned print_options: PrintOptions,
-    ) raises -> String:
-        """
-        Convert the array to a string.
-
-        Args:
-            dimension: The current dimension.
-            offset: The offset of the current dimension.
-            print_options: The print options.
-
-        Returns:
-            String representation of the array.
-        """
-
-        if self.ndim == 0:
-            # For 0darray (numojo scalar), return the scalar value.
-            return String(self._buf.ptr[0])
-
-        var seperator = print_options.separator
-        var padding = print_options.padding
-        var edge_items = print_options.edge_items
-
-        # The following code get the max value and the min value
-        # to determine the digits before decimals and the negative sign
-        # and then determine the formatted withd
-        var negative_sign: Bool = False  # whether there should be a negative sign
-        var number_of_digits: Int  # number of digits before or after decimal point
-        var number_of_digits_small_values: Int  # number of digits after decimal point for small values
-        var formatted_width: Int  # formatted width based on precision and digits before decimal points
-        var max_value: Scalar[dtype] = abs(
-            self._buf.ptr[]
-        )  # maximum absolute value of the items
-        var min_value: Scalar[dtype] = abs(
-            self._buf.ptr[]
-        )  # minimum absolute value of the items
-        var val: Scalar[dtype]  # storage of value of the item
-
-        var skip: Bool
-        for index in range(self.size):
-            skip = False
-            var remainder = index
-            var indices = Item(ndim=self.ndim, initialized=False)
-            for i in range(self.ndim):
-                indices[i], remainder = divmod(
-                    remainder, NDArrayStrides(self.shape)[i]
-                )
-                if (indices[i] >= 3) and (indices[i] < self.shape[i] - 3):
-                    skip = True
-                    continue
-            if skip:
-                continue
-
-            val = self._buf.ptr[_get_offset(indices, self.strides)]
-            if val < 0:
-                negative_sign = True
-            max_value = max(
-                max_value,
-                abs(val),
-            )
-            min_value = min(
-                min_value,
-                abs(val),
-            )
-        number_of_digits = Int(log10(Float64(max_value))) + 1
-        number_of_digits_small_values = abs(Int(log10(Float64(min_value)))) + 1
-
-        if dtype.is_floating_point():
-            formatted_width = (
-                print_options.precision
-                + 1
-                + number_of_digits
-                + Int(negative_sign)
-            )
-        else:
-            formatted_width = number_of_digits + Int(negative_sign)
-
-        # If the number is not too wide,
-        # or digits after decimal point is not many
-        # format it as a floating point.
-        if (formatted_width <= 14) and (number_of_digits_small_values <= 2):
-            print_options.formatted_width = formatted_width
-        # Otherwise, format it as a scientific number.
-        else:
-            print_options.float_format = "scientific"
-            print_options.formatted_width = 7 + print_options.precision
-
-        if dimension == self.ndim - 1:
-            var result: String = String("[") + padding
-            var number_of_items = self.shape[dimension]
-            if number_of_items <= edge_items * 2:  # Print all items
-                for i in range(number_of_items):
-                    var value = self.load[width=1](
-                        offset + i * self.strides[dimension]
-                    )
-                    var formatted_value = format_value(value, print_options)
-                    result = result + formatted_value
-                    if i < (number_of_items - 1):
-                        result = result + seperator
-                result = result + padding
-            else:  # Print first 3 and last 3 items
-                for i in range(edge_items):
-                    var value = self.load[width=1](
-                        offset + i * self.strides[dimension]
-                    )
-                    var formatted_value = format_value(value, print_options)
-                    result = result + formatted_value
-                    if i < (edge_items - 1):
-                        result = result + seperator
-                result = result + seperator + "..." + seperator
-                for i in range(number_of_items - edge_items, number_of_items):
-                    var value = self.load[width=1](
-                        offset + i * self.strides[dimension]
-                    )
-                    var formatted_value = format_value(value, print_options)
-                    result = result + formatted_value
-                    if i < (number_of_items - 1):
-                        result = result + seperator
-                result = result + padding
-            result = result + "]"
-            return result
-        else:
-            var result: String = String("[")
-            var number_of_items = self.shape[dimension]
-            if number_of_items <= edge_items * 2:  # Print all items
-                for i in range(number_of_items):
-                    if i == 0:
-                        result = result + self._array_to_string(
-                            dimension + 1,
-                            offset + i * self.strides[dimension].__int__(),
-                            print_options,
-                        )
-                    if i > 0:
-                        result = (
-                            result
-                            + String(" ") * (dimension + 1)
-                            + self._array_to_string(
-                                dimension + 1,
-                                offset + i * self.strides[dimension].__int__(),
-                                print_options,
-                            )
-                        )
-                    if i < (number_of_items - 1):
-                        result = result + "\n"
-            else:  # Print first 3 and last 3 items
-                for i in range(edge_items):
-                    if i == 0:
-                        result = result + self._array_to_string(
-                            dimension + 1,
-                            offset + i * self.strides[dimension].__int__(),
-                            print_options,
-                        )
-                    if i > 0:
-                        result = (
-                            result
-                            + String(" ") * (dimension + 1)
-                            + self._array_to_string(
-                                dimension + 1,
-                                offset + i * self.strides[dimension].__int__(),
-                                print_options,
-                            )
-                        )
-                    if i < (number_of_items - 1):
-                        result += "\n"
-                result = result + "...\n"
-                for i in range(number_of_items - edge_items, number_of_items):
-                    result = (
-                        result
-                        + String(" ") * (dimension + 1)
-                        + self._array_to_string(
-                            dimension + 1,
-                            offset + i * self.strides[dimension].__int__(),
-                            print_options,
-                        )
-                    )
-                    if i < (number_of_items - 1):
-                        result = result + "\n"
-            result = result + "]"
-            return result
-
-    # ===-------------------------------------------------------------------===#
-    # Methods
-    # ===-------------------------------------------------------------------===#
-
-    fn vdot(self, other: Self) raises -> SIMD[dtype, 1]:
-        """
-        Inner product of two vectors.
-
-        Args:
-            other: The other vector.
+        """
+        self = self * other
 
-        Returns:
-            The inner product of the two vectors.
+    fn __imul__(mut self, other: Self) raises:
         """
-        if self.size != other.size:
-            raise Error("The lengths of two vectors do not match.")
+        Enables `array *= array`.
+        """
+        self = self * other
 
-        var sum = Scalar[dtype](0)
-        for i in range(self.size):
-            sum = sum + self.load(i) * other.load(i)
-        return sum
+    fn __abs__(self) -> Self:
+        return abs(self)
 
-    fn mdot(self, other: Self) raises -> Self:
+    fn __invert__(self) raises -> Self:
         """
-        Dot product of two matrix.
-        Matrix A: M * N.
-        Matrix B: N * L.
+        Element-wise inverse (~ or not), only for bools and integral types.
+        """
+        return bitwise.invert[dtype](self)
 
-        Args:
-            other: The other matrix.
+    fn __pow__(self, p: Int) -> Self:
+        return self._elementwise_pow(p)
 
-        Returns:
-            The dot product of the two matrices.
-        """
+    fn __pow__(self, rhs: Scalar[dtype]) raises -> Self:
+        """Power of items."""
+        var res = self
+        for i in range(self.size):
+            res._buf.ptr[i] = self._buf.ptr[i].__pow__(rhs)
+        return res^
 
-        if (self.ndim != 2) or (other.ndim != 2):
+    fn __pow__(self, p: Self) raises -> Self:
+        if self.size != p.size:
             raise Error(
                 String(
-                    "The array should have only two dimensions (matrix).\n"
-                    "The self array has {} dimensions.\n"
-                    "The orther array has {} dimensions"
-                ).format(self.ndim, other.ndim)
+                    "Both arrays must have same number of elements! \n"
+                    "Self array has {} elements. \n"
+                    "Other array has {} elements"
+                ).format(self.size, p.size)
             )
 
-        if self.shape[1] != other.shape[0]:
-            raise Error(
-                String(
-                    "Second dimension of A does not match first dimension of"
-                    " B.\nA is {}x{}. \nB is {}x{}."
-                ).format(
-                    self.shape[0], self.shape[1], other.shape[0], other.shape[1]
-                )
+        var result = Self(self.shape)
+
+        @parameter
+        fn vectorized_pow[simd_width: Int](index: Int) -> None:
+            result._buf.ptr.store(
+                index,
+                self._buf.ptr.load[width=simd_width](index)
+                ** p._buf.ptr.load[width=simd_width](index),
             )
 
-        var new_matrix = Self(Shape(self.shape[0], other.shape[1]))
-        for row in range(self.shape[0]):
-            for col in range(other.shape[1]):
-                new_matrix.__setitem__(
-                    Item(row, col),
-                    self[row : row + 1, :].vdot(other[:, col : col + 1]),
-                )
-        return new_matrix
+        vectorize[vectorized_pow, self.width](self.size)
+        return result
 
-    fn row(self, id: Int) raises -> Self:
-        """Get the ith row of the matrix.
+    fn __ipow__(mut self, p: Int):
+        self = self.__pow__(p)
 
-        Args:
-            id: The row index.
+    fn _elementwise_pow(self, p: Int) -> Self:
+        var new_vec = self
 
-        Returns:
-            The ith row of the matrix.
+        @parameter
+        fn array_scalar_vectorize[simd_width: Int](index: Int) -> None:
+            new_vec._buf.ptr.store(
+                index,
+                builtin_math.pow(
+                    self._buf.ptr.load[width=simd_width](index), p
+                ),
+            )
+
+        vectorize[array_scalar_vectorize, self.width](self.size)
+        return new_vec
+
+    fn __truediv__[
+        OtherDType: DType,
+        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
+    ](self, other: Scalar[OtherDType]) raises -> NDArray[ResultDType]:
         """
+        Enables `array / scalar`.
+        """
+        return math.div[ResultDType](
+            self.astype[ResultDType](), other.cast[ResultDType]()
+        )
 
-        if self.ndim > 2:
-            raise Error(
-                String(
-                    "The number of dimension is {}.\nIt should be 2."
-                ).format(self.ndim)
-            )
+    fn __truediv__[
+        OtherDType: DType,
+        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
+    ](self, other: NDArray[OtherDType]) raises -> NDArray[ResultDType]:
+        """
+        Enables `array / array`.
+        """
+        return math.div[ResultDType](
+            self.astype[ResultDType](), other.astype[ResultDType]()
+        )
 
-        var width = self.shape[1]
-        var buffer = Self(Shape(width))
-        for i in range(width):
-            buffer.store(i, self._buf.ptr.load[width=1](i + id * width))
-        return buffer
+    fn __truediv__(self, other: SIMD[dtype, 1]) raises -> Self:
+        """
+        Enables `array / scalar`.
+        """
+        return math.div[dtype](self, other)
 
-    fn col(self, id: Int) raises -> Self:
-        """Get the ith column of the matrix.
+    fn __truediv__(self, other: Self) raises -> Self:
+        """
+        Enables `array / array`.
+        """
+        return math.div[dtype](self, other)
 
-        Args:
-            id: The column index.
+    fn __itruediv__(mut self, s: SIMD[dtype, 1]) raises:
+        """
+        Enables `array /= scalar`.
+        """
+        self = self.__truediv__(s)
 
-        Returns:
-            The ith column of the matrix.
+    fn __itruediv__(mut self, other: Self) raises:
         """
+        Enables `array /= array`.
+        """
+        self = self.__truediv__(other)
 
-        if self.ndim > 2:
-            raise Error(
-                String(
-                    "The number of dimension is {}.\nIt should be 2."
-                ).format(self.ndim)
-            )
+    fn __rtruediv__[
+        OtherDType: DType,
+        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
+    ](self, s: Scalar[OtherDType]) raises -> NDArray[ResultDType]:
+        """
+        Enables `scalar / array`.
+        """
+        return math.div[ResultDType](
+            s.cast[ResultDType](), self.astype[ResultDType]()
+        )
 
-        var width = self.shape[1]
-        var height = self.shape[0]
-        var buffer = Self(Shape(height))
-        for i in range(height):
-            buffer.store(i, self._buf.ptr.load[width=1](id + i * width))
-        return buffer
+    fn __rtruediv__(self, s: SIMD[dtype, 1]) raises -> Self:
+        """
+        Enables `scalar / array`.
+        """
+        return math.div[dtype](s, self)
 
-    # # * same as mdot
-    fn rdot(self, other: Self) raises -> Self:
+    fn __floordiv__[
+        OtherDType: DType,
+        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
+    ](self, other: Scalar[OtherDType]) raises -> NDArray[ResultDType]:
         """
-        Dot product of two matrix.
-        Matrix A: M * N.
-        Matrix B: N * L.
+        Enables `array // scalar`.
+        """
+        return math.floor_div[ResultDType](
+            self.astype[ResultDType](), other.cast[ResultDType]()
+        )
 
-        Args:
-            other: The other matrix.
+    fn __floordiv__[
+        OtherDType: DType,
+        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
+    ](self, other: NDArray[OtherDType]) raises -> NDArray[ResultDType]:
+        """
+        Enables `array // array`.
+        """
+        return math.floor_div[ResultDType](
+            self.astype[ResultDType](), other.astype[ResultDType]()
+        )
 
-        Returns:
-            The dot product of the two matrices.
+    fn __floordiv__(self, other: SIMD[dtype, 1]) raises -> Self:
+        """
+        Enables `array // scalar`.
         """
+        return math.floor_div[dtype](self, other)
 
-        if (self.ndim != 2) or (other.ndim != 2):
-            raise Error(
-                String(
-                    "The array should have only two dimensions (matrix)."
-                    "The self array is of {} dimensions.\n"
-                    "The other array is of {} dimensions."
-                ).format(self.ndim, other.ndim)
-            )
-        if self.shape[1] != other.shape[0]:
-            raise Error(
-                String(
-                    "Second dimension of A ({}) \n"
-                    "does not match first dimension of B ({})."
-                ).format(self.shape[1], other.shape[0])
-            )
+    fn __floordiv__(self, other: Self) raises -> Self:
+        """
+        Enables `array // array`.
+        """
+        return math.floor_div[dtype](self, other)
+
+    fn __ifloordiv__(mut self, s: SIMD[dtype, 1]) raises:
+        """
+        Enables `array //= scalar`.
+        """
+        self = self.__floordiv__(s)
+
+    fn __ifloordiv__(mut self, other: Self) raises:
+        """
+        Enables `array //= array`.
+        """
+        self = self.__floordiv__(other)
+
+    fn __rfloordiv__[
+        OtherDType: DType,
+        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
+    ](self, other: Scalar[OtherDType]) raises -> NDArray[ResultDType]:
+        """
+        Enables `scalar // array`.
+        """
+        return math.floor_div[ResultDType](
+            other.cast[ResultDType](), self.astype[ResultDType]()
+        )
+
+    fn __rfloordiv__(self, other: SIMD[dtype, 1]) raises -> Self:
+        """
+        Enables `scalar // array`.
+        """
+        return math.floor_div[dtype](other, self)
 
-        var new_matrix = Self(Shape(self.shape[0], other.shape[1]))
-        for row in range(self.shape[0]):
-            for col in range(other.shape[1]):
-                new_matrix.store(
-                    col + row * other.shape[1],
-                    self.row(row).vdot(other.col(col)),
-                )
-        return new_matrix
+    fn __mod__[
+        OtherDType: DType,
+        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
+    ](self, other: Scalar[OtherDType]) raises -> NDArray[ResultDType]:
+        """
+        Enables `array % scalar`.
+        """
+        return math.mod[ResultDType](
+            self.astype[ResultDType](), other.cast[ResultDType]()
+        )
 
-    fn num_elements(self) -> Int:
+    fn __mod__[
+        OtherDType: DType,
+        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
+    ](self, other: NDArray[OtherDType]) raises -> NDArray[ResultDType]:
         """
-        Function to retreive size (compatability).
+        Enables `array % array`.
+        """
+        return math.mod[ResultDType](
+            self.astype[ResultDType](), other.astype[ResultDType]()
+        )
 
-        Returns:
-            The size of the array.
+    fn __mod__(mut self, other: SIMD[dtype, 1]) raises -> Self:
         """
-        return self.size
+        Enables `array % scalar`.
+        """
+        return math.mod[dtype](self, other)
 
-    fn load(self, owned index: Int) raises -> Scalar[dtype]:
+    fn __mod__(mut self, other: NDArray[dtype]) raises -> Self:
         """
-        Safely retrieve i-th item from the underlying buffer.
+        Enables `array % array`.
+        """
+        return math.mod[dtype](self, other)
 
-        `A.load(i)` differs from `A._buf.ptr[i]` due to boundary check.
+    fn __imod__(mut self, other: SIMD[dtype, 1]) raises:
+        """
+        Enables `array %= scalar`.
+        """
+        self = math.mod[dtype](self, other)
 
-        Args:
-            index: Index of the item.
+    fn __imod__(mut self, other: NDArray[dtype]) raises:
+        """
+        Enables `array %= array`.
+        """
+        self = math.mod[dtype](self, other)
 
-        Returns:
-            The value at the index.
+    fn __rmod__(mut self, other: SIMD[dtype, 1]) raises -> Self:
+        """
+        Enables `scalar % array`.
+        """
+        return math.mod[dtype](other, self)
 
-        Example:
-        ```console
-        > array.load(15)
-        ```
-        returns the item of index 15 from the array's data buffer.
+    fn __rmod__[
+        OtherDType: DType,
+        ResultDType: DType = TypeCoercion.result[dtype, OtherDType](),
+    ](self, other: Scalar[OtherDType]) raises -> NDArray[ResultDType]:
+        """
+        Enables `scalar % array`.
+        """
+        return math.mod[ResultDType](
+            other.cast[ResultDType](), self.astype[ResultDType]()
+        )
 
-        Note that it does not checked against C-order or F-order.
-        ```console
-        > # A is a 3x3 matrix, F-order (column-major)
-        > A.load(3)  # Row 0, Col 1
-        > A.item(3)  # Row 1, Col 0
-        ```.
+    # ===-------------------------------------------------------------------===#
+    # IO dunders and relevant methods
+    # Trait implementations
+    # ===-------------------------------------------------------------------===#
+    fn __str__(self) -> String:
+        """
+        Enables String(array).
         """
+        var res: String
+        try:
+            res = self._array_to_string(0, 0, GLOBAL_PRINT_OPTIONS)
+        except e:
+            res = String("Cannot convert array to string.\n") + String(e)
 
-        if index < 0:
-            index += self.size
+        return res
 
-        if (index >= self.size) or (index < 0):
-            raise Error(
-                String("Invalid index: index out of bound [0, {}).").format(
-                    self.size
-                )
+    fn write_to[W: Writer](self, mut writer: W):
+        if self.ndim == 0:
+            # For 0darray (numojo scalar), we can directly write the value
+            writer.write(
+                String(self._buf.ptr[])
+                + String("  (0darray[" + _concise_dtype_str(self.dtype) + "])")
             )
+        else:
+            try:
+                writer.write(
+                    self._array_to_string(0, 0, GLOBAL_PRINT_OPTIONS)
+                    + "\n"
+                    + String(self.ndim)
+                    + "D-array  Shape"
+                    + String(self.shape)
+                    + "  Strides"
+                    + String(self.strides)
+                    + "  DType: "
+                    + _concise_dtype_str(self.dtype)
+                    + "  C-cont: "
+                    + String(self.flags.C_CONTIGUOUS)
+                    + "  F-cont: "
+                    + String(self.flags.F_CONTIGUOUS)
+                    + "  own data: "
+                    + String(self.flags.OWNDATA)
+                )
+            except e:
+                writer.write("Cannot convert array to string.\n" + String(e))
 
-        return self._buf.ptr[index]
-
-    fn load[width: Int = 1](self, index: Int) raises -> SIMD[dtype, width]:
+    fn __repr__(self) -> String:
         """
-        Safely loads a SIMD element of size `width` at `index`
-        from the underlying buffer.
-
-        To bypass boundary checks, use `self._buf.ptr.load` directly.
-
-        Args:
-            index: Index of the item.
+        Compute the "official" string representation of NDArray.
 
-        Returns:
-            The SIMD element at the index.
+        You can construct the array using this representation.
 
-        Raises:
-            Index out of boundary.
+        An example is:
+        ```console
+        >>>import numojo as nm
+        >>>var b = nm.arange[nm.f32](20).reshape(Shape(4, 5))
+        >>>print(repr(b))
+        numojo.array[f32](
+        '''
+        [[0.0, 1.0, 2.0, 3.0, 4.0]
+         [5.0, 6.0, 7.0, 8.0, 9.0]
+         [10.0, 11.0, 12.0, 13.0, 14.0]
+         [15.0, 16.0, 17.0, 18.0, 19.0]]
+        '''
+        )
+        ```
         """
+        var result: String
 
-        if (index < 0) or (index >= self.size):
-            raise Error(
-                String("Invalid index: index out of bound [0, {}).").format(
-                    self.size
-                )
+        try:
+            result = (
+                String("numojo.array[")
+                + _concise_dtype_str(self.dtype)
+                + String('](\n"""\n')
+                + self._array_to_string(0, 0, GLOBAL_PRINT_OPTIONS)
+                + '\n"""\n)'
             )
+        except e:
+            result = "Cannot convert array to string.\n" + String(e)
 
-        return self._buf.ptr.load[width=width](index)
+        return result
 
-    fn load[width: Int = 1](self, *indices: Int) raises -> SIMD[dtype, width]:
-        """
-        Safely loads SIMD element of size `width` at given variadic indices
-        from the underlying buffer.
+    # ===-------------------------------------------------------------------===#
+    # Trait dunders and iterator dunders
+    # ===-------------------------------------------------------------------===#
 
-        To bypass boundary checks, use `self._buf.ptr.load` directly.
+    fn __len__(self) -> Int:
+        """
+        Returns length of 0-th dimension.
+        """
+        return self.shape._buf[0]
 
-        Args:
-            indices: Variadic indices.
+    fn __iter__(self) raises -> _NDArrayIter[__origin_of(self), dtype]:
+        """
+        Iterate over elements of the NDArray and return sub-arrays as view.
 
         Returns:
-            The SIMD element at the indices.
+            An iterator of NDArray elements.
 
-        Raises:
-            Index out of boundary.
+        Example:
+        ```
+        >>> var a = nm.random.arange[nm.i8](2, 3, 4).reshape(nm.Shape(2, 3, 4))
+        >>> for i in a:
+        ...     print(i)
+        [[      0       1       2       3       ]
+         [      4       5       6       7       ]
+         [      8       9       10      11      ]]
+        2-D array  Shape: [3, 4]  DType: int8  C-cont: True  F-cont: False  own data: False
+        [[      12      13      14      15      ]
+         [      16      17      18      19      ]
+         [      20      21      22      23      ]]
+        2-D array  Shape: [3, 4]  DType: int8  C-cont: True  F-cont: False  own data: False
+        ```.
         """
 
-        if len(indices) != self.ndim:
-            raise (
-                String(
-                    "Length of indices {} does not match ndim {}".format(
-                        len(indices), self.ndim
-                    )
-                )
-            )
-
-        for i in range(self.ndim):
-            if (indices[i] < 0) or (indices[i] >= self.shape[i]):
-                raise Error(
-                    String(
-                        "Invalid index at {}-th dim: "
-                        "index out of bound [0, {})."
-                    ).format(i, self.shape[i])
-                )
+        return _NDArrayIter[__origin_of(self), dtype](
+            ptr=self._buf.ptr,
+            length=self.shape[0],
+            stride_of_axis=self.strides[0],
+            shape=self.shape._pop(axis=0),
+            strides=self.strides._pop(axis=0),
+        )
 
-        var idx: Int = _get_offset(indices, self.strides)
-        return self._buf.ptr.load[width=width](idx)
+    fn __reversed__(
+        self,
+    ) raises -> _NDArrayIter[__origin_of(self), dtype, forward=False]:
+        """Iterate backwards over elements of the NDArray, returning
+        copied value.
 
-    fn store(self, owned index: Int, val: Scalar[dtype]) raises:
+        Returns:
+            A reversed iterator of NDArray elements.
         """
-        Safely store a scalar to i-th item of the underlying buffer.
 
-        `A.store(i, a)` differs from `A._buf.ptr[i] = a` due to boundary check.
+        return _NDArrayIter[__origin_of(self), dtype, forward=False](
+            ptr=self._buf.ptr,
+            length=self.shape[0],
+            stride_of_axis=self.strides[0],
+            shape=self.shape._pop(axis=0),
+            strides=self.strides._pop(axis=0),
+        )
+
+    fn _adjust_slice(self, slice_list: List[Slice]) raises -> List[Slice]:
+        """
+        Adjusts the slice values to lie within 0 and dim.
 
         Args:
-            index: Index of the item.
-            val: Value to store.
-
-        Raises:
-            Index out of boundary.
-
-        Example:
-        ```console
-        > array.store(15, val = 100)
-        ```
-        sets the item of index 15 of the array's data buffer to 100.
+            slice_list: List of slices.
 
-        Note that it does not checked against C-order or F-order.
+        Returns:
+            Adjusted list of slices.
         """
+        var n_slices: Int = slice_list.__len__()
+        var slices = List[Slice]()
+        for i in range(n_slices):
+            if i >= self.ndim:
+                raise Error("Error: Number of slices exceeds array dimensions")
 
-        if index < 0:
-            index += self.size
+            var start: Int = 0
+            var end: Int = self.shape[i]
+            var step: Int = 1
+            if slice_list[i].start is not None:
+                start = slice_list[i].start.value()
+                if start < 0:
+                    # start += self.shape[i]
+                    raise Error(
+                        "Error: Negative indexing in slices not supported"
+                        " currently"
+                    )
 
-        if (index >= self.size) or (index < 0):
-            raise Error(
-                String("Invalid index: index out of bound [0, {}).").format(
-                    self.size
+            if slice_list[i].end is not None:
+                end = slice_list[i].end.value()
+                if end < 0:
+                    # end += self.shape[i] + 1
+                    raise Error(
+                        "Error: Negative indexing in slices not supported"
+                        " currently"
+                    )
+            step = slice_list[i].step.or_else(1)
+            if step == 0:
+                raise Error("Error: Slice step cannot be zero")
+
+            slices.append(
+                Slice(
+                    start=Optional(start),
+                    end=Optional(end),
+                    step=Optional(step),
                 )
             )
 
-        self._buf.ptr[index] = val
+        return slices^
 
-    fn store[width: Int](mut self, index: Int, val: SIMD[dtype, width]) raises:
+    fn _array_to_string(
+        self,
+        dimension: Int,
+        offset: Int,
+        owned print_options: PrintOptions,
+    ) raises -> String:
         """
-        Safely stores SIMD element of size `width` at `index`
-        of the underlying buffer.
-
-        To bypass boundary checks, use `self._buf.ptr.store` directly.
+        Convert the array to a string.
 
         Args:
-            index: Index of the item.
-            val: Value to store.
+            dimension: The current dimension.
+            offset: The offset of the current dimension.
+            print_options: The print options.
 
-        Raises:
-            Index out of boundary.
+        Returns:
+            String representation of the array.
         """
 
-        if (index < 0) or (index >= self.size):
-            raise Error(
-                String("Invalid index: index out of bound [0, {}).").format(
-                    self.size
-                )
-            )
+        if self.ndim == 0:
+            # For 0darray (numojo scalar), return the scalar value.
+            return String(self._buf.ptr[0])
 
-        self._buf.ptr.store(index, val)
+        var seperator = print_options.separator
+        var padding = print_options.padding
+        var edge_items = print_options.edge_items
 
-    fn store[
-        width: Int = 1
-    ](mut self, *indices: Int, val: SIMD[dtype, width]) raises:
-        """
-        Safely stores SIMD element of size `width` at given variadic indices
-        of the underlying buffer.
+        # The following code get the max value and the min value
+        # to determine the digits before decimals and the negative sign
+        # and then determine the formatted withd
+        var negative_sign: Bool = False  # whether there should be a negative sign
+        var number_of_digits: Int  # number of digits before or after decimal point
+        var number_of_digits_small_values: Int  # number of digits after decimal point for small values
+        var formatted_width: Int  # formatted width based on precision and digits before decimal points
+        var max_value: Scalar[dtype] = abs(
+            self._buf.ptr[]
+        )  # maximum absolute value of the items
+        var min_value: Scalar[dtype] = abs(
+            self._buf.ptr[]
+        )  # minimum absolute value of the items
+        var val: Scalar[dtype]  # storage of value of the item
 
-        To bypass boundary checks, use `self._buf.ptr.store` directly.
+        var skip: Bool
+        for index in range(self.size):
+            skip = False
+            var remainder = index
+            var indices = Item(ndim=self.ndim, initialized=False)
+            for i in range(self.ndim):
+                indices[i], remainder = divmod(
+                    remainder, NDArrayStrides(self.shape)[i]
+                )
+                if (indices[i] >= 3) and (indices[i] < self.shape[i] - 3):
+                    skip = True
+                    continue
+            if skip:
+                continue
 
-        Args:
-            indices: Variadic indices.
-            val: Value to store.
+            val = self._buf.ptr[_get_offset(indices, self.strides)]
+            if val < 0:
+                negative_sign = True
+            max_value = max(
+                max_value,
+                abs(val),
+            )
+            min_value = min(
+                min_value,
+                abs(val),
+            )
+        number_of_digits = Int(log10(Float64(max_value))) + 1
+        number_of_digits_small_values = abs(Int(log10(Float64(min_value)))) + 1
 
-        Raises:
-            Index out of boundary.
-        """
+        if dtype.is_floating_point():
+            formatted_width = (
+                print_options.precision
+                + 1
+                + number_of_digits
+                + Int(negative_sign)
+            )
+        else:
+            formatted_width = number_of_digits + Int(negative_sign)
 
-        if len(indices) != self.ndim:
-            raise (
-                String(
-                    "Length of indices {} does not match ndim {}".format(
-                        len(indices), self.ndim
+        # If the number is not too wide,
+        # or digits after decimal point is not many
+        # format it as a floating point.
+        if (formatted_width <= 14) and (number_of_digits_small_values <= 2):
+            print_options.formatted_width = formatted_width
+        # Otherwise, format it as a scientific number.
+        else:
+            print_options.float_format = "scientific"
+            print_options.formatted_width = 7 + print_options.precision
+
+        if dimension == self.ndim - 1:
+            var result: String = String("[") + padding
+            var number_of_items = self.shape[dimension]
+            if number_of_items <= edge_items * 2:  # Print all items
+                for i in range(number_of_items):
+                    var value = self.load[width=1](
+                        offset + i * self.strides[dimension]
+                    )
+                    var formatted_value = format_value(value, print_options)
+                    result = result + formatted_value
+                    if i < (number_of_items - 1):
+                        result = result + seperator
+                result = result + padding
+            else:  # Print first 3 and last 3 items
+                for i in range(edge_items):
+                    var value = self.load[width=1](
+                        offset + i * self.strides[dimension]
+                    )
+                    var formatted_value = format_value(value, print_options)
+                    result = result + formatted_value
+                    if i < (edge_items - 1):
+                        result = result + seperator
+                result = result + seperator + "..." + seperator
+                for i in range(number_of_items - edge_items, number_of_items):
+                    var value = self.load[width=1](
+                        offset + i * self.strides[dimension]
+                    )
+                    var formatted_value = format_value(value, print_options)
+                    result = result + formatted_value
+                    if i < (number_of_items - 1):
+                        result = result + seperator
+                result = result + padding
+            result = result + "]"
+            return result
+        else:
+            var result: String = String("[")
+            var number_of_items = self.shape[dimension]
+            if number_of_items <= edge_items * 2:  # Print all items
+                for i in range(number_of_items):
+                    if i == 0:
+                        result = result + self._array_to_string(
+                            dimension + 1,
+                            offset + i * self.strides[dimension].__int__(),
+                            print_options,
+                        )
+                    if i > 0:
+                        result = (
+                            result
+                            + String(" ") * (dimension + 1)
+                            + self._array_to_string(
+                                dimension + 1,
+                                offset + i * self.strides[dimension].__int__(),
+                                print_options,
+                            )
+                        )
+                    if i < (number_of_items - 1):
+                        result = result + "\n"
+            else:  # Print first 3 and last 3 items
+                for i in range(edge_items):
+                    if i == 0:
+                        result = result + self._array_to_string(
+                            dimension + 1,
+                            offset + i * self.strides[dimension].__int__(),
+                            print_options,
+                        )
+                    if i > 0:
+                        result = (
+                            result
+                            + String(" ") * (dimension + 1)
+                            + self._array_to_string(
+                                dimension + 1,
+                                offset + i * self.strides[dimension].__int__(),
+                                print_options,
+                            )
+                        )
+                    if i < (number_of_items - 1):
+                        result += "\n"
+                result = result + "...\n"
+                for i in range(number_of_items - edge_items, number_of_items):
+                    result = (
+                        result
+                        + String(" ") * (dimension + 1)
+                        + self._array_to_string(
+                            dimension + 1,
+                            offset + i * self.strides[dimension].__int__(),
+                            print_options,
+                        )
                     )
-                )
-            )
-
-        for i in range(self.ndim):
-            if (indices[i] < 0) or (indices[i] >= self.shape[i]):
-                raise Error(
-                    String(
-                        "Invalid index at {}-th dim: "
-                        "index out of bound [0, {})."
-                    ).format(i, self.shape[i])
-                )
-
-        var idx: Int = _get_offset(indices, self.strides)
-        self._buf.ptr.store(idx, val)
+                    if i < (number_of_items - 1):
+                        result = result + "\n"
+            result = result + "]"
+            return result
 
     # ===-------------------------------------------------------------------===#
     # OTHER METHODS
@@ -2982,35 +3095,6 @@ struct NDArray[dtype: DType = DType.float64](
     # # partition, put, repeat, searchsorted, setfield, squeeze, swapaxes, take,
     # # tobyets, tofile, view
     # ===-------------------------------------------------------------------===#
-    fn T(self, axes: List[Int]) raises -> Self:
-        """
-        Transpose array of any number of dimensions according to
-        arbitrary permutation of the axes.
-
-        If `axes` is not given, it is equal to flipping the axes.
-
-        Args:
-            axes: List of axes.
-
-        Returns:
-            Transposed array.
-
-        Defined in `numojo.routines.manipulation.transpose`.
-        """
-        return numojo.routines.manipulation.transpose(self, axes)
-
-    fn T(self) raises -> Self:
-        """
-        (overload) Transpose the array when `axes` is not given.
-        If `axes` is not given, it is equal to flipping the axes.
-        See docstring of `transpose`.
-
-        Returns:
-            Transposed array.
-
-        Defined in `numojo.routines.manipulation.transpose`.
-        """
-        return numojo.routines.manipulation.transpose(self)
 
     fn all(self) raises -> Bool:
         """
@@ -3117,6 +3201,30 @@ struct NDArray[dtype: DType = DType.float64](
     # fn compress(self):
     #     pass
 
+    fn col(self, id: Int) raises -> Self:
+        """Get the ith column of the matrix.
+
+        Args:
+            id: The column index.
+
+        Returns:
+            The ith column of the matrix.
+        """
+
+        if self.ndim > 2:
+            raise Error(
+                String(
+                    "The number of dimension is {}.\nIt should be 2."
+                ).format(self.ndim)
+            )
+
+        var width = self.shape[1]
+        var height = self.shape[0]
+        var buffer = Self(Shape(height))
+        for i in range(height):
+            buffer.store(i, self._buf.ptr.load[width=1](id + i * width))
+        return buffer
+
     fn copy(self) raises -> Self:
         # TODO: Add logics for non-contiguous arrays when views are implemented.
         """
@@ -3187,265 +3295,43 @@ struct NDArray[dtype: DType = DType.float64](
         """
         return cumsum[dtype](self)
 
-    fn cumsum(self, axis: Int) raises -> NDArray[dtype]:
-        """
-        Returns cumsum of array by axis.
-
-        Args:
-            axis: Axis.
-
-        Returns:
-            Cumsum of array by axis.
-        """
-        return cumsum[dtype](self, axis=axis)
-
-    fn diagonal(self):
-        pass
-
-    fn fill(mut self, val: Scalar[dtype]):
-        """
-        Fill all items of array with value.
-
-        Args:
-            val: Value to fill.
-        """
-
-        for i in range(self.size):
-            self._buf.ptr[i] = val
-
-    fn flatten(self, order: String = "C") raises -> Self:
-        """
-        Return a copy of the array collapsed into one dimension.
-
-        Args:
-            order: A NDArray.
-
-        Returns:
-            The 1 dimensional flattened NDArray.
-        """
-        return ravel(self, order=order)
-
-    fn item(
-        self, owned index: Int
-    ) raises -> ref [self._buf.ptr.origin, self._buf.ptr.address_space] Scalar[
-        dtype
-    ]:
-        """
-        Return the scalar at the coordinates.
-        If one index is given, get the i-th item of the array (not buffer).
-        It first scans over the first row, even it is a colume-major array.
-        If more than one index is given, the length of the indices must match
-        the number of dimensions of the array.
-        If the ndim is 0 (0darray), get the value as a mojo scalar.
-
-        Args:
-            index: Index of item, counted in row-major way.
-
-        Returns:
-            A scalar matching the dtype of the array.
-
-        Raises:
-            Error if array is 0darray (numojo scalar).
-            Error if index is equal or larger than array size.
-
-        Example:
-        ```console
-        >>> var A = nm.random.randn[nm.f16](2, 2, 2)
-        >>> A = A.reshape(A.shape, order="F")
-        >>> print(A)
-        [[[     0.2446289       0.5419922       ]
-          [     0.09643555      -0.90722656     ]]
-         [[     1.1806641       0.24389648      ]
-          [     0.5234375       1.0390625       ]]]
-        3-D array  Shape: [2, 2, 2]  DType: float16  order: F
-        >>> for i in range(A.size):
-        ...     print(A.item(i))
-        0.2446289
-        0.5419922
-        0.09643555
-        -0.90722656
-        1.1806641
-        0.24389648
-        0.5234375
-        1.0390625
-        >>> print(A.item(0, 1, 1))
-        -0.90722656
-        ```.
-        """
-
-        # For 0darray, raise error
-        if self.ndim == 0:
-            raise Error(
-                String(
-                    "\nError in `numojo.NDArray.item(index: Int)`: "
-                    "Cannot index a 0darray (numojo scalar). "
-                    "Use `a.item()` without arguments."
-                )
-            )
-
-        if index < 0:
-            index += self.size
-
-        if (index < 0) or (index >= self.size):
-            raise Error(
-                String(
-                    "\nError in `numojo.NDArray.item(index: Int)`:"
-                    "`index` exceeds array size ({})"
-                ).format(self.size)
-            )
-
-        if self.flags.F_CONTIGUOUS:
-            return (self._buf.ptr + _transfer_offset(index, self.strides))[]
-
-        else:
-            return (self._buf.ptr + index)[]
-
-    fn item(
-        self, *index: Int
-    ) raises -> ref [self._buf.ptr.origin, self._buf.ptr.address_space] Scalar[
-        dtype
-    ]:
-        """
-        Return the scalar at the coordinates.
-        If one index is given, get the i-th item of the array (not buffer).
-        It first scans over the first row, even it is a colume-major array.
-        If more than one index is given, the length of the indices must match
-        the number of dimensions of the array.
-        For 0darray (numojo scalar), return the scalar value.
-
-        Args:
-            index: The coordinates of the item.
-
-        Returns:
-            A scalar matching the dtype of the array.
-
-        Raises:
-            Index is equal or larger than size of dimension.
-
-        Example:
-        ```
-        >>> var A = nm.random.randn[nm.f16](2, 2, 2)
-        >>> A = A.reshape(A.shape, order="F")
-        >>> print(A)
-        [[[     0.2446289       0.5419922       ]
-          [     0.09643555      -0.90722656     ]]
-         [[     1.1806641       0.24389648      ]
-          [     0.5234375       1.0390625       ]]]
-        3-D array  Shape: [2, 2, 2]  DType: float16  order: F
-        >>> print(A.item(0, 1, 1))
-        -0.90722656
-        ```.
-        """
-
-        if len(index) != self.ndim:
-            raise Error(
-                String(
-                    "\nError in `numojo.NDArray.item(*index: Int)`:"
-                    "Number of indices ({}) do not match ndim ({})"
-                ).format(len(index), self.ndim)
-            )
-
-        # For 0darray, return the scalar value.
-        if self.ndim == 0:
-            return self._buf.ptr[]
-
-        var list_index = List[Int]()
-        for i in range(len(index)):
-            if index[i] < 0:
-                list_index.append(index[i] + self.shape[i])
-            else:
-                list_index.append(index[i])
-            if (list_index[i] < 0) or (list_index[i] >= self.shape[i]):
-                raise Error(
-                    String("{}-th index exceeds shape size {}").format(
-                        i, self.shape[i]
-                    )
-                )
-        return (self._buf.ptr + _get_offset(index, self.strides))[]
-
-    fn itemset(
-        mut self, index: Variant[Int, List[Int]], item: Scalar[dtype]
-    ) raises:
-        """Set the scalar at the coordinates.
-
+    fn cumsum(self, axis: Int) raises -> NDArray[dtype]:
+        """
+        Returns cumsum of array by axis.
+
         Args:
-            index: The coordinates of the item.
-                Can either be `Int` or `List[Int]`.
-                If `Int` is passed, it is the index of i-th item of the whole array.
-                If `List[Int]` is passed, it is the coordinate of the item.
-            item: The scalar to be set.
+            axis: Axis.
 
-        Note:
-            This is similar to `numpy.ndarray.itemset`.
-            The difference is that we takes in `List[Int]`, but numpy takes in a tuple.
+        Returns:
+            Cumsum of array by axis.
+        """
+        return cumsum[dtype](self, axis=axis)
 
-        An example goes as follows.
+    fn diagonal(self):
+        pass
 
-        ```
-        import numojo as nm
+    fn fill(mut self, val: Scalar[dtype]):
+        """
+        Fill all items of array with value.
 
-        fn main() raises:
-            var A = nm.zeros[nm.i16](3, 3)
-            print(A)
-            A.itemset(5, 256)
-            print(A)
-            A.itemset(List(1,1), 1024)
-            print(A)
-        ```
-        ```console
-        [[      0       0       0       ]
-         [      0       0       0       ]
-         [      0       0       0       ]]
-        2-D array  Shape: [3, 3]  DType: int16
-        [[      0       0       0       ]
-         [      0       0       256     ]
-         [      0       0       0       ]]
-        2-D array  Shape: [3, 3]  DType: int16
-        [[      0       0       0       ]
-         [      0       1024    256     ]
-         [      0       0       0       ]]
-        2-D array  Shape: [3, 3]  DType: int16
-        ```
+        Args:
+            val: Value to fill.
         """
 
-        # If one index is given
-        if index.isa[Int]():
-            var idx = index._get_ptr[Int]()[]
-            if idx < self.size:
-                if self.flags.F_CONTIGUOUS:
-                    # column-major should be converted to row-major
-                    # The following code can be taken out as a function that
-                    # convert any index to coordinates according to the order
-                    var c_stride = NDArrayStrides(shape=self.shape)
-                    var c_coordinates = List[Int]()
-                    for i in range(c_stride.ndim):
-                        var coordinate = idx // c_stride[i]
-                        idx = idx - c_stride[i] * coordinate
-                        c_coordinates.append(coordinate)
-                    self._buf.ptr.store(
-                        _get_offset(c_coordinates, self.strides), item
-                    )
+        for i in range(self.size):
+            self._buf.ptr[i] = val
 
-                self._buf.ptr.store(idx, item)
-            else:
-                raise Error(
-                    String(
-                        "Error: Elements of `index` ({}) \n"
-                        "exceed the array size ({})."
-                    ).format(idx, self.size)
-                )
+    fn flatten(self, order: String = "C") raises -> Self:
+        """
+        Return a copy of the array collapsed into one dimension.
 
-        else:
-            var indices = index._get_ptr[List[Int]]()[]
-            # If more than one index is given
-            if indices.__len__() != self.ndim:
-                raise Error("Error: Length of Indices do not match the shape")
-            for i in range(indices.__len__()):
-                if indices[i] >= self.shape[i]:
-                    raise Error(
-                        "Error: Elements of `index` exceed the array shape"
-                    )
-            self._buf.ptr.store(_get_offset(indices, self.strides), item)
+        Args:
+            order: A NDArray.
+
+        Returns:
+            The 1 dimensional flattened NDArray.
+        """
+        return ravel(self, order=order)
 
     fn iter_by_axis[
         forward: Bool = True
@@ -3592,6 +3478,48 @@ struct NDArray[dtype: DType = DType.float64](
 
         return result
 
+    # TODO: Remove this methods
+    fn mdot(self, other: Self) raises -> Self:
+        """
+        Dot product of two matrix.
+        Matrix A: M * N.
+        Matrix B: N * L.
+
+        Args:
+            other: The other matrix.
+
+        Returns:
+            The dot product of the two matrices.
+        """
+
+        if (self.ndim != 2) or (other.ndim != 2):
+            raise Error(
+                String(
+                    "The array should have only two dimensions (matrix).\n"
+                    "The self array has {} dimensions.\n"
+                    "The orther array has {} dimensions"
+                ).format(self.ndim, other.ndim)
+            )
+
+        if self.shape[1] != other.shape[0]:
+            raise Error(
+                String(
+                    "Second dimension of A does not match first dimension of"
+                    " B.\nA is {}x{}. \nB is {}x{}."
+                ).format(
+                    self.shape[0], self.shape[1], other.shape[0], other.shape[1]
+                )
+            )
+
+        var new_matrix = Self(Shape(self.shape[0], other.shape[1]))
+        for row in range(self.shape[0]):
+            for col in range(other.shape[1]):
+                new_matrix.__setitem__(
+                    Item(row, col),
+                    self[row : row + 1, :].vdot(other[:, col : col + 1]),
+                )
+        return new_matrix
+
     fn min(self, axis: Int = 0) raises -> Self:
         """
         Min on axis.
@@ -3751,6 +3679,15 @@ struct NDArray[dtype: DType = DType.float64](
             order=order,
         )
 
+    fn num_elements(self) -> Int:
+        """
+        Function to retreive size (compatability).
+
+        Returns:
+            The size of the array.
+        """
+        return self.size
+
     fn prod(self: Self) raises -> Scalar[dtype]:
         """
         Product of all array elements.
@@ -3773,6 +3710,45 @@ struct NDArray[dtype: DType = DType.float64](
 
         return prod(self, axis=axis)
 
+    # TODO: Remove this methods
+    fn rdot(self, other: Self) raises -> Self:
+        """
+        Dot product of two matrix.
+        Matrix A: M * N.
+        Matrix B: N * L.
+
+        Args:
+            other: The other matrix.
+
+        Returns:
+            The dot product of the two matrices.
+        """
+
+        if (self.ndim != 2) or (other.ndim != 2):
+            raise Error(
+                String(
+                    "The array should have only two dimensions (matrix)."
+                    "The self array is of {} dimensions.\n"
+                    "The other array is of {} dimensions."
+                ).format(self.ndim, other.ndim)
+            )
+        if self.shape[1] != other.shape[0]:
+            raise Error(
+                String(
+                    "Second dimension of A ({}) \n"
+                    "does not match first dimension of B ({})."
+                ).format(self.shape[1], other.shape[0])
+            )
+
+        var new_matrix = Self(Shape(self.shape[0], other.shape[1]))
+        for row in range(self.shape[0]):
+            for col in range(other.shape[1]):
+                new_matrix.store(
+                    col + row * other.shape[1],
+                    self.row(row).vdot(other.col(col)),
+                )
+        return new_matrix
+
     fn reshape(self, shape: NDArrayShape, order: String = "C") raises -> Self:
         """
         Returns an array of the same data with a new shape.
@@ -3820,6 +3796,29 @@ struct NDArray[dtype: DType = DType.float64](
         """
         return rounding.tround[dtype](self)
 
+    fn row(self, id: Int) raises -> Self:
+        """Get the ith row of the matrix.
+
+        Args:
+            id: The row index.
+
+        Returns:
+            The ith row of the matrix.
+        """
+
+        if self.ndim > 2:
+            raise Error(
+                String(
+                    "The number of dimension is {}.\nIt should be 2."
+                ).format(self.ndim)
+            )
+
+        var width = self.shape[1]
+        var buffer = Self(Shape(width))
+        for i in range(width):
+            buffer.store(i, self._buf.ptr.load[width=1](i + id * width))
+        return buffer
+
     fn sort(mut self) raises:
         """
         Sort NDArray using quick sort method.
@@ -3899,6 +3898,36 @@ struct NDArray[dtype: DType = DType.float64](
         """
         return sum(self, axis=axis)
 
+    fn T(self, axes: List[Int]) raises -> Self:
+        """
+        Transpose array of any number of dimensions according to
+        arbitrary permutation of the axes.
+
+        If `axes` is not given, it is equal to flipping the axes.
+
+        Args:
+            axes: List of axes.
+
+        Returns:
+            Transposed array.
+
+        Defined in `numojo.routines.manipulation.transpose`.
+        """
+        return numojo.routines.manipulation.transpose(self, axes)
+
+    fn T(self) raises -> Self:
+        """
+        (overload) Transpose the array when `axes` is not given.
+        If `axes` is not given, it is equal to flipping the axes.
+        See docstring of `transpose`.
+
+        Returns:
+            Transposed array.
+
+        Defined in `numojo.routines.manipulation.transpose`.
+        """
+        return numojo.routines.manipulation.transpose(self)
+
     fn tolist(self) -> List[Scalar[dtype]]:
         """
         Convert NDArray to a 1-D List.
@@ -3965,6 +3994,7 @@ struct NDArray[dtype: DType = DType.float64](
         """
         return linalg.norms.trace[dtype](self, offset, axis1, axis2)
 
+    # TODO: Remove the underscore in the method name when view is supported.
     fn _transpose(self) raises -> Self:
         """
         Returns a view of transposed array.
@@ -4026,6 +4056,25 @@ struct NDArray[dtype: DType = DType.float64](
         """
         return variance[returned_dtype](self, axis=axis, ddof=ddof)
 
+    # TODO: Remove this methods, but add it into routines.
+    fn vdot(self, other: Self) raises -> SIMD[dtype, 1]:
+        """
+        Inner product of two vectors.
+
+        Args:
+            other: The other vector.
+
+        Returns:
+            The inner product of the two vectors.
+        """
+        if self.size != other.size:
+            raise Error("The lengths of two vectors do not match.")
+
+        var sum = Scalar[dtype](0)
+        for i in range(self.size):
+            sum = sum + self.load(i) * other.load(i)
+        return sum
+
 
 # ===----------------------------------------------------------------------===#
 # NDArrayIterator
diff --git a/numojo/core/utility.mojo b/numojo/core/utility.mojo
index cf6b11bb..eb01bf71 100644
--- a/numojo/core/utility.mojo
+++ b/numojo/core/utility.mojo
@@ -1,9 +1,22 @@
+# ===----------------------------------------------------------------------=== #
+# Distributed under the Apache 2.0 License with LLVM Exceptions.
+# See LICENSE and the LLVM License for more information.
+# https://github.com/Mojo-Numerics-and-Algorithms-group/NuMojo/blob/main/LICENSE
+# https://llvm.org/LICENSE.txt
+# ===----------------------------------------------------------------------=== #
 """
 Implements N-DIMENSIONAL ARRAY UTILITY FUNCTIONS
 """
 # ===----------------------------------------------------------------------=== #
-# Implements N-DIMENSIONAL ARRAY UTILITY FUNCTIONS
-# Last updated: 2024-10-14
+# SECTIONS OF THE FILE:
+#
+# 1. Offset and traverse functions.
+# 2. Functions to traverse a multi-dimensional array.
+# 3. Apply a function to NDArray by axis.
+# 4. NDArray dtype conversions.
+# 5. Numojo.NDArray to other collections.
+# 6. Type checking functions.
+# 7. Miscellaneous utility functions.
 # ===----------------------------------------------------------------------=== #
 
 from algorithm.functional import vectorize, parallelize
@@ -18,36 +31,8 @@ from numojo.core.ndarray import NDArray
 from numojo.core.ndshape import NDArrayShape
 from numojo.core.ndstrides import NDArrayStrides
 
-
-# FIXME: No long useful from 24.6:
-# `width` is now inferred from the SIMD's width.
-fn fill_pointer[
-    dtype: DType
-](
-    mut array: UnsafePointer[Scalar[dtype]], size: Int, value: Scalar[dtype]
-) raises:
-    """
-    Fill a NDArray with a specific value.
-
-    Parameters:
-        dtype: The data type of the NDArray elements.
-
-    Args:
-        array: The pointer to the NDArray.
-        size: The size of the NDArray.
-        value: The value to fill the NDArray with.
-    """
-    alias width = simdwidthof[dtype]()
-
-    @parameter
-    fn vectorized_fill[simd_width: Int](idx: Int):
-        array.store(idx, value)
-
-    vectorize[vectorized_fill, width](size)
-
-
 # ===----------------------------------------------------------------------=== #
-# GET OFFSET FUNCTIONS FOR NDARRAY
+# Offset and traverse functions
 # ===----------------------------------------------------------------------=== #
 
 
@@ -154,14 +139,18 @@ fn _get_offset(indices: Tuple[Int, Int], strides: Tuple[Int, Int]) -> Int:
 
 fn _transfer_offset(offset: Int, strides: NDArrayStrides) raises -> Int:
     """
-    Transfers the offset between C-contiguous and F-continuous memory layout.
+    Transfers the offset by flipping the strides information.
+    It can be used to transfer between C-contiguous and F-continuous memory
+    layout. For example, in a 4x4 C-contiguous array, the item with offset 4
+    has the indices (1, 0). The item with the same indices (1, 0) in a
+    F-continuous array has an offset of 1.
 
     Args:
-        offset: The offset of the array.
+        offset: The offset in memory of an element of array.
         strides: The strides of the array.
 
     Returns:
-        The offset of the array of a different memory layout.
+        The offset of the array of a flipped memory layout.
     """
 
     var remainder = offset
@@ -478,8 +467,10 @@ fn apply_func_on_array_without_dim_reduction[
 
 
 # ===----------------------------------------------------------------------=== #
-# NDArray conversions
+# NDArray dtype conversions
 # ===----------------------------------------------------------------------=== #
+
+
 fn bool_to_numeric[
     dtype: DType
 ](array: NDArray[DType.bool]) raises -> NDArray[dtype]:
@@ -506,6 +497,9 @@ fn bool_to_numeric[
     return res
 
 
+# ===----------------------------------------------------------------------=== #
+# Numojo.NDArray to other collections
+# ===----------------------------------------------------------------------=== #
 fn to_numpy[dtype: DType](array: NDArray[dtype]) raises -> PythonObject:
     """
     Convert a NDArray to a numpy array.
@@ -598,6 +592,8 @@ fn to_tensor[dtype: DType](a: NDArray[dtype]) raises -> Tensor[dtype]:
 # ===----------------------------------------------------------------------=== #
 # Type checking functions
 # ===----------------------------------------------------------------------=== #
+
+
 @parameter
 fn is_inttype[dtype: DType]() -> Bool:
     """
@@ -715,6 +711,11 @@ fn is_booltype(dtype: DType) -> Bool:
     return False
 
 
+# ===----------------------------------------------------------------------=== #
+# Miscellaneous utility functions
+# ===----------------------------------------------------------------------=== #
+
+
 fn _list_of_range(n: Int) -> List[Int]:
     """
     Generate a list of integers starting from 0 and of size n.
diff --git a/numojo/routines/sorting.mojo b/numojo/routines/sorting.mojo
index 07a95cd9..cc58f7c0 100644
--- a/numojo/routines/sorting.mojo
+++ b/numojo/routines/sorting.mojo
@@ -1,4 +1,13 @@
 # ===----------------------------------------------------------------------=== #
+# Distributed under the Apache 2.0 License with LLVM Exceptions.
+# See LICENSE and the LLVM License for more information.
+# https://github.com/Mojo-Numerics-and-Algorithms-group/NuMojo/blob/main/LICENSE
+# https://llvm.org/LICENSE.txt
+# ===----------------------------------------------------------------------=== #
+"""
+Sorting.
+"""
+# ===----------------------------------------------------------------------=== #
 # Sorting
 # ===----------------------------------------------------------------------=== #
 

From 9274855b9955bd29bcd5fb3e7a6cefcfc60869f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ZHU=20Yuhao=20=E6=9C=B1=E5=AE=87=E6=B5=A9?=
 <dr.yuhao.zhu@outlook.com>
Date: Sat, 15 Feb 2025 13:36:06 +0100
Subject: [PATCH 11/11] Update sorting module structure

---
 numojo/routines/sorting.mojo | 515 ++++++++++++++++++-----------------
 1 file changed, 261 insertions(+), 254 deletions(-)

diff --git a/numojo/routines/sorting.mojo b/numojo/routines/sorting.mojo
index cc58f7c0..6024bae7 100644
--- a/numojo/routines/sorting.mojo
+++ b/numojo/routines/sorting.mojo
@@ -8,7 +8,14 @@
 Sorting.
 """
 # ===----------------------------------------------------------------------=== #
-# Sorting
+# SECTIONS OF THIS FILE:
+# 1. `sort` and `argsort` functions exposed to users.
+# 2. Backend multiple sorting methods that can be used in `sort`.
+#     - Binary sort.
+#     - Bubble sort.
+#     - Quick sort (instable).
+#
+# TODO: Add more sorting algorithms.
 # ===----------------------------------------------------------------------=== #
 
 
@@ -22,11 +29,232 @@ from numojo.core.matrix import Matrix
 import numojo.core.utility as utility
 from numojo.routines.manipulation import ravel, transpose
 
-"""
-TODO:
-1) Add more sorting algorithms.
-2) Add axis.
-"""
+
+# ===----------------------------------------------------------------------=== #
+# Sorting functions exposed to users
+# ===----------------------------------------------------------------------=== #
+
+
+fn sort[dtype: DType](a: NDArray[dtype]) raises -> NDArray[dtype]:
+    """
+    Sort NDArray using quick sort method.
+    It is not guaranteed to be unstable.
+
+    When no axis is given, the array is flattened before sorting.
+
+    Parameters:
+        dtype: The input element type.
+
+    Args:
+        a: NDArray.
+    """
+
+    if a.ndim == 1:
+        return quick_sort_1d(a)
+    else:
+        return quick_sort_1d(ravel(a))
+
+
+fn sort[
+    dtype: DType
+](owned a: NDArray[dtype], axis: Int) raises -> NDArray[dtype]:
+    """
+    Sort NDArray along the given axis using quick sort method.
+    It is not guaranteed to be unstable.
+
+    When no axis is given, the array is flattened before sorting.
+
+    Parameters:
+        dtype: The input element type.
+
+    Args:
+        a: NDArray to sort.
+        axis: The axis along which the array is sorted.
+
+    """
+
+    var normalized_axis = axis
+    if axis < 0:
+        normalized_axis += a.ndim
+    if (normalized_axis < 0) or (normalized_axis >= a.ndim):
+        raise Error(
+            String("Error in `mean`: Axis {} not in bound [-{}, {})").format(
+                axis, a.ndim, a.ndim
+            )
+        )
+
+    return utility.apply_func_on_array_without_dim_reduction[
+        func=quick_sort_1d
+    ](a, axis=normalized_axis)
+
+
+fn sort[dtype: DType](A: Matrix[dtype]) raises -> Matrix[dtype]:
+    """
+    Sort the Matrix. It is first flattened before sorting.
+    """
+    var I = Matrix.zeros[DType.index](shape=A.shape)
+    var B = A.flatten()
+    _sort_inplace(B, I, 0, A.size - 1)
+
+    return B^
+
+
+fn sort[
+    dtype: DType
+](owned A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
+    """
+    Sort the Matrix along the given axis.
+    """
+    if axis == 1:
+        var I = Matrix.zeros[DType.index](shape=A.shape)
+        for i in range(A.shape[0]):
+            _sort_inplace(
+                A, I, left=i * A.strides[0], right=(i + 1) * A.strides[0] - 1
+            )
+        return A^
+    elif axis == 0:
+        return transpose(sort(transpose(A), axis=1))
+    else:
+        raise Error(String("The axis can either be 1 or 0!"))
+
+
+fn argsort[
+    dtype: DType
+](owned A: NDArray[dtype]) raises -> NDArray[DType.index]:
+    """
+    Returns the indices that would sort an array.
+    It is not guaranteed to be unstable.
+
+    When no axis is given, the array is flattened before sorting.
+
+    Parameters:
+        dtype: The input element type.
+
+    Args:
+        A: NDArray.
+
+    Returns:
+        Indices that would sort an array.
+    """
+
+    A = ravel(A)
+    var I = NDArray[DType.index](A.shape)
+    _sort_inplace(A, I, axis=0)
+    return I^
+
+
+fn argsort[
+    dtype: DType
+](owned A: NDArray[dtype], owned axis: Int) raises -> NDArray[DType.index]:
+    """
+    Returns the indices that would sort an array.
+    It is not guaranteed to be unstable.
+
+    When no axis is given, the array is flattened before sorting.
+
+    Parameters:
+        dtype: The input element type.
+
+    Args:
+        A: NDArray to sort.
+        axis: The axis along which the array is sorted.
+
+    Returns:
+        Indices that would sort an array.
+
+    """
+
+    var I = NDArray[DType.index](A.shape)
+    _sort_inplace(A, I, axis)
+    return I^
+
+
+fn argsort[dtype: DType](A: Matrix[dtype]) raises -> Matrix[DType.index]:
+    """
+    Argsort the Matrix. It is first flattened before sorting.
+    """
+    var I = Matrix[DType.index](shape=(1, A.size))
+    for i in range(I.size):
+        I._buf.ptr[i] = i
+    var B = A.flatten()
+    _sort_inplace(B, I, 0, A.size - 1)
+
+    return I^
+
+
+fn argsort[
+    dtype: DType
+](owned A: Matrix[dtype], axis: Int) raises -> Matrix[DType.index]:
+    """
+    Argsort the Matrix along the given axis.
+    """
+    if axis == 1:
+        var I = Matrix[DType.index](shape=A.shape)
+        for i in range(I.shape[0]):
+            for j in range(I.shape[1]):
+                I._store(i, j, j)
+
+        for i in range(A.shape[0]):
+            _sort_inplace(
+                A, I, left=i * A.strides[0], right=(i + 1) * A.strides[0] - 1
+            )
+        return I^
+    elif axis == 0:
+        return transpose(argsort(transpose(A), axis=1))
+    else:
+        raise Error(String("The axis can either be 1 or 0!"))
+
+
+# ===----------------------------------------------------------------------=== #
+# Multiple sorting algorithms in the backend.
+# ===----------------------------------------------------------------------=== #
+
+
+###############
+# Binary sort #
+###############
+
+
+fn binary_sort[
+    dtype: DType = DType.float64
+](array: NDArray[dtype]) raises -> NDArray[dtype]:
+    """
+    Binary sorting of NDArray.
+
+    Example:
+        ```py
+        var arr = numojo.core.random.rand[numojo.i16](100)
+        var sorted_arr = numojo.core.sort.binary_sort(arr)
+        print(sorted_arr)
+        ```
+
+    Parameters:
+         dtype: The element type.
+
+    Args:
+        array: A NDArray.
+
+    Returns:
+        The sorted NDArray of type `dtype`.
+    """
+
+    @parameter
+    if dtype != array.dtype:
+        alias dtype = array.dtype
+
+    var result: NDArray[dtype] = NDArray[dtype](array.shape)
+    for i in range(array.size):
+        result.store(i, array.load(i).cast[dtype]())
+
+    var n = array.num_elements()
+    for end in range(n, 1, -1):
+        for i in range(1, end):
+            if result[i - 1] > result[i]:
+                var temp: Scalar[dtype] = result.load(i - 1)
+                result.store(i - 1, result.load(i))
+                result.store(i, temp)
+    return result
+
 
 ###############
 # Bubble sort #
@@ -76,6 +304,33 @@ fn bubble_sort[dtype: DType](ndarray: NDArray[dtype]) raises -> NDArray[dtype]:
 ##############
 
 
+fn quick_sort_1d[dtype: DType](a: NDArray[dtype]) raises -> NDArray[dtype]:
+    """
+    Sort 1-d array using quick sort method.
+    It is not guaranteed to be unstable.
+
+    Raises:
+        Error: If the input array is not 1-d.
+
+    Parameters:
+        dtype: The input element type.
+
+    Args:
+        a: An 1-d array.
+    """
+
+    if a.ndim != 1:
+        raise Error(
+            String(
+                "\nError in `sort_1d`: Input array must be 1-d, but got {}-d"
+            ).format(a.ndim)
+        )
+    res = a
+    var _I = NDArray[DType.index](a.shape)
+    _sort_inplace(res, _I, axis=0)
+    return res^
+
+
 fn _partition_in_range(
     mut A: NDArray,
     mut I: NDArray,
@@ -297,251 +552,3 @@ fn _sort_inplace[
         _sort_inplace(A, I, axis=-1)
         A = transpose(A, axes=transposed_axes)
         I = transpose(I, axes=transposed_axes)
-
-
-fn sort_1d[dtype: DType](a: NDArray[dtype]) raises -> NDArray[dtype]:
-    """
-    Sort 1-d array using quick sort method.
-    It is not guaranteed to be unstable.
-
-    Raises:
-        Error: If the input array is not 1-d.
-
-    Parameters:
-        dtype: The input element type.
-
-    Args:
-        a: An 1-d array.
-    """
-
-    if a.ndim != 1:
-        raise Error(
-            String(
-                "\nError in `sort_1d`: Input array must be 1-d, but got {}-d"
-            ).format(a.ndim)
-        )
-    res = a
-    var _I = NDArray[DType.index](a.shape)
-    _sort_inplace(res, _I, axis=0)
-    return res^
-
-
-fn sort[dtype: DType](a: NDArray[dtype]) raises -> NDArray[dtype]:
-    """
-    Sort NDArray using quick sort method.
-    It is not guaranteed to be unstable.
-
-    When no axis is given, the array is flattened before sorting.
-
-    Parameters:
-        dtype: The input element type.
-
-    Args:
-        a: NDArray.
-    """
-
-    if a.ndim == 1:
-        return sort_1d(a)
-    else:
-        return sort_1d(ravel(a))
-
-
-fn sort[
-    dtype: DType
-](owned a: NDArray[dtype], axis: Int) raises -> NDArray[dtype]:
-    """
-    Sort NDArray along the given axis using quick sort method.
-    It is not guaranteed to be unstable.
-
-    When no axis is given, the array is flattened before sorting.
-
-    Parameters:
-        dtype: The input element type.
-
-    Args:
-        a: NDArray to sort.
-        axis: The axis along which the array is sorted.
-
-    """
-
-    var normalized_axis = axis
-    if axis < 0:
-        normalized_axis += a.ndim
-    if (normalized_axis < 0) or (normalized_axis >= a.ndim):
-        raise Error(
-            String("Error in `mean`: Axis {} not in bound [-{}, {})").format(
-                axis, a.ndim, a.ndim
-            )
-        )
-
-    return utility.apply_func_on_array_without_dim_reduction[func=sort_1d](
-        a, axis=normalized_axis
-    )
-
-
-fn sort[dtype: DType](A: Matrix[dtype]) raises -> Matrix[dtype]:
-    """
-    Sort the Matrix. It is first flattened before sorting.
-    """
-    var I = Matrix.zeros[DType.index](shape=A.shape)
-    var B = A.flatten()
-    _sort_inplace(B, I, 0, A.size - 1)
-
-    return B^
-
-
-fn sort[
-    dtype: DType
-](owned A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
-    """
-    Sort the Matrix along the given axis.
-    """
-    if axis == 1:
-        var I = Matrix.zeros[DType.index](shape=A.shape)
-        for i in range(A.shape[0]):
-            _sort_inplace(
-                A, I, left=i * A.strides[0], right=(i + 1) * A.strides[0] - 1
-            )
-        return A^
-    elif axis == 0:
-        return transpose(sort(transpose(A), axis=1))
-    else:
-        raise Error(String("The axis can either be 1 or 0!"))
-
-
-###############
-# Binary sort #
-###############
-
-
-fn binary_sort[
-    dtype: DType = DType.float64
-](array: NDArray[dtype]) raises -> NDArray[dtype]:
-    """
-    Binary sorting of NDArray.
-
-    Example:
-        ```py
-        var arr = numojo.core.random.rand[numojo.i16](100)
-        var sorted_arr = numojo.core.sort.binary_sort(arr)
-        print(sorted_arr)
-        ```
-
-    Parameters:
-         dtype: The element type.
-
-    Args:
-        array: A NDArray.
-
-    Returns:
-        The sorted NDArray of type `dtype`.
-    """
-
-    @parameter
-    if dtype != array.dtype:
-        alias dtype = array.dtype
-
-    var result: NDArray[dtype] = NDArray[dtype](array.shape)
-    for i in range(array.size):
-        result.store(i, array.load(i).cast[dtype]())
-
-    var n = array.num_elements()
-    for end in range(n, 1, -1):
-        for i in range(1, end):
-            if result[i - 1] > result[i]:
-                var temp: Scalar[dtype] = result.load(i - 1)
-                result.store(i - 1, result.load(i))
-                result.store(i, temp)
-    return result
-
-
-# ===----------------------------------------------------------------------=== #
-# Searching
-# ===----------------------------------------------------------------------=== #
-
-
-fn argsort[
-    dtype: DType
-](owned A: NDArray[dtype]) raises -> NDArray[DType.index]:
-    """
-    Returns the indices that would sort an array.
-    It is not guaranteed to be unstable.
-
-    When no axis is given, the array is flattened before sorting.
-
-    Parameters:
-        dtype: The input element type.
-
-    Args:
-        A: NDArray.
-
-    Returns:
-        Indices that would sort an array.
-    """
-
-    A = ravel(A)
-    var I = NDArray[DType.index](A.shape)
-    _sort_inplace(A, I, axis=0)
-    return I^
-
-
-fn argsort[
-    dtype: DType
-](owned A: NDArray[dtype], owned axis: Int) raises -> NDArray[DType.index]:
-    """
-    Returns the indices that would sort an array.
-    It is not guaranteed to be unstable.
-
-    When no axis is given, the array is flattened before sorting.
-
-    Parameters:
-        dtype: The input element type.
-
-    Args:
-        A: NDArray to sort.
-        axis: The axis along which the array is sorted.
-
-    Returns:
-        Indices that would sort an array.
-
-    """
-
-    var I = NDArray[DType.index](A.shape)
-    _sort_inplace(A, I, axis)
-    return I^
-
-
-fn argsort[dtype: DType](A: Matrix[dtype]) raises -> Matrix[DType.index]:
-    """
-    Argsort the Matrix. It is first flattened before sorting.
-    """
-    var I = Matrix[DType.index](shape=(1, A.size))
-    for i in range(I.size):
-        I._buf.ptr[i] = i
-    var B = A.flatten()
-    _sort_inplace(B, I, 0, A.size - 1)
-
-    return I^
-
-
-fn argsort[
-    dtype: DType
-](owned A: Matrix[dtype], axis: Int) raises -> Matrix[DType.index]:
-    """
-    Argsort the Matrix along the given axis.
-    """
-    if axis == 1:
-        var I = Matrix[DType.index](shape=A.shape)
-        for i in range(I.shape[0]):
-            for j in range(I.shape[1]):
-                I._store(i, j, j)
-
-        for i in range(A.shape[0]):
-            _sort_inplace(
-                A, I, left=i * A.strides[0], right=(i + 1) * A.strides[0] - 1
-            )
-        return I^
-    elif axis == 0:
-        return transpose(argsort(transpose(A), axis=1))
-    else:
-        raise Error(String("The axis can either be 1 or 0!"))