diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index c17bb841..a22f44fb 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -2,12 +2,13 @@ name: Update docs
 on:
   push:
     branches:
-      - master
+      - main
     tags:
       - v*
+  workflow_dispatch:
 
 jobs:
   update-docs:
     name: Update docs
-    uses: unifyai/workflows/.github/workflows/docs.yml@master
+    uses: unifyai/workflows/.github/workflows/docs.yml@main
     secrets: inherit
diff --git a/.github/workflows/lint-bot.yml b/.github/workflows/lint-bot.yml
index 03e43b4a..5136a5c1 100644
--- a/.github/workflows/lint-bot.yml
+++ b/.github/workflows/lint-bot.yml
@@ -11,5 +11,5 @@ permissions:
 jobs:
   fix-linting:
     name: Fix Linting
-    uses: unifyai/workflows/.github/workflows/lint-bot.yml@master
+    uses: unifyai/workflows/.github/workflows/lint-bot.yml@main
     secrets: inherit
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 3e5c9ee2..ffb247e8 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -5,4 +5,4 @@ on: [push, pull_request]
 jobs:
   check-formatting:
     name: Check formatting
-    uses: unifyai/workflows/.github/workflows/lint.yml@master
\ No newline at end of file
+    uses: unifyai/workflows/.github/workflows/lint.yml@main
\ No newline at end of file
diff --git a/.github/workflows/test-new-pr.yml b/.github/workflows/test-new-pr.yml
index b9e53d70..1701bd83 100644
--- a/.github/workflows/test-new-pr.yml
+++ b/.github/workflows/test-new-pr.yml
@@ -16,9 +16,7 @@ jobs:
         uses: tj-actions/changed-files@v37
         with:
           files: |
-            "ivy_models_tests/"
-          files_ignore: |
-            "!*.py"
+            "ivy_models_tests/**/*.py"
 
       - name: Run tests if any files in ivy_models_tests changed
         if: steps.changed-files.outputs.any_changed == 'true'
diff --git a/README.rst b/README.rst
index 146588a4..af9a35e7 100644
--- a/README.rst
+++ b/README.rst
@@ -1,8 +1,8 @@
-.. image:: https://github.com/unifyai/unifyai.github.io/blob/master/img/externally_linked/logo.png?raw=true#gh-light-mode-only
+.. image:: https://github.com/unifyai/unifyai.github.io/blob/main/img/externally_linked/logo.png?raw=true#gh-light-mode-only
    :width: 100%
    :class: only-light

-.. image:: https://github.com/unifyai/unifyai.github.io/blob/master/img/externally_linked/logo_dark.png?raw=true#gh-dark-mode-only
+.. image:: https://github.com/unifyai/unifyai.github.io/blob/main/img/externally_linked/logo_dark.png?raw=true#gh-dark-mode-only
    :width: 100%
    :class: only-dark

@@ -78,23 +78,23 @@ The layers are sometimes kept in a separate file, usually named :code:`layers.py
 
 .. raw:: html
    [raw-HTML table rows elided in extraction: each -/+ line pair swaps a
    "master" URL for "main" in the logo and link tags]
@@ -109,26 +109,26 @@ neural memory, pre-trained models + implementations, and builder tools with trai
    [raw-HTML table rows elided in extraction: master -> main URL updates]
@@ -166,26 +166,26 @@ neural memory, pre-trained models + implementations, and builder tools with trai
    [raw-HTML table rows elided in extraction: master -> main URL updates]
diff --git a/ivy_models/base/model.py b/ivy_models/base/model.py
index 8cc17707..b6c89d14 100644
--- a/ivy_models/base/model.py
+++ b/ivy_models/base/model.py
@@ -170,4 +170,7 @@ def load_from_huggingface(
         spec = self.get_spec_class().from_json_file(config_path)
         os.remove(config_path)
 
-        return self(spec=spec, v=weights)
+        model = self(spec=spec)
+        model.v = weights
+
+        return model
diff --git a/ivy_models/dino/__init__.py b/ivy_models/dino/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/ivy_models/dino/dino.py b/ivy_models/dino/dino.py
new file mode 100644
index 00000000..e56ec8a5
--- /dev/null
+++ b/ivy_models/dino/dino.py
@@ -0,0 +1,134 @@
+from typing import Callable, List, Optional
+
+import ivy
+from ivy.stateful.initializers import GlorotUniform, Initializer, Zeros
+
+from ivy_models.base import BaseModel, BaseSpec
+from ivy_models.dino.layers import DINOBackbone, DINOHead
+from ivy_models.vit.layers import partial, ConvStemConfig
+
+
+class DINOConfig(BaseSpec):
+    def __init__(
+        self,
+        img_size: int,
+        patch_size: int,
+        num_layers: int,
+        num_heads: int,
+        hidden_dim: int,
+        mlp_dim: int,
+        in_dim: int = 0,
+        dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        num_classes: int = 1000,
+        representation_size: Optional[int] = None,
+        norm_layer: Callable[..., ivy.Module] = partial(ivy.LayerNorm, eps=1e-6),
+        conv_stem_configs: Optional[List[ConvStemConfig]] = None,
+        out_dim: int = 65536,
+        use_bn: bool = False,
+        norm_last_layer: bool = True,
+        nlayers: int = 1,
+        hidden_dim_: int = 2048,
+        bottleneck_dim: int = 256,
+        _weight_init: Initializer = GlorotUniform(),
+        _bias_init: Initializer = Zeros(),
+        with_bias: bool = True,
+        device=None,
+        dtype=None,
+    ):
+        super(DINOConfig, self).__init__()
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.mlp_dim = mlp_dim
+        self.in_dim = in_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.num_classes = num_classes
+        self.representation_size = representation_size
+        self.norm_layer = norm_layer
+        self.conv_stem_configs = conv_stem_configs
+        self.out_dim = out_dim
+        self.use_bn = use_bn
+        self.norm_last_layer = norm_last_layer
+        self.nlayers = nlayers
+        self.hidden_dim_ = hidden_dim_
+        self.bottleneck_dim = bottleneck_dim
+        self._weight_init = _weight_init
+        self._bias_init = _bias_init
+        self.with_bias = with_bias
+        self.device = device
+        self.dtype = dtype
+
+    def get(self, *attr_names):
+        new_dict = {}
+        for name in attr_names:
+            new_dict[name] = getattr(self, name)
+        return new_dict
+
+    def get_vit_attrs(self):
+        return self.get(
+            "img_size",
+            "patch_size",
+            "num_layers",
+            "num_heads",
+            "hidden_dim",
+            "mlp_dim",
+            "dropout",
+            "attention_dropout",
+            "num_classes",
+            "representation_size",
+            "norm_layer",
+            "conv_stem_configs",
+        )
+
+    def get_head_attrs(self):
+        return self.get(
+            "in_dim",
+            "out_dim",
+            "use_bn",
+            "norm_last_layer",
+            "nlayers",
+            "hidden_dim_",
+            "bottleneck_dim",
+            "_weight_init",
+            "_bias_init",
+            "with_bias",
+        )
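+
+# A minimal usage sketch (assuming the defaults above): the config splits its
+# attributes between the ViT backbone and the projection head, e.g.
+#
+#     cfg = DINOConfig(img_size=224, patch_size=16, num_layers=12,
+#                      num_heads=12, hidden_dim=768, mlp_dim=3072)
+#     backbone = DINOBackbone(**cfg.get_vit_attrs())
+#     head = DINOHead(**cfg.get_head_attrs())
+#
+# Note that `in_dim` defaults to 0 and is overwritten with `num_classes`
+# inside DINONet._build before the heads are constructed.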
+
+
+class DINONet(BaseModel):
+    def __init__(
+        self,
+        config: DINOConfig,
+        v: ivy.Container = None,
+    ) -> None:
+        self.config = config
+        super(DINONet, self).__init__(v=v)
+
+    @classmethod
+    def get_spec_class(cls):
+        return DINOConfig
+
+    def _build(self):
+        self.student = DINOBackbone(**self.config.get_vit_attrs())
+        self.teacher = DINOBackbone(**self.config.get_vit_attrs())
+        self.config.in_dim = self.config.num_classes
+        self.teacher_head = DINOHead(**self.config.get_head_attrs())
+        self.student_head = DINOHead(**self.config.get_head_attrs())
+
+    def _forward(self, x):
+        return {
+            "student_output": self.student_head(self.student(x)),
+            "teacher_output": self.teacher_head(self.teacher(x)),
+        }
+
+
+def dino_base(pretrained=False):
+    # instantiate the hyperparameters same as ViT-Base;
+    # set the dropout rate to 0.0 to avoid stochasticity in the output
+    config = DINOConfig(
+        img_size=224,
+        patch_size=16,
+        num_layers=12,
+        num_heads=12,
+        hidden_dim=768,
+        mlp_dim=3072,
+        out_dim=65536,
+    )
+    model = DINONet(config)
+    return model
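+
+# Usage sketch (no pretrained weights are wired up yet, so `pretrained` is
+# currently unused):
+#
+#     model = dino_base()
+#     out = model(img)  # {"student_output": ..., "teacher_output": ...}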
diff --git a/ivy_models/dino/dino_vit.py b/ivy_models/dino/dino_vit.py
new file mode 100644
index 00000000..99f3df3e
--- /dev/null
+++ b/ivy_models/dino/dino_vit.py
@@ -0,0 +1,283 @@
+import math
+
+import ivy
+from ivy.stateful.initializers import Zeros, GlorotUniform
+
+from ivy_models.dino.utils import trunc_normal_
+from ivy_models.vit.layers import partial
+
+
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    # broadcast over all trailing dims, so this works for tensors of any rank,
+    # not just 2D ConvNet inputs
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+    random_tensor = keep_prob + ivy.random_uniform(
+        low=0, high=1, shape=shape, dtype=x.dtype, device=x.device
+    )
+    random_tensor = ivy.floor(random_tensor)  # binarize
+    output = x / keep_prob * random_tensor
+    return output
+
+
+class DropPath(ivy.Module):
+    """Drop paths (stochastic depth) per sample, applied in the main path of
+    residual blocks."""
+
+    def __init__(self, drop_prob=None):
+        self.drop_prob = drop_prob
+        super(DropPath, self).__init__()
+
+    def _forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+
+
+class Mlp(ivy.Module):
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=ivy.GELU,
+        drop=0.0,
+    ):
+        self.in_features = in_features
+        self.out_features = out_features or in_features
+        self.hidden_features = hidden_features or in_features
+        self.act_layer = act_layer
+        self.drop_rate = drop
+        super(Mlp, self).__init__()
+
+    def _build(self, *args, **kwargs):
+        self.fc1 = ivy.Linear(self.in_features, self.hidden_features)
+        self.act = self.act_layer()
+        self.fc2 = ivy.Linear(self.hidden_features, self.out_features)
+        self.drop = ivy.Dropout(self.drop_rate)
+
+    def _forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class Attention(ivy.Module):
+    def __init__(
+        self, dim, num_heads=8, qkv_bias=False, qk_scale=None,
+        attn_drop=0.0, proj_drop=0.0,
+    ):
+        self.dim = dim
+        self.num_heads = num_heads
+        self.qkv_bias = qkv_bias
+        self.attn_drop_rate = attn_drop
+        self.proj_drop_rate = proj_drop
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        super(Attention, self).__init__()
+
+    def _build(self, *args, **kwargs):
+        self.qkv = ivy.Linear(self.dim, self.dim * 3, with_bias=self.qkv_bias)
+        self.attn_drop = ivy.Dropout(self.attn_drop_rate)
+        self.proj = ivy.Linear(self.dim, self.dim)
+        self.proj_drop = ivy.Dropout(self.proj_drop_rate)
+
+    def _forward(self, x):
+        B, N, C = x.shape
+        qkv = ivy.permute_dims(
+            ivy.reshape(self.qkv(x), (B, N, 3, self.num_heads, C // self.num_heads)),
+            (2, 0, 3, 1, 4),
+        )
+        q, k, v = qkv[0], qkv[1], qkv[2]
+
+        attn = ivy.matmul(q, ivy.swapaxes(k, -2, -1)) * self.scale
+        attn = ivy.softmax(attn, axis=-1)
+        attn = self.attn_drop(attn)
+
+        x = ivy.reshape(ivy.swapaxes(ivy.matmul(attn, v), 1, 2), (B, N, C))
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x, attn
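+
+# Shape walkthrough for the attention above (a sketch, assuming an input of
+# shape (B, N, C)): qkv(x) has shape (B, N, 3C); after the reshape/permute it
+# is (3, B, num_heads, N, C // num_heads), so q @ k^T gives attention maps of
+# shape (B, num_heads, N, N), and the weighted values are merged back to
+# (B, N, C) before the output projection.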
+
+
+class Block(ivy.Module):
+    def __init__(
+        self, dim, num_heads, mlp_ratio=4.0, qkv_bias=False, qk_scale=None,
+        drop=0.0, attn_drop=0.0, drop_path=0.0, act_layer=ivy.GELU,
+        norm_layer=ivy.LayerNorm,
+    ):
+        self.dim = dim
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.qk_scale = qk_scale
+        self.drop = drop
+        self.attn_drop = attn_drop
+        self.drop_path_rate = drop_path
+        self.act_layer = act_layer
+        self.norm_layer = norm_layer
+        super(Block, self).__init__()
+
+    def _build(self, *args, **kwargs):
+        self.norm1 = self.norm_layer(self.dim)
+        self.attn = Attention(
+            self.dim,
+            num_heads=self.num_heads,
+            qkv_bias=self.qkv_bias,
+            qk_scale=self.qk_scale,
+            attn_drop=self.attn_drop,
+            proj_drop=self.drop,
+        )
+        self.drop_path = (
+            DropPath(self.drop_path_rate)
+            if self.drop_path_rate > 0.0
+            else ivy.Identity()
+        )
+        self.norm2 = self.norm_layer(self.dim)
+        mlp_hidden_dim = int(self.dim * self.mlp_ratio)
+        self.mlp = Mlp(
+            in_features=self.dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=self.act_layer,
+            drop=self.drop,
+        )
+
+    def _forward(self, x, return_attention=False):
+        y, attn = self.attn(self.norm1(x))
+        if return_attention:
+            return attn
+        x = x + self.drop_path(y)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class PatchEmbed(ivy.Module):
+    """Image to patch embedding."""
+
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+        num_patches = (img_size // patch_size) * (img_size // patch_size)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.num_patches = num_patches
+        super(PatchEmbed, self).__init__()
+
+    def _build(self, *args, **kwargs):
+        self.proj = ivy.Conv2D(
+            self.in_chans,
+            self.embed_dim,
+            [self.patch_size, self.patch_size],
+            self.patch_size,
+            0,
+        )
+
+    def _forward(self, x):
+        B, C, H, W = x.shape
+        # ivy.Conv2D consumes NHWC inputs; flatten the patch grid into a
+        # token sequence of shape (B, num_patches, embed_dim)
+        x = self.proj(x)
+        x = ivy.reshape(x, (B, -1, self.embed_dim))
+        return x
+
+
+class VisionTransformer(ivy.Module):
+    """Vision Transformer."""
+
+    def __init__(
+        self, img_size=224, patch_size=16, in_chans=3, num_classes=0,
+        embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, qkv_bias=False,
+        qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0,
+        norm_layer=ivy.LayerNorm, device=None, dtype=None,
+        v: ivy.Container = None, **kwargs,
+    ) -> None:
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.in_chans = in_chans
+        self.num_classes = num_classes
+        self.embed_dim = embed_dim
+        self.depth = depth
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.qk_scale = qk_scale
+        self.drop_rate = drop_rate
+        self.attn_drop_rate = attn_drop_rate
+        self.drop_path_rate = drop_path_rate
+        self.norm_layer = norm_layer
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+        self.num_patches = self.patch_embed.num_patches
+        self.cls_token_shape = (1, 1, embed_dim)
+        self.cls_token = Zeros()
+        self.pos_embed_shape = (1, self.num_patches + 1, embed_dim)
+        self.pos_embed = Zeros()
+        self._weight_init = GlorotUniform()
+        self._bias_init = Zeros()
+        self.num_features = self.embed_dim = embed_dim
+        self._w_shape = (embed_dim,)
+        self._b_shape = (embed_dim,)
+        super(VisionTransformer, self).__init__(v=v, device=device, dtype=dtype)
+
+    def _build(self, *args, **kwargs):
+        self.pos_drop = ivy.Dropout(prob=self.drop_rate)
+        # stochastic depth decay rule
+        dpr = [x.item() for x in ivy.linspace(0, self.drop_path_rate, self.depth)]
+        self.blocks = [
+            Block(
+                dim=self.embed_dim,
+                num_heads=self.num_heads,
+                mlp_ratio=self.mlp_ratio,
+                qkv_bias=self.qkv_bias,
+                qk_scale=self.qk_scale,
+                drop=self.drop_rate,
+                attn_drop=self.attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=self.norm_layer,
+            )
+            for i in range(self.depth)
+        ]
+        self.norm = self.norm_layer(self.embed_dim)
+
+        # classifier head
+        self.head = (
+            ivy.Linear(self.embed_dim, self.num_classes)
+            if self.num_classes > 0
+            else ivy.Identity()
+        )
+
+    def _create_variables(self, *, device=None, dtype=None):
+        # TODO: also trunc-normal initialise pos_embed and class_token with
+        # std=.02 (see trunc_normal_ in ivy_models/dino/utils.py)
+        v = {
+            "class_token": self.cls_token.create_variables(
+                self.cls_token_shape, device, dtype=dtype
+            ),
+            "pos_embed": self.pos_embed.create_variables(
+                self.pos_embed_shape, device, dtype=dtype
+            ),
+        }
+        return v
+
+    def interpolate_pos_encoding(self, x, w, h):
+        npatch = x.shape[1] - 1
+        N = self.v.pos_embed.shape[1] - 1
+        if npatch == N and w == h:
+            return self.v.pos_embed
+        class_pos_embed = self.v.pos_embed[:, 0]
+        patch_pos_embed = self.v.pos_embed[:, 1:]
+        dim = x.shape[-1]
+        w0 = w // self.patch_embed.patch_size
+        h0 = h // self.patch_embed.patch_size
+        # we add a small number to avoid floating point error in the
+        # interpolation, see https://github.com/facebookresearch/dino/issues/8
+        w0, h0 = w0 + 0.1, h0 + 0.1
+        patch_pos_embed = ivy.interpolate(
+            ivy.permute_dims(
+                ivy.reshape(
+                    patch_pos_embed, (1, int(math.sqrt(N)), int(math.sqrt(N)), dim)
+                ),
+                (0, 3, 1, 2),
+            ),
+            scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
+            mode="bicubic",
+        )
+        assert (
+            int(w0) == patch_pos_embed.shape[-2]
+            and int(h0) == patch_pos_embed.shape[-1]
+        )
+        patch_pos_embed = ivy.reshape(
+            ivy.permute_dims(patch_pos_embed, (0, 2, 3, 1)), (1, -1, dim)
+        )
+        return ivy.concat(
+            (ivy.expand_dims(class_pos_embed, axis=0), patch_pos_embed), axis=1
+        )
+
+    def prepare_tokens(self, x):
+        B, nc, w, h = x.shape
+        x = self.patch_embed(x)  # patch linear embedding
+
+        # add the [CLS] token to the embedded patch tokens
+        cls_tokens = ivy.expand(self.v.class_token, (B, -1, -1))
+        x = ivy.concat((cls_tokens, x), axis=1)
+
+        # add positional encoding to each token
+        x = x + self.interpolate_pos_encoding(x, w, h)
+
+        return self.pos_drop(x)
+
+    def _forward(self, x):
+        x = self.prepare_tokens(x)
+        for blk in self.blocks:
+            x = blk(x)
+        x = self.norm(x)
+        return x[:, 0]
+
+    def get_last_selfattention(self, x):
+        x = self.prepare_tokens(x)
+        for i, blk in enumerate(self.blocks):
+            if i < len(self.blocks) - 1:
+                x = blk(x)
+            else:
+                # return attention of the last block
+                return blk(x, return_attention=True)
+
+    def get_intermediate_layers(self, x, n=1):
+        x = self.prepare_tokens(x)
+        # return the output tokens from the `n` last blocks
+        output = []
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if len(self.blocks) - i <= n:
+                output.append(self.norm(x))
+        return output
+
+
+def vit_tiny(patch_size=16, **kwargs):
+    model = VisionTransformer(
+        patch_size=patch_size, embed_dim=192, depth=12, num_heads=3,
+        mlp_ratio=4, qkv_bias=True,
+        norm_layer=partial(ivy.LayerNorm, eps=1e-6), **kwargs,
+    )
+    return model
+
+
+if __name__ == "__main__":
+    model = vit_tiny()
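+
+# Usage sketch (a rough guide; ivy.Conv2D consumes NHWC inputs, so the exact
+# input layout depends on how PatchEmbed is finally wired up):
+#
+#     model = vit_tiny(patch_size=16)
+#     x = ivy.random_uniform(shape=(1, 3, 224, 224))
+#     cls_emb = model(x)                       # per-image CLS embedding, (1, 192)
+#     attn = model.get_last_selfattention(x)   # attention maps of the last block
+#     feats = model.get_intermediate_layers(x, n=4)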
diff --git a/ivy_models/dino/layers.py b/ivy_models/dino/layers.py
new file mode 100644
index 00000000..af3f619c
--- /dev/null
+++ b/ivy_models/dino/layers.py
@@ -0,0 +1,237 @@
+from typing import Callable, List, Optional
+
+import ivy
+from ivy.stateful.initializers import Initializer, GlorotUniform, Zeros
+from PIL import Image
+from torchvision import transforms
+
+from ivy_models.dino.utils import trunc_normal_
+from ivy_models.vit.vit import VisionTransformer
+from ivy_models.vit.layers import partial, ConvStemConfig
+from ivy_models_tests.helpers import image_helpers
+
+
+class DINOBackbone(ivy.Module):
+    def __init__(
+        self,
+        img_size: int,
+        patch_size: int,
+        num_layers: int,
+        num_heads: int,
+        hidden_dim: int,
+        mlp_dim: int,
+        dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        num_classes: int = 1000,
+        representation_size: Optional[int] = None,
+        norm_layer: Callable[..., ivy.Module] = partial(ivy.LayerNorm, eps=1e-6),
+        conv_stem_configs: Optional[List[ConvStemConfig]] = None,
+        spec=None,
+        v: ivy.Container = None,
+    ):
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.mlp_dim = mlp_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.num_classes = num_classes
+        self.representation_size = representation_size
+        self.norm_layer = norm_layer
+        self.conv_stem_configs = conv_stem_configs
+        super(DINOBackbone, self).__init__(v=v)
+
+    def _build(self, *args, **kwargs):
+        self.backbone = VisionTransformer(
+            image_size=self.img_size,
+            patch_size=self.patch_size,
+            num_layers=self.num_layers,
+            num_heads=self.num_heads,
+            hidden_dim=self.hidden_dim,
+            mlp_dim=self.mlp_dim,
+        )
+
+    def _forward(self, x):
+        # TODO: restore the multi-crop forward pass (group the crops by
+        # resolution and concatenate the per-group outputs); for now a single
+        # crop tensor is passed straight through the backbone.
+        output = self.backbone(x)
+        return output
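+
+# Sketch of the intended multi-crop pass (assumes `x` is a list of crop
+# batches sorted by resolution, as produced by DataAugmentationDINO below):
+#
+#     sizes = ivy.array([inp.shape[-1] for inp in x])
+#     idx_crops = ivy.cumsum(ivy.unique_consecutive(sizes)[2], 0)
+#     start, outs = 0, []
+#     for end in [int(i) for i in idx_crops]:
+#         outs.append(self.backbone(ivy.concat(x[start:end], axis=0)))
+#         start = end
+#     output = ivy.concat(outs, axis=0)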
+
+
+class DINOHead(ivy.Module):
+    """DINO projection head."""
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        use_bn: bool = False,
+        norm_last_layer: bool = True,
+        nlayers: int = 3,
+        hidden_dim_: int = 2048,
+        bottleneck_dim: int = 256,
+        _weight_init: Initializer = GlorotUniform(),
+        _bias_init: Initializer = Zeros(),
+        with_bias: bool = True,
+        device=None,
+        dtype=None,
+        v: ivy.Container = None,
+    ) -> None:
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.use_bn = use_bn
+        self.norm_last_layer = norm_last_layer
+        self.nlayers = nlayers
+        self.hidden_dim_ = hidden_dim_
+        self.bottleneck_dim = bottleneck_dim
+        self._w_shape = (out_dim, in_dim)
+        self._b_shape = (out_dim,)
+        self._weight_init = _weight_init
+        self._b_init = _bias_init
+        self.with_bias = with_bias
+        super(DINOHead, self).__init__(v=v, device=device, dtype=dtype)
+
+    def _create_variables(self, device, dtype=None):
+        w = self._weight_init.create_variables(
+            self._w_shape, device, self.out_dim, self.in_dim, dtype
+        )
+        v = {
+            "w": trunc_normal_(w, std=0.02),
+        }
+        v = dict(
+            **v,
+            b=self._b_init.create_variables(
+                self._b_shape,
+                device,
+                self.out_dim,
+                self.in_dim,
+                dtype=dtype,
+            ),
+        )
+        return v
+
+    def _build(self, *args, **kwargs):
+        nlayers = max(self.nlayers, 1)
+        if nlayers == 1:
+            self.mlp = ivy.Linear(self.in_dim, self.bottleneck_dim)
+        else:
+            layers = [ivy.Linear(self.in_dim, self.hidden_dim_)]
+            # TODO: change back to BatchNorm1D when the ivy changes are merged
+            if self.use_bn:
+                layers.append(ivy.BatchNorm2D(self.hidden_dim_))
+            layers.append(ivy.GELU())
+            for _ in range(nlayers - 2):
+                layers.append(ivy.Linear(self.hidden_dim_, self.hidden_dim_))
+                if self.use_bn:
+                    layers.append(ivy.BatchNorm2D(self.hidden_dim_))
+                layers.append(ivy.GELU())
+            layers.append(ivy.Linear(self.hidden_dim_, self.bottleneck_dim))
+            self.mlp = ivy.Sequential(*layers)
+        # TODO: weight-normalise the last layer as in the reference DINO code
+        self.last_layer = ivy.Linear(self.bottleneck_dim, self.out_dim)
+        self.last_layer.v.w = ivy.full_like(self.last_layer.v.w, 1.0)
+        if self.norm_last_layer:
+            # freeze the last layer's weight scale (torch-style flag, kept as
+            # a placeholder until an ivy equivalent is wired up)
+            self.last_layer.v.w.requires_grad = False
+
+    def _forward(self, x):
+        x = self.mlp(x)
+        x = ivy.lp_normalize(x, p=2.0, axis=1)
+        x = self.last_layer(x)
+        return x
+
+
+class DataAugmentationDINO(object):
+    def __init__(self, global_crops_scale, local_crops_scale, local_crops_number):
+        flip_and_color_jitter = transforms.Compose([
+            transforms.RandomHorizontalFlip(p=0.5),
+            transforms.RandomApply(
+                [transforms.ColorJitter(
+                    brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1
+                )],
+                p=0.8,
+            ),
+            transforms.RandomGrayscale(p=0.2),
+        ])
+        normalize = transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        ])
+
+        # first global crop
+        self.global_crop_1 = transforms.Compose([
+            transforms.RandomResizedCrop(
+                224, scale=global_crops_scale, interpolation=Image.BICUBIC
+            ),
+            flip_and_color_jitter,
+            image_helpers.GaussianBlur(1.0),
+            normalize,
+        ])
+
+        # second global crop
+        self.global_crop_2 = transforms.Compose([
+            transforms.RandomResizedCrop(
+                224, scale=global_crops_scale, interpolation=Image.BICUBIC
+            ),
+            flip_and_color_jitter,
+            image_helpers.GaussianBlur(0.1),
+            image_helpers.Solarization(0.2),
+            normalize,
+        ])
+
+        # smaller, lower-resolution local crops
+        self.local_crops_number = local_crops_number
+        self.local_crop = transforms.Compose([
+            transforms.RandomResizedCrop(
+                96, scale=local_crops_scale, interpolation=Image.BICUBIC
+            ),
+            flip_and_color_jitter,
+            image_helpers.GaussianBlur(p=0.5),
+            normalize,
+        ])
+
+    def __call__(self, image):
+        crops = []
+        crops.append(self.global_crop_1(image))
+        crops.append(self.global_crop_2(image))
+        for _ in range(self.local_crops_number):
+            crops.append(self.local_crop(image))
+        return crops
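+
+# Usage sketch (PIL image in, list of tensors out): two global 224x224 crops
+# followed by `local_crops_number` 96x96 local crops, with example scale
+# ranges shown below:
+#
+#     aug = DataAugmentationDINO(
+#         global_crops_scale=(0.4, 1.0),
+#         local_crops_scale=(0.05, 0.4),
+#         local_crops_number=8,
+#     )
+#     crops = aug(Image.open("cat.jpg"))  # len(crops) == 10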
+
+
+class MultiCropWrapper(ivy.Module):
+    def __init__(self, backbone, head):
+        # discard any classification head carried by the backbone
+        backbone.fc, backbone.head = ivy.Identity(), ivy.Identity()
+        self.backbone = backbone
+        self.head = head
+        super(MultiCropWrapper, self).__init__()
+
+    def _forward(self, x):
+        if not isinstance(x, list):
+            x = [x]
+        # group consecutive crops that share a resolution
+        idx_crops = ivy.cumsum(
+            ivy.unique_consecutive(
+                ivy.array([inp.shape[-1] for inp in x])
+            )[2],
+            0,
+        )
+        start_idx = 0
+        output = ivy.empty((0,), device=x[0].device)
+        for end_idx in idx_crops:
+            _out = self.backbone(ivy.concat(x[start_idx:int(end_idx)], axis=0))
+            # the output is a tuple with the XCiT model, see:
+            # https://github.com/facebookresearch/xcit/blob/master/xcit.py#L404-L405
+            if isinstance(_out, tuple):
+                _out = _out[0]
+            # accumulate outputs
+            output = ivy.concat((output, _out), axis=0)
+            start_idx = int(end_idx)
+        # run the head forward on the concatenated features
+        return self.head(output)
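+
+# Usage sketch: wrap a backbone and head so a list of mixed-resolution crops
+# is processed in as few backbone passes as there are distinct resolutions:
+#
+#     wrapper = MultiCropWrapper(backbone, head)
+#     out = wrapper(crops)  # crops: [2 global + N local views]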
diff --git a/ivy_models/dino/utils.py b/ivy_models/dino/utils.py
new file mode 100644
index 00000000..18df63cb
--- /dev/null
+++ b/ivy_models/dino/utils.py
@@ -0,0 +1,32 @@
+import math
+import warnings
+
+import ivy
+
+
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+    def norm_cdf(x):
+        return (1.0 + ivy.erf(x / math.sqrt(2.0))) / 2.0
+
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+
+    tensor = ivy.stop_gradient(tensor)
+    l = norm_cdf((a - mean) / std)
+    u = norm_cdf((b - mean) / std)
+    ivy.random_uniform(low=2 * l - 1, high=2 * u - 1, shape=tensor.shape, out=tensor)
+    # TODO: apply ivy.erfinv here once it is available
+    tensor = ivy.multiply(tensor, std * math.sqrt(2.0))
+    tensor = ivy.add(tensor, mean)
+    tensor = ivy.clip(tensor, a, b)
+    return tensor
+
+
+# TODO: add this to ivy's public functions
+def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
+    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+
+
+if __name__ == "__main__":
+    x = ivy.randint(-100, 100, shape=(10, 5))
+    truncated_tensor = trunc_normal_(x, std=0.02)
+    assert truncated_tensor.shape == x.shape
diff --git a/ivy_models/mlpmixer/mlpmixer.py b/ivy_models/mlpmixer/mlpmixer.py
index 85ba5ed7..d7cb02af 100644
--- a/ivy_models/mlpmixer/mlpmixer.py
+++ b/ivy_models/mlpmixer/mlpmixer.py
@@ -122,7 +122,7 @@ def get_spec_class(self):
     def _forward(self, x, data_format=None):
         data_format = data_format if data_format else self.spec.data_format
         if data_format == "NCHW":
-            x = ivy.permute_dims(x, (0, 3, 1, 2))
+            x = ivy.permute_dims(x, (0, 2, 3, 1))
         x = self.conv(x)
         x = x.reshape(
             (int(x.shape[0]), int(x.shape[1]) * int(x.shape[2]), int(x.shape[3]))
diff --git a/ivy_models/unet/unet.py b/ivy_models/unet/unet.py
index 44234b31..06fb2ae7 100644
--- a/ivy_models/unet/unet.py
+++ b/ivy_models/unet/unet.py
@@ -42,6 +42,7 @@ def get_spec_class(self):
         return UNetSpec
 
     def _forward(self, x, data_format="NHWC"):
+        data_format = data_format if data_format else self.spec.data_format
         if data_format == "NCHW":
             x = ivy.permute_dims(x, (0, 2, 3, 1))
         x1 = self.inc(x)
diff --git a/ivy_models/vit/layers.py b/ivy_models/vit/layers.py
index d7885c3f..c9b60c7a 100644
--- a/ivy_models/vit/layers.py
+++ b/ivy_models/vit/layers.py
@@ -1,7 +1,6 @@
 from typing import Any, Callable, List, NamedTuple, Optional, Tuple, Union, Sequence
 import collections
 from itertools import repeat
-from collections import OrderedDict
 from functools import partial
 from ivy.stateful.initializers import Zeros
 import ivy
@@ -210,7 +209,7 @@ def __init__(
         self.num_heads = num_heads
         self.hidden_dim = hidden_dim
         self.mlp_dim = mlp_dim
-        self.dropout = dropout
+        self.dropout_p = dropout
         self.attention_dropout = attention_dropout
         self.norm_layer = norm_layer
 
@@ -224,19 +223,19 @@ def _build(self, *args, **kwargs) -> bool:
             num_heads=self.num_heads,
             dropout_rate=self.attention_dropout,
         )
-        self.dropout = ivy.Dropout(self.dropout)
+        self.dropout = ivy.Dropout(self.dropout_p)
 
         # MLP block
         self.ln_2 = self.norm_layer(self.hidden_dim)
-        self.mlp = VIT_MLPBlock(self.hidden_dim, self.mlp_dim, self.dropout)
+        self.mlp = VIT_MLPBlock(self.hidden_dim, self.mlp_dim, self.dropout_p)
 
     def _forward(self, input):
         ivy.utils.assertions.check_true(
-            input.dim() == 3,
+            input.get_num_dims() == 3,
             f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}",
         )
         x = self.ln_1(input)
-        x, _ = self.self_attention(x, x, x, need_weights=False)
+        x = self.self_attention(x, x, x)
         x = self.dropout(x)
         x = x + input
 
@@ -264,31 +263,33 @@ def __init__(
         self._pos_embedding_shape = (1, seq_length, hidden_dim)
         self.pos_embedding = Zeros()  # from BERT
         self.dropout = ivy.Dropout(dropout)
-        layers: OrderedDict[str, ivy.Module] = OrderedDict()
+        layers = []
         for i in range(num_layers):
-            layers[f"encoder_layer_{i}"] = VIT_EncoderBlock(
-                num_heads,
-                hidden_dim,
-                mlp_dim,
-                dropout,
-                attention_dropout,
-                norm_layer,
+            layers.append(
+                VIT_EncoderBlock(
+                    num_heads,
+                    hidden_dim,
+                    mlp_dim,
+                    dropout,
+                    attention_dropout,
+                    norm_layer,
+                )
             )
-        self.layers = ivy.Sequential(layers)
+        self.layers = ivy.Sequential(*layers)
         self.ln = norm_layer(hidden_dim)
         super().__init__()
 
     def _create_variables(self, device, dtype=None):
         return {
-            "pos_embeddin": self.pos_embedding.create_variables(
+            "pos_embedding": self.pos_embedding.create_variables(
                 self._pos_embedding_shape, device, dtype=dtype
             )
         }
 
     def _forward(self, input):
         ivy.utils.assertions.check_true(
-            input.dim() == 3,
+            input.get_num_dims() == 3,
             f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}",
         )
-        input = input + self.pos_embedding
+        input = input + self.v.pos_embedding
         return self.ln(self.layers(self.dropout(input)))
diff --git a/ivy_models/vit/vit.py b/ivy_models/vit/vit.py
index 4e7c488f..d94ed1c7 100644
--- a/ivy_models/vit/vit.py
+++ b/ivy_models/vit/vit.py
@@ -5,7 +5,6 @@
     ConvStemConfig,
     List,
     Optional,
-    OrderedDict,
     VIT_Encoder,
     Zeros,
     ivy,
@@ -28,6 +27,7 @@ def __init__(
         num_classes: int = 1000,
         representation_size: Optional[int] = None,
         norm_layer: Callable[..., ivy.Module] = partial(ivy.LayerNorm, eps=1e-6),
+        data_format: str = "NHWC",
         conv_stem_configs: Optional[List[ConvStemConfig]] = None,
     ):
         ivy.utils.assertions.check_true(
@@ -46,6 +46,7 @@ def __init__(
             num_classes=num_classes,
             representation_size=representation_size,
             norm_layer=norm_layer,
+            data_format=data_format,
             conv_stem_configs=conv_stem_configs,
         )
 
@@ -67,6 +68,7 @@ def __init__(
         representation_size: Optional[int] = None,
         norm_layer: Callable[..., ivy.Module] = partial(ivy.LayerNorm, eps=1e-6),
         conv_stem_configs: Optional[List[ConvStemConfig]] = None,
+        data_format: str = "NHWC",
         spec=None,
         v=None,
     ):
@@ -85,6 +87,7 @@ def __init__(
                 num_classes=num_classes,
                 representation_size=representation_size,
                 norm_layer=norm_layer,
+                data_format=data_format,
                 conv_stem_configs=conv_stem_configs,
             )
         )
@@ -93,22 +96,24 @@ def _build(self, *args, **kwargs):
         if self.spec.conv_stem_configs is not None:
             # As per https://arxiv.org/abs/2106.14881
-            seq_proj = OrderedDict()
+            seq_proj = []
             prev_channels = 3
             for i, conv_stem_layer_config in enumerate(self.spec.conv_stem_configs):
-                seq_proj[f"conv_bn_relu_{i}"] = Conv2dNormActivation(
-                    in_channels=prev_channels,
-                    out_channels=conv_stem_layer_config.out_channels,
-                    kernel_size=conv_stem_layer_config.kernel_size,
-                    stride=conv_stem_layer_config.stride,
-                    norm_layer=conv_stem_layer_config.norm_layer,
-                    activation_layer=conv_stem_layer_config.activation_layer,
+                seq_proj.append(
+                    Conv2dNormActivation(
+                        in_channels=prev_channels,
+                        out_channels=conv_stem_layer_config.out_channels,
+                        kernel_size=conv_stem_layer_config.kernel_size,
+                        stride=conv_stem_layer_config.stride,
+                        norm_layer=conv_stem_layer_config.norm_layer,
+                        activation_layer=conv_stem_layer_config.activation_layer,
+                    )
                 )
                 prev_channels = conv_stem_layer_config.out_channels
-            seq_proj["conv_last"] = ivy.Conv2D(
-                prev_channels, self.spec.hidden_dim, [1, 1], 1, 0
+            seq_proj.append(
+                ivy.Conv2D(prev_channels, self.spec.hidden_dim, [1, 1], 1, 0)
             )
-            self.conv_proj: ivy.Module = ivy.Sequential(seq_proj)
+            self.conv_proj: ivy.Module = ivy.Sequential(*seq_proj)
         else:
             self.conv_proj = ivy.Conv2D(
                 3,
@@ -137,21 +142,19 @@ def _build(self, *args, **kwargs):
         )
         self.seq_length = seq_length
 
-        heads_layers: OrderedDict[str, ivy.Module] = OrderedDict()
+        heads_layers = []
         if self.spec.representation_size is None:
-            heads_layers["head"] = ivy.Linear(
-                self.spec.hidden_dim, self.spec.num_classes
-            )
+            heads_layers.append(ivy.Linear(self.spec.hidden_dim, self.spec.num_classes))
         else:
-            heads_layers["pre_logits"] = ivy.Linear(
-                self.spec.hidden_dim, self.spec.representation_size
+            heads_layers.append(
+                ivy.Linear(self.spec.hidden_dim, self.spec.representation_size)
             )
-            heads_layers["act"] = ivy.tanh()
-            heads_layers["head"] = ivy.Linear(
-                self.spec.representation_size, self.spec.num_classes
+            heads_layers.append(ivy.tanh())
+            heads_layers.append(
+                ivy.Linear(self.spec.representation_size, self.spec.num_classes)
             )
 
-        self.heads = ivy.Sequential(heads_layers)
+        self.heads = ivy.Sequential(*heads_layers)
 
     def _create_variables(self, device, dtype=None):
         return {
@@ -161,7 +164,7 @@ def _create_variables(self, device, dtype=None):
         }
 
     def _process_input(self, x):
-        n, c, h, w = x.shape
+        n, h, w, c = x.shape
         p = self.spec.patch_size
         ivy.utils.assertions.check_true(
             h == self.spec.image_size,
@@ -174,16 +177,10 @@ def _process_input(self, x):
         n_h = h // p
         n_w = w // p
 
-        # (n, c, h, w) -> (n, self.hidden_dim, n_h, n_w)
+        # (n, h, w, c) -> (n, n_h, n_w, self.hidden_dim)
         x = self.conv_proj(x)
-        # (n, self.hidden_dim, n_h, n_w) -> (n, self.hidden_dim, (n_h * n_w))
-        x = x.reshape(n, self.spec.hidden_dim, n_h * n_w)
-
-        # (n, self.hidden_dim, (n_h * n_w)) -> (n, (n_h * n_w), self.hidden_dim)
-        # The self attention layer expects inputs in the format (N, S, E)
-        # where S is the source sequence length, N is the batch size, E is the
-        # embedding dimension
-        x = x.permute(0, 2, 1)
+        # (n, n_h, n_w, self.hidden_dim) -> (n, (n_h * n_w), self.hidden_dim)
+        x = x.reshape(shape=(n, n_h * n_w, self.spec.hidden_dim))
 
         return x
 
@@ -191,13 +188,17 @@ def _process_input(self, x):
     def get_spec_class(self):
         return VisionTransformerSpec
 
-    def _forward(self, x):
+    def _forward(self, x, data_format: str = "NHWC"):
+        data_format = data_format if data_format else self.spec.data_format
+        if data_format == "NCHW":
+            x = ivy.permute_dims(x, (0, 2, 3, 1))
         # Reshape and permute the input tensor
         x = self._process_input(x)
         n = x.shape[0]
 
         # Expand the class token to the full batch
-        batch_class_token = self.class_token.expand(n, -1, -1)
+        batch_class_token = ivy.expand(self.v.class_token, (n, -1, -1))
         x = ivy.concat([batch_class_token, x], axis=1)
 
         x = self.encoder(x)
@@ -227,6 +228,7 @@ def _vision_transformer(
     num_heads: int,
     hidden_dim: int,
     mlp_dim: int,
+    data_format: str = "NHWC",
     v=None,
 ) -> VisionTransformer:
     model = VisionTransformer(
@@ -236,15 +238,21 @@ def _vision_transformer(
         num_heads=num_heads,
         hidden_dim=hidden_dim,
         mlp_dim=mlp_dim,
+        data_format=data_format,
         v=v,
     )
     return model
 
 
-def vit_b_16(pretrained=True) -> VisionTransformer:
+def vit_b_16(data_format="NHWC", pretrained=True) -> VisionTransformer:
     model = _vision_transformer(
-        patch_size=16, num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072
+        patch_size=16,
+        num_layers=12,
+        num_heads=12,
+        hidden_dim=768,
+        mlp_dim=3072,
+        data_format=data_format,
     )
     if pretrained:
         url = "https://download.pytorch.org/models/vit_b_16-c867db91.pth"
@@ -258,9 +266,14 @@ def vit_b_16(pretrained=True) -> VisionTransformer:
     return model
 
 
-def vit_b_32(pretrained=True) -> VisionTransformer:
+def vit_b_32(data_format="NHWC", pretrained=True) -> VisionTransformer:
     ref_model = _vision_transformer(
-        patch_size=32, num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072
+        patch_size=32,
+        num_layers=12,
+        num_heads=12,
+        hidden_dim=768,
+        mlp_dim=3072,
+        data_format=data_format,
     )
     if pretrained:
         url = "https://download.pytorch.org/models/vit_b_32-d86f8d99.pth"
@@ -274,9 +287,14 @@ def vit_b_32(pretrained=True) -> VisionTransformer:
     return ref_model
 
 
-def vit_l_16(pretrained=True) -> VisionTransformer:
+def vit_l_16(data_format="NHWC", pretrained=True) -> VisionTransformer:
     ref_model = _vision_transformer(
-        patch_size=16, num_layers=24, num_heads=16, hidden_dim=1024, mlp_dim=4096
+        patch_size=16,
+        num_layers=24,
+        num_heads=16,
+        hidden_dim=1024,
+        mlp_dim=4096,
+        data_format=data_format,
    )
     if pretrained:
         url = "https://download.pytorch.org/models/vit_l_16-852ce7e3.pth"
@@ -290,9 +308,14 @@ def vit_l_16(pretrained=True) -> VisionTransformer:
     return ref_model
 
 
-def vit_l_32(pretrained=True) -> VisionTransformer:
+def vit_l_32(data_format="NHWC", pretrained=True) -> VisionTransformer:
     ref_model = _vision_transformer(
-        patch_size=32, num_layers=24, num_heads=16, hidden_dim=1024, mlp_dim=4096
+        patch_size=32,
+        num_layers=24,
+        num_heads=16,
+        hidden_dim=1024,
+        mlp_dim=4096,
+        data_format=data_format,
     )
     if pretrained:
         url = "https://download.pytorch.org/models/vit_l_32-c7638314.pth"
@@ -306,12 +329,17 @@ def vit_l_32(pretrained=True) -> VisionTransformer:
     return ref_model
 
 
-def vit_h_14(pretrained=True) -> VisionTransformer:
+def vit_h_14(data_format="NHWC", pretrained=True) -> VisionTransformer:
     ref_model = _vision_transformer(
-        patch_size=14, num_layers=12, num_heads=14, hidden_dim=768, mlp_dim=3072
+        patch_size=14,
+        num_layers=32,
+        num_heads=16,
+        hidden_dim=1280,
+        mlp_dim=5120,
+        data_format=data_format,
     )
     if pretrained:
-        url = "https://download.pytorch.org/models/vit_h_14_swag-80465313.pth"
+        url = "https://download.pytorch.org/models/vit_h_14_lc_swag-c1eb923e.pth"
         w_clean = load_torch_weights(
             url,
             ref_model,
diff --git a/ivy_models_tests/dino/test_dinonet.py b/ivy_models_tests/dino/test_dinonet.py
new file mode 100644
index 00000000..7de1fe41
--- /dev/null
+++ b/ivy_models_tests/dino/test_dinonet.py
@@ -0,0 +1,57 @@
+import os
+import sys
+import traceback
+
+import ivy
+
+from ivy_models_tests import helpers
+from ivy_models.dino.dino import dino_base
+
+
+# TODO: enable as a parametrised pytest once the forward pass is stable:
+#
+# @pytest.mark.parametrize("data_format", ["NHWC", "NCHW"])
+# def test_dino_classification(device, fw, data_format):
+#     """Test DINO image classification."""
+#     this_dir = os.path.dirname(os.path.realpath(__file__))
+#     img = helpers.load_and_preprocess_img(
+#         os.path.join(this_dir, "..", "..", "images", "cat.jpg"),
+#         256,
+#         224,
+#         data_format=data_format,
+#         to_ivy=True,
+#     )
+#     model = dino_base()
+#     ...
+
+
+def run_model():
+    this_dir = os.path.dirname(os.path.realpath(__file__))
+
+    # load image
+    img = helpers.load_and_preprocess_img(
+        os.path.join(this_dir, "..", "..", "images", "cat.jpg"),
+        256,
+        224,
+        data_format="NHWC",
+        to_ivy=True,
+    )
+
+    model = dino_base()
+
+    try:
+        model.v = ivy.asarray(model.v)
+        logits = model(img)
+        print("LOGITS")
+        print(logits)
+    except Exception:
+        print(traceback.format_exc())
+        print(sys.exc_info()[2])
+
+
+run_model()
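+
+# NOTE: this is a temporary smoke-test harness; the module runs the forward
+# pass on import. A hedged pytest version would assert on the output keys:
+#
+#     out = model(img)
+#     assert set(out.keys()) == {"student_output", "teacher_output"}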
diff --git a/ivy_models_tests/helpers/image_helpers.py b/ivy_models_tests/helpers/image_helpers.py
index 9cea853f..54a8b10c 100644
--- a/ivy_models_tests/helpers/image_helpers.py
+++ b/ivy_models_tests/helpers/image_helpers.py
@@ -1,6 +1,7 @@
 import ivy
 import numpy as np
-from PIL import Image
+import random
+from PIL import Image, ImageFilter, ImageOps
 from torchvision import transforms
 
 
@@ -50,3 +51,39 @@ def load_and_preprocess_img(
     if data_format == "NHWC":
         img = img.permute((0, 2, 3, 1))
     return ivy.array(img.numpy()) if to_ivy else img.numpy()
+
+
+class GaussianBlur(object):
+    """
+    Apply Gaussian blur to the PIL image.
+    """
+
+    def __init__(self, p=0.5, radius_min=0.1, radius_max=2.0):
+        self.prob = p
+        self.radius_min = radius_min
+        self.radius_max = radius_max
+
+    def __call__(self, img):
+        do_it = random.random() <= self.prob
+        if not do_it:
+            return img
+
+        return img.filter(
+            ImageFilter.GaussianBlur(
+                radius=random.uniform(self.radius_min, self.radius_max)
+            )
+        )
+
+
+class Solarization(object):
+    """
+    Apply solarization to the PIL image.
+    """
+
+    def __init__(self, p):
+        self.p = p
+
+    def __call__(self, img):
+        if random.random() < self.p:
+            return ImageOps.solarize(img)
+        else:
+            return img
diff --git a/ivy_models_tests/mlpmixer/test_mlpmixer.py b/ivy_models_tests/mlpmixer/test_mlpmixer.py
new file mode 100644
index 00000000..7826f508
--- /dev/null
+++ b/ivy_models_tests/mlpmixer/test_mlpmixer.py
@@ -0,0 +1,73 @@
+import os
+import ivy
+import pytest
+import numpy as np
+
+from ivy_models.mlpmixer import mlpmixer
+from ivy_models_tests import helpers
+
+import tensorflow as tf
+from tensorflow import keras
+from keras import layers
+import jax
+
+jax.config.update("jax_enable_x64", False)
+
+load_weights = True
+model = mlpmixer(pretrained=load_weights)
+v = ivy.to_numpy(model.v)
+
+
+@pytest.mark.parametrize("data_format", ["NHWC", "NCHW"])
+def test_mlpmixer_tiny_img_classification(device, fw, data_format):
+    """Test MLPMixer image classification."""
+    num_classes = 10
+    batch_shape = [1]
+    this_dir = os.path.dirname(os.path.realpath(__file__))
+
+    # Load image
+    img = helpers.load_image_in_np(
+        os.path.join(this_dir, "..", "..", "images", "car.jpg")
+    )
+
+    # Preprocess the image
+    def get_augmentation_layers():
+        data_augmentation = keras.Sequential(
+            [
+                layers.experimental.preprocessing.Normalization(
+                    mean=(0.5, 0.5, 0.5), variance=(0.25, 0.25, 0.25)
+                ),
+                layers.experimental.preprocessing.Resizing(72, 72),
+                layers.experimental.preprocessing.RandomFlip("horizontal"),
+                layers.experimental.preprocessing.RandomRotation(factor=0.02),
+                layers.experimental.preprocessing.RandomZoom(
+                    height_factor=0.2, width_factor=0.2
+                ),
+            ],
+            name="data_augmentation",
+        )
+        return data_augmentation
+
+    data_augmentation = get_augmentation_layers()
+    img = data_augmentation(img)
+    img = tf.expand_dims(img, 0).numpy()
+    img = ivy.asarray(img)
+    if data_format == "NCHW":
+        img = ivy.permute_dims(img, (0, 3, 1, 2))
+
+    model.v = ivy.asarray(v)
+    logits = model(img, data_format=data_format)
+
+    # Cardinality test
+    assert logits.shape == tuple([ivy.to_scalar(batch_shape), num_classes])
+
+    # Value test
+    if load_weights:
+        np_out = ivy.to_numpy(logits)
+        true_indices = np.array([4, 7, 2, 9])
+        calc_indices = np.argsort(np_out[0])[-4:][::-1]
+        assert np.array_equal(np.sort(true_indices), np.sort(calc_indices))
+
+        true_logits = np.array([0.4022081, 0.24405026, 0.14345096, 0.12923254])
+        calc_logits = np.take(np_out, calc_indices)
+        assert np.allclose(true_logits, calc_logits, rtol=1e-2, atol=1e-1)
diff --git a/ivy_models_tests/squeezenet/test_squeezenet.py b/ivy_models_tests/squeezenet/test_squeezenet.py
index af77641c..ddef8a47 100644
--- a/ivy_models_tests/squeezenet/test_squeezenet.py
+++ b/ivy_models_tests/squeezenet/test_squeezenet.py
@@ -12,10 +12,9 @@
     "squeezenet1_1": squeezenet1_1,
 }
 
-ivy.seed(seed_value=42)
 load_weights = random.choice([False, True])
 model_var = random.choice(list(VARIANTS.keys()))
-model = VARIANTS[model_var](pretrained=load_weights)
+model = VARIANTS[model_var](dropout=0, pretrained=load_weights)
 v = ivy.to_numpy(model.v)
diff --git a/ivy_models_tests/vit/test_vit.py b/ivy_models_tests/vit/test_vit.py
index 381385d3..4263e4a7 100644
--- a/ivy_models_tests/vit/test_vit.py
+++ b/ivy_models_tests/vit/test_vit.py
@@ -1,31 +1,60 @@
 import os
-import ivy
-import pytest
 import numpy as np
-
-from ivy_models import vit_b_16
+import pytest
+import random
+import ivy
 from ivy_models_tests import helpers
+from ivy_models.vit import (
+    vit_b_16,
+    vit_b_32,
+    vit_l_16,
+    vit_l_32,
+)
 
-import jax
 
-jax.config.update("jax_enable_x64", False)
+VARIANTS = {
+    "vit_b_16": vit_b_16,
+    "vit_b_32": vit_b_32,
+    "vit_l_16": vit_l_16,
+    "vit_l_32": vit_l_32,
+}
 
-@pytest.mark.parametrize("batch_shape", [[1]])
-@pytest.mark.parametrize("load_weights", [False, True])
-def test_alexnet_tiny_img_classification(device, fw, batch_shape, load_weights):
+LOGITS = {
+    "vit_b_16": [282, 281, 285, 287, 292],
+    "vit_b_32": [282, 281, 285, 287, 292],
+    "vit_l_16": [255, 281, 282, 285, 292],
+    "vit_l_32": [282, 281, 285, 287, 292],
+}
+
+
+load_weights = random.choice([False, True])
+model_var = random.choice(list(VARIANTS.keys()))
+model = VARIANTS[model_var](pretrained=load_weights)
+v = ivy.to_numpy(model.v)
+
+
+@pytest.mark.parametrize("data_format", ["NHWC", "NCHW"])
+def test_vit_img_classification(device, fw, data_format):
     """Test ViT image classification."""
     num_classes = 1000
+    batch_shape = [1]
     this_dir = os.path.dirname(os.path.realpath(__file__))
 
     # Load image
-    img = helpers.load_and_preprocess_img(
-        os.path.join(this_dir, "..", "..", "images", "cat.jpg"), 256, 224
+    img = ivy.asarray(
+        helpers.load_and_preprocess_img(
+            os.path.join(this_dir, "..", "..", "images", "cat.jpg"),
+            256,
+            224,
+            data_format=data_format,
+            to_ivy=True,
+        ),
     )
-    img = ivy.permute_dims(img, (0, 3, 1, 2))
 
-    model = vit_b_16(pretrained=load_weights)
-    logits = model(img)
+    # Create model
+    model.v = ivy.asarray(v)
+    logits = model(img, data_format=data_format)
 
     # Cardinality test
     assert logits.shape == tuple([ivy.to_scalar(batch_shape), num_classes])
@@ -33,10 +62,6 @@ def test_alexnet_tiny_img_classification(device, fw, batch_shape, load_weights):
     # Value test
     if load_weights:
         np_out = ivy.to_numpy(logits[0])
-        true_indices = np.array([282, 281, 285, 287, 896])
-        calc_indices = np.argsort(np_out)[-5:][::-1]
+        true_indices = np.sort(np.array(LOGITS[model_var]))
+        calc_indices = np.sort(np.argsort(np_out)[-5:][::-1])
         assert np.array_equal(true_indices, calc_indices)
-
-        true_logits = np.array([23.5786, 22.791977, 20.917543, 19.49762, 16.102253])
-        calc_logits = np.take(np_out, calc_indices)
-        assert np.allclose(true_logits, calc_logits, rtol=1e-3)
diff --git a/requirements.txt b/requirements.txt
index 309e21c3..c6780027 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-git+https://github.com/unifyai/ivy.git@master
+git+https://github.com/unifyai/ivy.git@main
diff --git a/setup.py b/setup.py
index fa690915..4276877a 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,7 @@ def _replace_logos_html(txt):
     backends_chunk = chunks[2]
     bc = backends_chunk.split("\n\n")
     img_str = (
-        ".. image:: https://github.com/unifyai/unifyai.github.io/blob/master/img/externally_linked/logos/supported/frameworks.png?raw=true\n"  # noqa
+        ".. image:: https://github.com/unifyai/unifyai.github.io/blob/main/img/externally_linked/logos/supported/frameworks.png?raw=true\n"  # noqa
        "   :width: 100%"
     )
     backends_chunk = "\n\n".join(bc[0:1] + [img_str] + bc[2:])
@@ -39,7 +39,7 @@ def _replace_logos_html(txt):
     libraries_chunk = chunks[3]
     lc = libraries_chunk.split("\n\n")
     img_str = (
-        ".. image:: https://github.com/unifyai/unifyai.github.io/blob/master/img/externally_linked/ivy_libraries.png?raw=true\n"  # noqa
+        ".. image:: https://github.com/unifyai/unifyai.github.io/blob/main/img/externally_linked/ivy_libraries.png?raw=true\n"  # noqa
        "   :width: 100%"
     )
     libraries_chunk = "\n\n".join(lc[0:1] + [img_str] + lc[2:])