diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index c17bb841..a22f44fb 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -2,12 +2,13 @@ name: Update docs
on:
push:
branches:
- - master
+ - main
tags:
- v*
+ workflow_dispatch:
jobs:
update-docs:
name: Update docs
- uses: unifyai/workflows/.github/workflows/docs.yml@master
+ uses: unifyai/workflows/.github/workflows/docs.yml@main
secrets: inherit
diff --git a/.github/workflows/lint-bot.yml b/.github/workflows/lint-bot.yml
index 03e43b4a..5136a5c1 100644
--- a/.github/workflows/lint-bot.yml
+++ b/.github/workflows/lint-bot.yml
@@ -11,5 +11,5 @@ permissions:
jobs:
fix-linting:
name: Fix Linting
- uses: unifyai/workflows/.github/workflows/lint-bot.yml@master
+ uses: unifyai/workflows/.github/workflows/lint-bot.yml@main
secrets: inherit
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 3e5c9ee2..ffb247e8 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -5,4 +5,4 @@ on: [push, pull_request]
jobs:
check-formatting:
name: Check formatting
- uses: unifyai/workflows/.github/workflows/lint.yml@master
\ No newline at end of file
+ uses: unifyai/workflows/.github/workflows/lint.yml@main
\ No newline at end of file
diff --git a/.github/workflows/test-new-pr.yml b/.github/workflows/test-new-pr.yml
index b9e53d70..1701bd83 100644
--- a/.github/workflows/test-new-pr.yml
+++ b/.github/workflows/test-new-pr.yml
@@ -16,9 +16,7 @@ jobs:
uses: tj-actions/changed-files@v37
with:
files: |
- "ivy_models_tests/"
- files_ignore: |
- "!*.py"
+ "ivy_models_tests/**/*.py"
- name: Run tests if any files in ivy_models_tests changed
if: steps.changed-files.outputs.any_changed == 'true'
diff --git a/README.rst b/README.rst
index 146588a4..af9a35e7 100644
--- a/README.rst
+++ b/README.rst
@@ -1,8 +1,8 @@
-.. image:: https://github.com/unifyai/unifyai.github.io/blob/master/img/externally_linked/logo.png?raw=true#gh-light-mode-only
+.. image:: https://github.com/unifyai/unifyai.github.io/blob/main/img/externally_linked/logo.png?raw=true#gh-light-mode-only
:width: 100%
:class: only-light
-.. image:: https://github.com/unifyai/unifyai.github.io/blob/master/img/externally_linked/logo_dark.png?raw=true#gh-dark-mode-only
+.. image:: https://github.com/unifyai/unifyai.github.io/blob/main/img/externally_linked/logo_dark.png?raw=true#gh-dark-mode-only
:width: 100%
:class: only-dark
@@ -78,23 +78,23 @@ The layers are sometimes kept in a separate file, usually named :code:`layers.py
.. raw:: html
@@ -166,26 +166,26 @@ neural memory, pre-trained models + implementations, and builder tools with trai
diff --git a/ivy_models/base/model.py b/ivy_models/base/model.py
index 8cc17707..b6c89d14 100644
--- a/ivy_models/base/model.py
+++ b/ivy_models/base/model.py
@@ -170,4 +170,7 @@ def load_from_huggingface(
spec = self.get_spec_class().from_json_file(config_path)
os.remove(config_path)
- return self(spec=spec, v=weights)
+ model = self(spec=spec)
+ model.v = weights
+
+ return model
diff --git a/ivy_models/dino/__init__.py b/ivy_models/dino/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/ivy_models/dino/dino.py b/ivy_models/dino/dino.py
new file mode 100644
index 00000000..e56ec8a5
--- /dev/null
+++ b/ivy_models/dino/dino.py
@@ -0,0 +1,134 @@
+from ivy_models.base import BaseModel, BaseSpec
+import ivy
+from ivy_models.vit.vit import VisionTransformer
+from ivy_models.dino.layers import MultiCropWrapper, DINOHead, DINOBackbone
+from ivy_models.vit.layers import partial, ConvStemConfig
+
+class DINOConfig(BaseSpec):
+ def __init__(self, img_size: int,
+ patch_size: int,
+ num_layers: int,
+ num_heads: int,
+ hidden_dim: int,
+ mlp_dim: int,
+ in_dim: int = 0,
+ dropout: float = 0.0,
+ attention_dropout: float = 0.0,
+ num_classes: int = 1000,
+ representation_size: ivy.Optional[int] = None,
+ norm_layer: ivy.Callable[..., ivy.Module] = partial(ivy.LayerNorm, eps=1e-6),
+ conv_stem_configs: ivy.Optional[ivy.List[ConvStemConfig]] = None,
+ out_dim: int = 65536,
+ use_bn: bool = False,
+ norm_last_layer: bool = True,
+ nlayers: int = 1,
+ hidden_dim_: int = 2048,
+ bottleneck_dim: int = 256,
+ _weight_init: ivy.Initializer = ivy.GlorotUniform(),
+ _bias_init: ivy.Initializer = ivy.Zeros(),
+ with_bias: bool = True,
+ device=None,
+ dtype=None
+ ):
+ super(DINOConfig, self).__init__()
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.num_layers = num_layers
+ self.num_heads = num_heads
+ self.hidden_dim = hidden_dim
+ self.mlp_dim = mlp_dim
+ self.in_dim = in_dim
+ self.dropout = dropout
+ self.attention_dropout = attention_dropout
+ self.num_classes = num_classes
+ self.representation_size = representation_size
+ self.norm_layer = norm_layer
+ self.conv_stem_configs = conv_stem_configs
+ self.out_dim = out_dim
+ self.use_bn = use_bn
+ self.norm_last_layer = norm_last_layer
+ self.nlayers = nlayers
+ self.hidden_dim_ = hidden_dim_
+ self.bottleneck_dim = bottleneck_dim
+ self._weight_init = _weight_init
+ self._bias_init = _bias_init
+ self.with_bias = with_bias
+ self.device = device
+ self.dtype = dtype
+
+ def get(self, *attr_names):
+ new_dict = {}
+ for name in attr_names:
+ new_dict[name] = getattr(self, name)
+ return new_dict
+
+ def get_vit_attrs(self):
+ return self.get(
+ "img_size",
+ "patch_size",
+ "num_layers",
+ "num_heads",
+ "hidden_dim",
+ "mlp_dim",
+ "dropout",
+ "attention_dropout",
+ "num_classes",
+ "representation_size",
+ "norm_layer",
+ "conv_stem_configs"
+ )
+
+ def get_head_attrs(self):
+ return self.get(
+ "in_dim",
+ "out_dim",
+ "use_bn",
+ "norm_last_layer",
+ "nlayers",
+ "hidden_dim_",
+ "bottleneck_dim",
+ "_weight_init",
+ "_bias_init",
+ "with_bias"
+ )
+
+class DINONet(BaseModel):
+
+ def __init__(
+ self,
+ config: DINOConfig,
+ v: ivy.Container = None,
+ ) -> None:
+ self.config = config
+ super(DINONet, self).__init__(v=v)
+
+ @classmethod
+ def get_spec_class(self):
+ return DINOConfig
+
+ def _build(self):
+ self.student = DINOBackbone(**self.config.get_vit_attrs())
+ self.teacher = DINOBackbone(**self.config.get_vit_attrs())
+ self.config.in_dim = self.config.num_classes
+ self.teacher_head = DINOHead(**self.config.get_head_attrs())
+ self.student_head = DINOHead(**self.config.get_head_attrs())
+
+ def _forward(self, x):
+ return {
+ "student_output": self.student_head(self.student(x)),
+ "teacher_output": self.teacher_head(self.teacher(x))
+ }
+
+
+def dino_base(pretrained=False):
+ # instantiate the hyperparameters same as bert
+ # set the dropout rate to 0.0 to avoid stochasticity in the output
+ config = DINOConfig(img_size = 224, patch_size=16,
+ num_layers=12,
+ num_heads=12,
+ hidden_dim=768,
+ mlp_dim=3072, out_dim = 65536,
+ )
+ model = DINONet(config)
+ return model
+
diff --git a/ivy_models/dino/dino_vit.py b/ivy_models/dino/dino_vit.py
new file mode 100644
index 00000000..99f3df3e
--- /dev/null
+++ b/ivy_models/dino/dino_vit.py
@@ -0,0 +1,283 @@
+import math
+import ivy
+from ivy_models.dino.utils import trunc_normal_
+from ivy.stateful.initializers import Zeros, GlorotUniform
+from ivy_models.vit.layers import partial
+
+
+def drop_path(x, drop_prob: float = 0., training: bool = False):
+ if drop_prob == 0. or not training:
+ return x
+ keep_prob = 1 - drop_prob
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = keep_prob + ivy.random_uniform(0,1, shape, dtype=x.dtype, device=x.device)
+ random_tensor.floor_() # binarize
+ output = x.div(keep_prob) * random_tensor
+ return output
+
+
+class DropPath(ivy.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ """
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
+
+
+class Mlp(ivy.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=ivy.GELU, drop=0.):
+ super(Mlp, self).__init__()
+ self.in_features = in_features
+ self.out_features = out_features or in_features
+ self.hidden_features = hidden_features or in_features
+ self.act_layer = act_layer
+ self.drop = drop
+
+ def _build(self, *args, **kwargs):
+ self.fc1 = ivy.Linear(self.in_features, self.hidden_features)
+ self.act = self.act_layer()
+ self.fc2 = ivy.Linear(self.hidden_features, self.out_features)
+ self.drop = ivy.Dropout(self.drop)
+
+ def _forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class Attention(ivy.Module):
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+ self.dim = dim
+ self.num_heads = num_heads
+ self.qkv_bias = qkv_bias
+ self.qk_scale = qk_scale
+ self.attn_drop = attn_drop
+ self.proj_drop = proj_drop
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim ** -0.5
+
+ def _build(self, *args, **kwargs):
+ self.qkv = ivy.Linear(self.dim, self.dim * 3, bias=self.qkv_bias)
+ self.attn_drop = ivy.Dropout(self.attn_drop)
+ self.proj = ivy.Linear(self.dim, self.dim)
+ self.proj_drop = ivy.Dropout(self.proj_drop)
+
+ def _forward(self, x):
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x, attn
+
+
+class Block(ivy.Module):
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=ivy.GELU, norm_layer=ivy.LayerNorm):
+ # Additional attributes
+ self.dim = dim
+ self.num_heads = num_heads
+ self.mlp_ratio = mlp_ratio
+ self.qkv_bias = qkv_bias
+ self.qk_scale = qk_scale
+ self.drop = drop
+ self.attn_drop = attn_drop
+ self.drop_path = drop_path
+ self.act_layer = act_layer
+ self.norm_layer = norm_layer
+ super(Block, self).__init__()
+
+ def _build(self, *args, **kwargs):
+ self.norm1 = self.norm_layer(self.dim)
+ self.attn = Attention(
+ self.dim, num_heads=self.num_heads, qkv_bias=self.qkv_bias, qk_scale=self.qk_scale, attn_drop=self.attn_drop, proj_drop=self.drop)
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else ivy.Identity()
+ self.norm2 = self.norm_layer(self.dim)
+ mlp_hidden_dim = int(self.dim * self.mlp_ratio)
+ self.mlp = Mlp(in_features=self.dim, hidden_features=mlp_hidden_dim, act_layer=self.act_layer, drop=self.drop)
+
+ def _forward(self, x, return_attention=False):
+ y, attn = self.attn(self.norm1(x))
+ if return_attention:
+ return attn
+ x = x + self.drop_path(y)
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+
+
+class PatchEmbed(ivy.Module):
+ """ Image to Patch Embedding
+ """
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+ num_patches = (img_size // patch_size) * (img_size // patch_size)
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.num_patches = num_patches
+
+ super(PatchEmbed).__init__()
+
+ def _build(self, *args, **kwargs):
+ self.proj = ivy.Conv2D(self.in_chans, self.embed_dim, [self.patch_size, self.patch_size], self.patch_size, 0)
+
+ def _forward(self, x):
+ B, C, H, W = x.shape
+ x = self.proj(x).flatten(2).transpose(1, 2)
+ return x
+
+
+class VisionTransformer(ivy.Module):
+ """ Vision Transformer """
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12,
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+ drop_path_rate=0., norm_layer=ivy.LayerNorm, device=None, dtype=None, v: ivy.Container = None, **kwargs) -> None:
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.in_chans = in_chans
+ self.num_classes = num_classes
+ self.embed_dim = embed_dim
+ self.depth = depth
+ self.num_heads = num_heads
+ self.mlp_ratio = mlp_ratio
+ self.qkv_bias = qkv_bias
+ self.qk_scale = qk_scale
+ self.drop_rate = drop_rate
+ self.attn_drop_rate = attn_drop_rate
+ self.drop_path_rate = drop_path_rate
+ self.norm_layer = norm_layer
+ self.patch_embed = PatchEmbed(
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+ self.num_patches = self.patch_embed.num_patches
+ self.cls_token_shape = (1, 1, embed_dim)
+ self.cls_token = Zeros()
+ self.pos_embed_shape = (1, self.num_patches + 1, embed_dim)
+ self.pos_embed = Zeros()
+ self._weight_init = GlorotUniform()
+ self._bias_init = Zeros()
+ self.num_features = self.embed_dim = embed_dim
+ self._w_shape = (embed_dim,)
+ self._b_shape = (embed_dim,)
+ super(VisionTransformer, self).__init__(v=v, device=device, dtype=dtype)
+
+ def _build(self, *args, **kwargs):
+ self.pos_drop = ivy.Dropout(prob=self.drop_rate)
+ dpr = [x.item() for x in ivy.linspace(0, self.drop_path_rate, self.depth)] # stochastic depth decay rule
+ self.blocks = [
+ Block(
+ dim=self.embed_dim, num_heads=self.num_heads, mlp_ratio=self.mlp_ratio, qkv_bias=self.qkv_bias, qk_scale=self.qk_scale,
+ drop=self.drop_rate, attn_drop=self.attn_drop_rate, drop_path=dpr[i], norm_layer=self.norm_layer)
+ for i in range(self.depth)]
+ self.norm = self.norm_layer(self.embed_dim)
+
+ # Classifier head
+ self.head = ivy.Linear(self.embed_dim, self.num_classes) if self.num_classes > 0 else ivy.Identity()
+
+ # trunc_normal_(self.v.pos_embed, std=.02)
+ # trunc_normal_(self.v.cls_token, std=.02)
+
+
+ def _create_variables(self, *, device=None, dtype=None):
+ # w = self._weight_init.create_variables(
+ # self._w_shape, device, dtype
+ # )
+ # v = {
+ # "w": trunc_normal_(w, std=.02),
+ # }
+ # v = dict(
+ # **v,
+ # b=self._b_init.create_variables(
+ # self._b_shape,
+ # device,
+ # dtype=dtype,
+ # ),
+ # )
+ v = {}
+ v = dict(**v,
+ class_token= self.cls_token.create_variables(
+ self.cls_token_shape, device, dtype=dtype
+ ))
+ v = dict(**v, pos_embed= self.pos_embed.create_variables(self.pos_embed_shape, device, dtype=dtype))
+ return v
+
+ def interpolate_pos_encoding(self, x, w, h):
+ npatch = x.shape[1] - 1
+ N = self.v.pos_embed.shape[1] - 1
+ if npatch == N and w == h:
+ return self.v.pos_embed
+ class_pos_embed = self.v.pos_embed[:, 0]
+ patch_pos_embed = self.v.pos_embed[:, 1:]
+ dim = x.shape[-1]
+ w0 = w // self.patch_embed.patch_size
+ h0 = h // self.patch_embed.patch_size
+ # we add a small number to avoid floating point error in the interpolation
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
+ w0, h0 = w0 + 0.1, h0 + 0.1
+ patch_pos_embed = ivy.interpolate(
+ patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
+ scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
+ mode='bicubic',
+ )
+ assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ return ivy.concat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
+
+ def prepare_tokens(self, x):
+ B, nc, w, h = x.shape
+ x = self.patch_embed(x) # patch linear embedding
+
+ # add the [CLS] token to the embed patch tokens
+ cls_tokens = ivy.expand(self.v.cls_token, (B,-1,-1))
+ x = ivy.concat((cls_tokens, x), dim=1)
+
+ # add positional encoding to each token
+ x = x + self.interpolate_pos_encoding(x, w, h)
+
+ return self.pos_drop(x)
+
+ def _forward(self, x):
+ x = self.prepare_tokens(x)
+ for blk in self.blocks:
+ x = blk(x)
+ x = self.norm(x)
+ return x[:, 0]
+
+ def get_last_selfattention(self, x):
+ x = self.prepare_tokens(x)
+ for i, blk in enumerate(self.blocks):
+ if i < len(self.blocks) - 1:
+ x = blk(x)
+ else:
+ # return attention of the last block
+ return blk(x, return_attention=True)
+
+ def get_intermediate_layers(self, x, n=1):
+ x = self.prepare_tokens(x)
+ # we return the output tokens from the `n` last blocks
+ output = []
+ for i, blk in enumerate(self.blocks):
+ x = blk(x)
+ if len(self.blocks) - i <= n:
+ output.append(self.norm(x))
+ return output
+
+
+def vit_tiny(patch_size=16, **kwargs):
+ model = VisionTransformer(
+ patch_size=patch_size, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4,
+ qkv_bias=True, norm_layer=partial(ivy.LayerNorm, eps=1e-6), **kwargs)
+ return model
+
+
+if __name__ == "__main__":
+ model = vit_tiny()
diff --git a/ivy_models/dino/layers.py b/ivy_models/dino/layers.py
new file mode 100644
index 00000000..af3f619c
--- /dev/null
+++ b/ivy_models/dino/layers.py
@@ -0,0 +1,237 @@
+import ivy
+import ivy_models
+from ivy_models.base import BaseModel, BaseSpec
+from ivy_models.dino.utils import trunc_normal_
+from torchvision import transforms
+from ivy_models.vit.vit import VisionTransformer
+from ivy.stateful.initializers import Initializer, GlorotUniform, Zeros
+from ivy_models.vit.layers import partial, ConvStemConfig
+from ivy_models_tests.helpers import image_helpers
+from PIL import Image
+
+class DINOBackbone(ivy.Module):
+
+ def __init__(
+ self,
+ img_size: int,
+ patch_size: int,
+ num_layers: int,
+ num_heads: int,
+ hidden_dim: int,
+ mlp_dim: int,
+ dropout: float = 0.0,
+ attention_dropout: float = 0.0,
+ num_classes: int = 1000,
+ representation_size: ivy.Optional[int] = None,
+ norm_layer: ivy.Callable[..., ivy.Module] = partial(ivy.LayerNorm, eps=1e-6),
+ conv_stem_configs: ivy.Optional[ivy.List[ConvStemConfig]] = None,
+ spec=None,
+ v: ivy.Container = None,
+ ):
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.num_layers = num_layers
+ self.num_heads = num_heads
+ self.hidden_dim = hidden_dim
+ self.mlp_dim = mlp_dim
+ self.dropout = dropout
+ self.attention_dropout = attention_dropout
+ self.num_classes = num_classes
+ self.representation_size = representation_size
+ self.norm_layer = norm_layer
+ self.conv_stem_configs = conv_stem_configs
+ super(DINOBackbone, self).__init__(v=v)
+
+ def _build(self, *args, **kwargs):
+ self.backbone = VisionTransformer(image_size=self.img_size, patch_size=self.patch_size,
+ num_layers=self.num_layers,
+ num_heads=self.num_heads, hidden_dim=self.hidden_dim, mlp_dim=self.mlp_dim)
+
+ def _forward(self, x):
+ # if not isinstance(x, list):
+ # x = [x]
+ idx_crops = ivy.cumsum(ivy.unique_consecutive(
+ ivy.array([inp.shape[-1] for inp in x]),
+ )[2], 0)
+ start_idx, output = 0, ivy.empty(0, device=x[0].device)
+ # for end_idx in idx_crops.tolist():
+ _out = self.backbone(x)
+ # if isinstance(_out, tuple):
+ # _out = _out[0]
+ # # accumulate outputs
+ # output = ivy.concat((output, _out))
+ # start_idx = end_idx
+ # CHANGE BACK FOR LOOP HERE
+ output = _out
+ return output
+
+class DINOHead(ivy.Module):
+ """DINO architecture"""
+ def __init__(
+ self,
+ in_dim: int,
+ out_dim: int,
+ use_bn: bool = False,
+ norm_last_layer : bool = True,
+ nlayers: int = 3,
+ hidden_dim_: int = 2048,
+ bottleneck_dim: int = 256,
+ _weight_init: Initializer = GlorotUniform(),
+ _bias_init: Initializer = Zeros(),
+ with_bias: bool = True,
+ device=None,
+ dtype=None,
+ v: ivy.Container = None,
+ ) -> None:
+ self.in_dim = in_dim
+ self.out_dim = out_dim
+ self.use_bn = use_bn
+ self.norm_last_layer = norm_last_layer
+ self.nlayers = nlayers
+ self.hidden_dim_ = hidden_dim_
+ self.bottleneck_dim = bottleneck_dim
+ self._w_shape = (out_dim, in_dim)
+ self._b_shape = (out_dim,)
+ self._weight_init = _weight_init
+ self._b_init = _bias_init
+ self.with_bias = with_bias
+ super(DINOHead, self).__init__(v=v, device=device, dtype=dtype)
+
+ def _create_variables(self, device, dtype=None):
+ w = self._weight_init.create_variables(
+ self._w_shape, device,self.out_dim,
+ self.in_dim, dtype
+ )
+ v = {
+ "w": trunc_normal_(w, std=.02),
+ }
+ v = dict(
+ **v,
+ b=self._b_init.create_variables(
+ self._b_shape,
+ device,
+ self.out_dim,
+ self.in_dim,
+ dtype=dtype,
+ ),
+ )
+ return v
+
+ def _build(self, *args, **kwargs):
+ nlayers = max(self.nlayers, 1)
+ if nlayers == 1:
+ self.mlp = ivy.Linear(self.in_dim, self.bottleneck_dim)
+ else:
+ layers = [ivy.Linear(self.in_dim, self.bottleneck_dim)]
+ # TODO: change back to batchnorm1d when changes are merged
+ if self.use_bn:
+ layers.append(ivy.BatchNorm2D(self.hidden_dim_))
+ layers.append(ivy.GELU())
+ for _ in range(nlayers-2):
+ layers.append(ivy.Linear(self.hidden_dim_, self.hidden_dim_))
+ if self.use_bn:
+ layers.append(ivy.BatchNorm2D(self.hidden_dim_))
+ layers.append(ivy.GELU())
+ layers.append(ivy.Linear(self.hidden_dim_, self.bottleneck_dim))
+ self.mlp = ivy.Sequential(*layers)
+ # TODO: weight normalization
+ self.last_layer = ivy.Linear(self.bottleneck_dim, self.out_dim)
+ self.last_layer.v.w = ivy.full_like(self.last_layer.v.w, 1.0)
+ if self.norm_last_layer:
+ self.last_layer.v.w.requires_grad = False
+
+ # def _init_weights(self, module):
+ # # if isinstance(module, ivy.Linear):
+ # trunc_normal_(module.weight, std=.02)
+ # module.w.data.normal_(mean=0.0, std=.02)
+ # if module.b is not None:
+ # module.b.data.zero_()
+ # return module
+
+ def _forward(self, x):
+ print(x.shape)
+ x = self.mlp(x)
+ x = ivy.functional.lp_normalize(x, p = 2., axis = 1)
+ x = self.last_layer(x)
+ return x
+
+
+class DataAugmentationDINO(object):
+ def __init__(self, global_crops_scale, local_crops_scale, local_crops_number):
+ flip_and_color_jitter = transforms.Compose([
+ transforms.RandomHorizontalFlip(p=0.5),
+ transforms.RandomApply(
+ [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)],
+ p=0.8
+ ),
+ transforms.RandomGrayscale(p=0.2),
+ ])
+ normalize = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+ ])
+
+ self.global_crop_1 = transforms.Compose([
+ transforms.RandomResizedCrop(224, scale=global_crops_scale, interpolation=Image.BICUBIC),
+ flip_and_color_jitter,
+ image_helpers.GaussianBlur(1.0),
+ normalize,
+ ])
+
+ self.global_crop_2 = transforms.Compose([
+ transforms.RandomResizedCrop(224, scale = global_crops_scale, interpolation=Image.BICUBIC),
+ flip_and_color_jitter,
+ image_helpers.GaussianBlur(0.1),
+ image_helpers.Solarization(0.2),
+ normalize,
+ ])
+
+ self.local_coprs_number = local_crops_number
+ self.local_crop = transforms.Compose([
+ transforms.RandomResizedCrop(96, scale=local_crops_scale, interpolation=Image.BICUBIC),
+ flip_and_color_jitter,
+ image_helpers.GaussianBlur(p=0.5),
+ normalize,
+ ])
+
+ def __call__(self, image):
+ crops = []
+ crops.append(self.global_crop_1(image))
+ crops.append(self.global_crop_2(image))
+ for _ in range(self.local_crops_number):
+ crops.append(self.local_crop(image))
+ return crops
+
+
+
+
+
+class MultiCropWrapper(ivy.Module):
+
+ def __init__(self, backbone, head):
+ super(MultiCropWrapper, self).__init__()
+ backbone.fc, backbone.head = ivy.Identity, ivy.Identity
+ self.backbone = backbone
+ self.head = head
+
+
+ def _forward(self, x):
+ if not isinstance(x, list):
+ x = [x]
+ idx_crops = ivy.cumsum(ivy.unique_consecutive(
+ ivy.array([inp.shape[-1] for inp in x]),
+ return_counts=True,
+ )[1], 0)
+
+ start_idx, output = 0, ivy.empty(0).to(x[0].device)
+ for end_idx in idx_crops:
+ _out = self.backbone(ivy.cat(x[start_idx: end_idx]))
+ # The output is a tuple with XCiT model. See:
+ # https://github.com/facebookresearch/xcit/blob/master/xcit.py#L404-L405
+ if isinstance(_out, tuple):
+ _out = _out[0]
+ # accumulate outputs
+ output = ivy.cat((output, _out))
+ start_idx = end_idx
+ # Run the head forward on the concatenated features.
+ return self.head(output)
diff --git a/ivy_models/dino/utils.py b/ivy_models/dino/utils.py
new file mode 100644
index 00000000..18df63cb
--- /dev/null
+++ b/ivy_models/dino/utils.py
@@ -0,0 +1,32 @@
+import ivy
+
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+
+ def norm_cdf(x):
+ return (1. + ivy.erf(x/ivy.sqrt(2.))) / 2.
+
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
+ ivy.warn("mean is more than 2 std from [a, b] in trunc_normal_. "
+ "The distribution of values may be incorrect.",
+ stacklevel=2)
+
+ ivy.stop_gradient(tensor)
+ l = norm_cdf((a - mean) / std)
+ u = norm_cdf((b - mean) / std)
+ ivy.random_uniform(low = 2 * l - 1, high = 2 * u - 1, out = tensor)
+ # TODO: ivy.erfinv
+ tensor = ivy.multiply(tensor, std * ivy.sqrt(2.))
+ tensor = ivy.add(tensor, mean)
+ tensor = ivy.clip(tensor, a, b)
+ return tensor
+
+# TODO: Add this to ivy functions
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+
+
+if __name__ == "__main__":
+ x = ivy.array([1., 2., 3., 5.])
+ x = ivy.randint(-100, 100, shape = (10,5))
+ truncated_tensor = trunc_normal_(x, std = .02)
+ assert truncated_tensor.shape == x.shape
diff --git a/ivy_models/mlpmixer/mlpmixer.py b/ivy_models/mlpmixer/mlpmixer.py
index 85ba5ed7..d7cb02af 100644
--- a/ivy_models/mlpmixer/mlpmixer.py
+++ b/ivy_models/mlpmixer/mlpmixer.py
@@ -122,7 +122,7 @@ def get_spec_class(self):
def _forward(self, x, data_format=None):
data_format = data_format if data_format else self.spec.data_format
if data_format == "NCHW":
- x = ivy.permute_dims(x, (0, 3, 1, 2))
+ x = ivy.permute_dims(x, (0, 2, 3, 1))
x = self.conv(x)
x = x.reshape(
(int(x.shape[0]), int(x.shape[1]) * int(x.shape[2]), int(x.shape[3]))
diff --git a/ivy_models/unet/unet.py b/ivy_models/unet/unet.py
index 44234b31..06fb2ae7 100644
--- a/ivy_models/unet/unet.py
+++ b/ivy_models/unet/unet.py
@@ -42,6 +42,7 @@ def get_spec_class(self):
return UNetSpec
def _forward(self, x, data_format="NHWC"):
+ data_format = data_format if data_format else self.spec.data_format
if data_format == "NCHW":
x = ivy.permute_dims(x, (0, 2, 3, 1))
x1 = self.inc(x)
diff --git a/ivy_models/vit/layers.py b/ivy_models/vit/layers.py
index d7885c3f..c9b60c7a 100644
--- a/ivy_models/vit/layers.py
+++ b/ivy_models/vit/layers.py
@@ -1,7 +1,6 @@
from typing import Any, Callable, List, NamedTuple, Optional, Tuple, Union, Sequence
import collections
from itertools import repeat
-from collections import OrderedDict
from functools import partial
from ivy.stateful.initializers import Zeros
import ivy
@@ -210,7 +209,7 @@ def __init__(
self.num_heads = num_heads
self.hidden_dim = hidden_dim
self.mlp_dim = mlp_dim
- self.dropout = dropout
+ self.dropout_p = dropout
self.attention_dropout = attention_dropout
self.norm_layer = norm_layer
@@ -224,19 +223,19 @@ def _build(self, *args, **kwargs) -> bool:
num_heads=self.num_heads,
dropout_rate=self.attention_dropout,
)
- self.dropout = ivy.Dropout(self.dropout)
+ self.dropout = ivy.Dropout(self.dropout_p)
# MLP block
self.ln_2 = self.norm_layer(self.hidden_dim)
- self.mlp = VIT_MLPBlock(self.hidden_dim, self.mlp_dim, self.dropout)
+ self.mlp = VIT_MLPBlock(self.hidden_dim, self.mlp_dim, self.dropout_p)
def _forward(self, input):
ivy.utils.assertions.check_true(
- input.dim() == 3,
+ input.get_num_dims() == 3,
f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}",
)
x = self.ln_1(input)
- x, _ = self.self_attention(x, x, x, need_weights=False)
+ x = self.self_attention(x, x, x)
x = self.dropout(x)
x = x + input
@@ -264,31 +263,33 @@ def __init__(
self._pos_embedding_shape = (1, seq_length, hidden_dim)
self.pos_embedding = Zeros() # from BERT
self.dropout = ivy.Dropout(dropout)
- layers: OrderedDict[str, ivy.Module] = OrderedDict()
+ layers = []
for i in range(num_layers):
- layers[f"encoder_layer_{i}"] = VIT_EncoderBlock(
- num_heads,
- hidden_dim,
- mlp_dim,
- dropout,
- attention_dropout,
- norm_layer,
+ layers.append(
+ VIT_EncoderBlock(
+ num_heads,
+ hidden_dim,
+ mlp_dim,
+ dropout,
+ attention_dropout,
+ norm_layer,
+ )
)
- self.layers = ivy.Sequential(layers)
+ self.layers = ivy.Sequential(*layers)
self.ln = norm_layer(hidden_dim)
super().__init__()
def _create_variables(self, device, dtype=None):
return {
- "pos_embeddin": self.pos_embedding.create_variables(
+ "pos_embedding": self.pos_embedding.create_variables(
self._pos_embedding_shape, device, dtype=dtype
)
}
def _forward(self, input):
ivy.utils.assertions.check_true(
- input.dim() == 3,
+ input.get_num_dims() == 3,
f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}",
)
- input = input + self.pos_embedding
+ input = input + self.v.pos_embedding
return self.ln(self.layers(self.dropout(input)))
diff --git a/ivy_models/vit/vit.py b/ivy_models/vit/vit.py
index 4e7c488f..d94ed1c7 100644
--- a/ivy_models/vit/vit.py
+++ b/ivy_models/vit/vit.py
@@ -5,7 +5,6 @@
ConvStemConfig,
List,
Optional,
- OrderedDict,
VIT_Encoder,
Zeros,
ivy,
@@ -28,6 +27,7 @@ def __init__(
num_classes: int = 1000,
representation_size: Optional[int] = None,
norm_layer: Callable[..., ivy.Module] = partial(ivy.LayerNorm, eps=1e-6),
+ data_format: str = "NHWC",
conv_stem_configs: Optional[List[ConvStemConfig]] = None,
):
ivy.utils.assertions.check_true(
@@ -46,6 +46,7 @@ def __init__(
num_classes=num_classes,
representation_size=representation_size,
norm_layer=norm_layer,
+ data_format=data_format,
conv_stem_configs=conv_stem_configs,
)
@@ -67,6 +68,7 @@ def __init__(
representation_size: Optional[int] = None,
norm_layer: Callable[..., ivy.Module] = partial(ivy.LayerNorm, eps=1e-6),
conv_stem_configs: Optional[List[ConvStemConfig]] = None,
+ data_format: str = "NHWC",
spec=None,
v=None,
):
@@ -85,6 +87,7 @@ def __init__(
num_classes=num_classes,
representation_size=representation_size,
norm_layer=norm_layer,
+ data_format=data_format,
conv_stem_configs=conv_stem_configs,
)
)
@@ -93,22 +96,24 @@ def __init__(
def _build(self, *args, **kwargs):
if self.spec.conv_stem_configs is not None:
# As per https://arxiv.org/abs/2106.14881
- seq_proj = OrderedDict()
+ seq_proj = []
prev_channels = 3
for i, conv_stem_layer_config in enumerate(self.spec.conv_stem_configs):
- seq_proj[f"conv_bn_relu_{i}"] = Conv2dNormActivation(
- in_channels=prev_channels,
- out_channels=conv_stem_layer_config.out_channels,
- kernel_size=conv_stem_layer_config.kernel_size,
- stride=conv_stem_layer_config.stride,
- norm_layer=conv_stem_layer_config.norm_layer,
- activation_layer=conv_stem_layer_config.activation_layer,
+ seq_proj.append(
+ Conv2dNormActivation(
+ in_channels=prev_channels,
+ out_channels=conv_stem_layer_config.out_channels,
+ kernel_size=conv_stem_layer_config.kernel_size,
+ stride=conv_stem_layer_config.stride,
+ norm_layer=conv_stem_layer_config.norm_layer,
+ activation_layer=conv_stem_layer_config.activation_layer,
+ )
)
prev_channels = conv_stem_layer_config.out_channels
- seq_proj["conv_last"] = ivy.Conv2D(
- prev_channels, self.spec.hidden_dim, [1, 1], 1, 0
+ seq_proj.append(
+ ivy.Conv2D(prev_channels, self.spec.hidden_dim, [1, 1], 1, 0)
)
- self.conv_proj: ivy.Module = ivy.Sequential(seq_proj)
+ self.conv_proj: ivy.Module = ivy.Sequential(*seq_proj)
else:
self.conv_proj = ivy.Conv2D(
3,
@@ -137,21 +142,19 @@ def _build(self, *args, **kwargs):
)
self.seq_length = seq_length
- heads_layers: OrderedDict[str, ivy.Module] = OrderedDict()
+ heads_layers = []
if self.spec.representation_size is None:
- heads_layers["head"] = ivy.Linear(
- self.spec.hidden_dim, self.spec.num_classes
- )
+ heads_layers.append(ivy.Linear(self.spec.hidden_dim, self.spec.num_classes))
else:
- heads_layers["pre_logits"] = ivy.Linear(
- self.spec.hidden_dim, self.spec.representation_size
+ heads_layers.append(
+ ivy.Linear(self.spec.hidden_dim, self.spec.representation_size)
)
- heads_layers["act"] = ivy.tanh()
- heads_layers["head"] = ivy.Linear(
- self.spec.representation_size, self.spec.num_classes
+ heads_layers.append(ivy.tanh())
+ heads_layers.append(
+ ivy.Linear(self.spec.representation_size, self.spec.num_classes)
)
- self.heads = ivy.Sequential(heads_layers)
+ self.heads = ivy.Sequential(*heads_layers)
def _create_variables(self, device, dtype=None):
return {
@@ -161,7 +164,7 @@ def _create_variables(self, device, dtype=None):
}
def _process_input(self, x):
- n, c, h, w = x.shape
+ n, h, w, c = x.shape
p = self.spec.patch_size
ivy.utils.assertions.check_true(
h == self.spec.image_size,
@@ -174,16 +177,10 @@ def _process_input(self, x):
n_h = h // p
n_w = w // p
- # (n, c, h, w) -> (n, self.hidden_dim, n_h, n_w)
+ # (n, h, w, c) -> (n, n_h, n_w, self.hidden_dim)
x = self.conv_proj(x)
- # (n, self.hidden_dim, n_h, n_w) -> (n, self.hidden_dim, (n_h * n_w))
- x = x.reshape(n, self.spec.hidden_dim, n_h * n_w)
-
- # (n, self.hidden_dim, (n_h * n_w)) -> (n, (n_h * n_w), self.hidden_dim)
- # The self attention layer expects inputs in the format (N, S, E)
- # where S is the source sequence length, N is the batch size, E is the
- # embedding dimension
- x = x.permute(0, 2, 1)
+ # (n, n_h, n_w, self.hidden_dim) -> (n, (n_h * n_w), self.hidden_dim)
+ x = x.reshape(shape=(n, n_h * n_w, self.spec.hidden_dim))
return x
@@ -191,13 +188,17 @@ def _process_input(self, x):
def get_spec_class(self):
return VisionTransformerSpec
- def _forward(self, x):
+ def _forward(self, x, data_format: str = "NHWC"):
+ data_format = data_format if data_format else self.spec.data_format
+ if data_format == "NCHW":
+ x = ivy.permute_dims(x, (0, 2, 3, 1))
# Reshape and permute the input tensor
x = self._process_input(x)
n = x.shape[0]
# Expand the class token to the full batch
- batch_class_token = self.class_token.expand(n, -1, -1)
+
+ batch_class_token = ivy.expand(self.v.class_token, (n, -1, -1))
x = ivy.concat([batch_class_token, x], axis=1)
x = self.encoder(x)
@@ -227,6 +228,7 @@ def _vision_transformer(
num_heads: int,
hidden_dim: int,
mlp_dim: int,
+ data_format: str = "NHWC",
v=None,
) -> VisionTransformer:
model = VisionTransformer(
@@ -236,15 +238,21 @@ def _vision_transformer(
num_heads=num_heads,
hidden_dim=hidden_dim,
mlp_dim=mlp_dim,
+ data_format=data_format,
v=v,
)
return model
-def vit_b_16(pretrained=True) -> VisionTransformer:
+def vit_b_16(data_format="NHWC", pretrained=True) -> VisionTransformer:
model = _vision_transformer(
- patch_size=16, num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072
+ patch_size=16,
+ num_layers=12,
+ num_heads=12,
+ hidden_dim=768,
+ mlp_dim=3072,
+ data_format=data_format,
)
if pretrained:
url = "https://download.pytorch.org/models/vit_b_16-c867db91.pth"
@@ -258,9 +266,14 @@ def vit_b_16(pretrained=True) -> VisionTransformer:
return model
-def vit_b_32(pretrained=True) -> VisionTransformer:
+def vit_b_32(data_format="NHWC", pretrained=True) -> VisionTransformer:
ref_model = _vision_transformer(
- patch_size=32, num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072
+ patch_size=32,
+ num_layers=12,
+ num_heads=12,
+ hidden_dim=768,
+ mlp_dim=3072,
+ data_format=data_format,
)
if pretrained:
url = "https://download.pytorch.org/models/vit_b_32-d86f8d99.pth"
@@ -274,9 +287,14 @@ def vit_b_32(pretrained=True) -> VisionTransformer:
return ref_model
-def vit_l_16(pretrained=True) -> VisionTransformer:
+def vit_l_16(data_format="NHWC", pretrained=True) -> VisionTransformer:
ref_model = _vision_transformer(
- patch_size=16, num_layers=24, num_heads=16, hidden_dim=1024, mlp_dim=4096
+ patch_size=16,
+ num_layers=24,
+ num_heads=16,
+ hidden_dim=1024,
+ mlp_dim=4096,
+ data_format=data_format,
)
if pretrained:
url = "https://download.pytorch.org/models/vit_l_16-852ce7e3.pth"
@@ -290,9 +308,14 @@ def vit_l_16(pretrained=True) -> VisionTransformer:
return ref_model
-def vit_l_32(pretrained=True) -> VisionTransformer:
+def vit_l_32(data_format="NHWC", pretrained=True) -> VisionTransformer:
ref_model = _vision_transformer(
- patch_size=32, num_layers=24, num_heads=16, hidden_dim=1024, mlp_dim=4096
+ patch_size=32,
+ num_layers=24,
+ num_heads=16,
+ hidden_dim=1024,
+ mlp_dim=4096,
+ data_format=data_format,
)
if pretrained:
url = "https://download.pytorch.org/models/vit_l_32-c7638314.pth"
@@ -306,12 +329,17 @@ def vit_l_32(pretrained=True) -> VisionTransformer:
return ref_model
-def vit_h_14(pretrained=True) -> VisionTransformer:
+def vit_h_14(data_format="NHWC", pretrained=True) -> VisionTransformer:
ref_model = _vision_transformer(
- patch_size=14, num_layers=12, num_heads=14, hidden_dim=768, mlp_dim=3072
+ patch_size=14,
+ num_layers=32,
+ num_heads=16,
+ hidden_dim=1280,
+ mlp_dim=5120,
+ data_format=data_format,
)
if pretrained:
- url = "https://download.pytorch.org/models/vit_h_14_swag-80465313.pth"
+ url = "https://download.pytorch.org/models/vit_h_14_lc_swag-c1eb923e.pth"
w_clean = load_torch_weights(
url,
ref_model,
diff --git a/ivy_models_tests/dino/test_dinonet.py b/ivy_models_tests/dino/test_dinonet.py
new file mode 100644
index 00000000..7de1fe41
--- /dev/null
+++ b/ivy_models_tests/dino/test_dinonet.py
@@ -0,0 +1,57 @@
+import os
+import ivy
+import numpy as np
+# import pytest
+import traceback
+import sys
+import logging
+from ivy_models_tests import helpers
+from ivy_models.dino.dino import dino_base
+
+
+# @pytest.mark.parametrize("data_format", ["NHWC", "NCHW"])
+# def test_dino_classification(device, fw, data_format):
+# """Test AlexNet image classification."""
+# num_classes = 1000
+# batch_shape = [1]
+# this_dir = os.path.dirname(os.path.realpath(__file__))
+#
+# # Load image
+# img = helpers.load_and_preprocess_img(
+# os.path.join(this_dir, "..", "..", "images", "cat.jpg"),
+# 256,
+# 224,
+# data_format=data_format,
+# to_ivy=True,
+# )
+#
+# model = dino_base()
+#
+
+def run_model():
+ num_classes = 1000
+ batch_shape = [1]
+ this_dir = os.path.dirname(os.path.realpath(__file__))
+
+ # Load image
+ img = helpers.load_and_preprocess_img(
+ os.path.join(this_dir, "..", "..", "images", "cat.jpg"),
+ 256,
+ 224,
+ data_format="NHWC",
+ to_ivy=True,
+ )
+
+ model = dino_base()
+
+ try:
+ model.v = ivy.asarray(model.v)
+ logits = model(img)
+ print("LOGITS")
+ print(logits)
+ except Exception as e:
+ print(traceback.format_exc())
+ # or
+ print(sys.exc_info()[2])
+
+run_model()
diff --git a/ivy_models_tests/helpers/image_helpers.py b/ivy_models_tests/helpers/image_helpers.py
index 9cea853f..54a8b10c 100644
--- a/ivy_models_tests/helpers/image_helpers.py
+++ b/ivy_models_tests/helpers/image_helpers.py
@@ -1,6 +1,7 @@
import ivy
import numpy as np
-from PIL import Image
+import random
+from PIL import Image, ImageFilter, ImageOps
from torchvision import transforms
@@ -50,3 +51,39 @@ def load_and_preprocess_img(
if data_format == "NHWC":
img = img.permute((0, 2, 3, 1))
return ivy.array(img.numpy()) if to_ivy else img.numpy()
+
+
+class GaussianBlur(object):
+ """
+ Apply Gaussian Blur to the PIL image.
+ """
+ def __init__(self, p=0.5, radius_min=0.1, radius_max=2.):
+ self.prob = p
+ self.radius_min = radius_min
+ self.radius_max = radius_max
+
+ def __call__(self, img):
+ do_it = random.random() <= self.prob
+ if not do_it:
+ return img
+
+ return img.filter(
+ ImageFilter.GaussianBlur(
+ radius=random.uniform(self.radius_min, self.radius_max)
+ )
+ )
+
+
+class Solarization(object):
+ """
+ Apply Solarization to the PIL image.
+ """
+ def __init__(self, p):
+ self.p = p
+
+ def __call__(self, img):
+ if random.random() < self.p:
+ return ImageOps.solarize(img)
+ else:
+ return img
+
diff --git a/ivy_models_tests/mlpmixer/test_mlpmixer.py b/ivy_models_tests/mlpmixer/test_mlpmixer.py
new file mode 100644
index 00000000..7826f508
--- /dev/null
+++ b/ivy_models_tests/mlpmixer/test_mlpmixer.py
@@ -0,0 +1,73 @@
+import os
+import ivy
+import pytest
+import numpy as np
+
+from ivy_models.mlpmixer import mlpmixer
+from ivy_models_tests import helpers
+
+import tensorflow as tf
+from tensorflow import keras
+from keras import layers
+import jax
+
+jax.config.update("jax_enable_x64", False)
+
+load_weights = True
+model = mlpmixer(pretrained=load_weights)
+v = ivy.to_numpy(model.v)
+
+
+@pytest.mark.parametrize("data_format", ["NHWC", "NCHW"])
+def test_mlpmixer_tiny_img_classification(device, fw, data_format):
+ """Test MLPMixer image classification."""
+ num_classes = 10
+ batch_shape = [1]
+ this_dir = os.path.dirname(os.path.realpath(__file__))
+
+ # Load image
+ img = helpers.load_image_in_np(
+ os.path.join(this_dir, "..", "..", "images", "car.jpg")
+ )
+
+ # Preprocess the image
+ def get_augmentation_layers():
+ data_augmentation = keras.Sequential(
+ [
+ layers.experimental.preprocessing.Normalization(
+ mean=(0.5, 0.5, 0.5), variance=(0.25, 0.25, 0.25)
+ ),
+ layers.experimental.preprocessing.Resizing(72, 72),
+ layers.experimental.preprocessing.RandomFlip("horizontal"),
+ layers.experimental.preprocessing.RandomRotation(factor=0.02),
+ layers.experimental.preprocessing.RandomZoom(
+ height_factor=0.2, width_factor=0.2
+ ),
+ ],
+ name="data_augmentation",
+ )
+ return data_augmentation
+
+ data_augmentation = get_augmentation_layers()
+ img = data_augmentation(img)
+ img = tf.expand_dims(img, 0).numpy()
+ img = ivy.asarray(img)
+ if data_format == "NCHW":
+ img = ivy.permute_dims(img, (0, 3, 1, 2))
+
+ model.v = ivy.asarray(v)
+ logits = model(img, data_format=data_format)
+
+ # Cardinality test
+ assert logits.shape == tuple([ivy.to_scalar(batch_shape), num_classes])
+
+ # Value test
+ if load_weights:
+ np_out = ivy.to_numpy(logits)
+ true_indices = np.array([4, 7, 2, 9])
+ calc_indices = np.argsort(np_out[0])[-4:][::-1]
+ assert np.array_equal(np.sort(true_indices), np.sort(calc_indices))
+
+ true_logits = np.array([0.4022081, 0.24405026, 0.14345096, 0.12923254])
+ calc_logits = np.take(np_out, calc_indices)
+ assert np.allclose(true_logits, calc_logits, rtol=1e-2, atol=1e-1)
diff --git a/ivy_models_tests/squeezenet/test_squeezenet.py b/ivy_models_tests/squeezenet/test_squeezenet.py
index af77641c..ddef8a47 100644
--- a/ivy_models_tests/squeezenet/test_squeezenet.py
+++ b/ivy_models_tests/squeezenet/test_squeezenet.py
@@ -12,10 +12,9 @@
"squeezenet1_1": squeezenet1_1,
}
-ivy.seed(seed_value=42)
load_weights = random.choice([False, True])
model_var = random.choice(list(VARIANTS.keys()))
-model = VARIANTS[model_var](pretrained=load_weights)
+model = VARIANTS[model_var](dropout=0, pretrained=load_weights)
v = ivy.to_numpy(model.v)
diff --git a/ivy_models_tests/vit/test_vit.py b/ivy_models_tests/vit/test_vit.py
index 381385d3..4263e4a7 100644
--- a/ivy_models_tests/vit/test_vit.py
+++ b/ivy_models_tests/vit/test_vit.py
@@ -1,31 +1,60 @@
import os
-import ivy
-import pytest
import numpy as np
-
-from ivy_models import vit_b_16
+import pytest
+import random
+import ivy
from ivy_models_tests import helpers
+from ivy_models.vit import (
+ vit_b_16,
+ vit_b_32,
+ vit_l_16,
+ vit_l_32,
+)
-import jax
-jax.config.update("jax_enable_x64", False)
+VARIANTS = {
+ "vit_b_16": vit_b_16,
+ "vit_b_32": vit_b_32,
+ "vit_l_16": vit_l_16,
+ "vit_l_32": vit_l_32,
+}
-@pytest.mark.parametrize("batch_shape", [[1]])
-@pytest.mark.parametrize("load_weights", [False, True])
-def test_alexnet_tiny_img_classification(device, fw, batch_shape, load_weights):
+LOGITS = {
+ "vit_b_16": [282, 281, 285, 287, 292],
+ "vit_b_32": [282, 281, 285, 287, 292],
+ "vit_l_16": [255, 281, 282, 285, 292],
+ "vit_l_32": [282, 281, 285, 287, 292],
+}
+
+
+load_weights = random.choice([False, True])
+model_var = random.choice(list(VARIANTS.keys()))
+model = VARIANTS[model_var](pretrained=load_weights)
+v = ivy.to_numpy(model.v)
+
+
+@pytest.mark.parametrize("data_format", ["NHWC", "NCHW"])
+def test_vit_img_classification(device, fw, data_format):
"""Test ViT image classification."""
num_classes = 1000
+ batch_shape = [1]
this_dir = os.path.dirname(os.path.realpath(__file__))
# Load image
- img = helpers.load_and_preprocess_img(
- os.path.join(this_dir, "..", "..", "images", "cat.jpg"), 256, 224
+ img = ivy.asarray(
+ helpers.load_and_preprocess_img(
+ os.path.join(this_dir, "..", "..", "images", "cat.jpg"),
+ 256,
+ 224,
+ data_format=data_format,
+ to_ivy=True,
+ ),
)
- img = ivy.permute_dims(img, (0, 3, 1, 2))
- model = vit_b_16(pretrained=load_weights)
- logits = model(img)
+ # Create model
+ model.v = ivy.asarray(v)
+ logits = model(img, data_format=data_format)
# Cardinality test
assert logits.shape == tuple([ivy.to_scalar(batch_shape), num_classes])
@@ -33,10 +62,6 @@ def test_alexnet_tiny_img_classification(device, fw, batch_shape, load_weights):
# Value test
if load_weights:
np_out = ivy.to_numpy(logits[0])
- true_indices = np.array([282, 281, 285, 287, 896])
- calc_indices = np.argsort(np_out)[-5:][::-1]
+ true_indices = np.sort(np.array(LOGITS[model_var]))
+ calc_indices = np.sort(np.argsort(np_out)[-5:][::-1])
assert np.array_equal(true_indices, calc_indices)
-
- true_logits = np.array([23.5786, 22.791977, 20.917543, 19.49762, 16.102253])
- calc_logits = np.take(np_out, calc_indices)
- assert np.allclose(true_logits, calc_logits, rtol=1e-3)
diff --git a/requirements.txt b/requirements.txt
index 309e21c3..c6780027 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-git+https://github.com/unifyai/ivy.git@master
+git+https://github.com/unifyai/ivy.git@main
diff --git a/setup.py b/setup.py
index fa690915..4276877a 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,7 @@ def _replace_logos_html(txt):
backends_chunk = chunks[2]
bc = backends_chunk.split("\n\n")
img_str = (
- ".. image:: https://github.com/unifyai/unifyai.github.io/blob/master/img/externally_linked/logos/supported/frameworks.png?raw=true\n" # noqa
+ ".. image:: https://github.com/unifyai/unifyai.github.io/blob/main/img/externally_linked/logos/supported/frameworks.png?raw=true\n" # noqa
" :width: 100%"
)
backends_chunk = "\n\n".join(bc[0:1] + [img_str] + bc[2:])
@@ -39,7 +39,7 @@ def _replace_logos_html(txt):
libraries_chunk = chunks[3]
lc = libraries_chunk.split("\n\n")
img_str = (
- ".. image:: https://github.com/unifyai/unifyai.github.io/blob/master/img/externally_linked/ivy_libraries.png?raw=true\n" # noqa
+ ".. image:: https://github.com/unifyai/unifyai.github.io/blob/main/img/externally_linked/ivy_libraries.png?raw=true\n" # noqa
" :width: 100%"
)
libraries_chunk = "\n\n".join(lc[0:1] + [img_str] + lc[2:])