diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index c17bb841..a22f44fb 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -2,12 +2,13 @@ name: Update docs
 on:
   push:
     branches:
-      - master
+      - main
     tags:
       - v*
+  workflow_dispatch:
 
 jobs:
   update-docs:
     name: Update docs
-    uses: unifyai/workflows/.github/workflows/docs.yml@master
+    uses: unifyai/workflows/.github/workflows/docs.yml@main
     secrets: inherit
diff --git a/.github/workflows/lint-bot.yml b/.github/workflows/lint-bot.yml
index 03e43b4a..5136a5c1 100644
--- a/.github/workflows/lint-bot.yml
+++ b/.github/workflows/lint-bot.yml
@@ -11,5 +11,5 @@ permissions:
 jobs:
   fix-linting:
     name: Fix Linting
-    uses: unifyai/workflows/.github/workflows/lint-bot.yml@master
+    uses: unifyai/workflows/.github/workflows/lint-bot.yml@main
     secrets: inherit
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 3e5c9ee2..ffb247e8 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -5,4 +5,4 @@ on: [push, pull_request]
 jobs:
   check-formatting:
     name: Check formatting
-    uses: unifyai/workflows/.github/workflows/lint.yml@master
\ No newline at end of file
+    uses: unifyai/workflows/.github/workflows/lint.yml@main
\ No newline at end of file
diff --git a/.github/workflows/test-new-pr.yml b/.github/workflows/test-new-pr.yml
index b9e53d70..1701bd83 100644
--- a/.github/workflows/test-new-pr.yml
+++ b/.github/workflows/test-new-pr.yml
@@ -16,9 +16,7 @@ jobs:
         uses: tj-actions/changed-files@v37
         with:
           files: |
-            "ivy_models_tests/"
-          files_ignore: |
-            "!*.py"
+            "ivy_models_tests/**/*.py"
 
       - name: Run tests if any files in ivy_models_tests changed
         if: steps.changed-files.outputs.any_changed == 'true'
diff --git a/README.rst b/README.rst
index 146588a4..af9a35e7 100644
--- a/README.rst
+++ b/README.rst
@@ -1,8 +1,8 @@
-.. image:: https://github.com/unifyai/unifyai.github.io/blob/master/img/externally_linked/logo.png?raw=true#gh-light-mode-only
+.. image:: https://github.com/unifyai/unifyai.github.io/blob/main/img/externally_linked/logo.png?raw=true#gh-light-mode-only
    :width: 100%
    :class: only-light

-.. image:: https://github.com/unifyai/unifyai.github.io/blob/master/img/externally_linked/logo_dark.png?raw=true#gh-dark-mode-only
+.. image:: https://github.com/unifyai/unifyai.github.io/blob/main/img/externally_linked/logo_dark.png?raw=true#gh-dark-mode-only
    :width: 100%
    :class: only-dark

@@ -78,23 +78,23 @@ The layers are sometimes kept in a separate file, usually named :code:`layers.py
 
 .. raw:: html
    [raw-HTML table rows elided in extraction: each -/+ line pair swaps a
    "master" URL for "main" in the logo and link tags]
@@ -109,26 +109,26 @@ neural memory, pre-trained models + implementations, and builder tools with trai
    [raw-HTML table rows elided in extraction: master -> main URL updates]
@@ -166,26 +166,26 @@ neural memory, pre-trained models + implementations, and builder tools with trai
    [raw-HTML table rows elided in extraction: master -> main URL updates]
diff --git a/ivy_models/base/model.py b/ivy_models/base/model.py
index 8cc17707..b6c89d14 100644
--- a/ivy_models/base/model.py
+++ b/ivy_models/base/model.py
@@ -170,4 +170,7 @@ def load_from_huggingface(
         spec = self.get_spec_class().from_json_file(config_path)
         os.remove(config_path)
 
-        return self(spec=spec, v=weights)
+        model = self(spec=spec)
+        model.v = weights
+
+        return model
diff --git a/ivy_models/dino/__init__.py b/ivy_models/dino/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/ivy_models/dino/dino.py b/ivy_models/dino/dino.py
new file mode 100644
index 00000000..e56ec8a5
--- /dev/null
+++ b/ivy_models/dino/dino.py
@@ -0,0 +1,134 @@
+from typing import Callable, List, Optional
+
+import ivy
+from ivy.stateful.initializers import GlorotUniform, Initializer, Zeros
+
+from ivy_models.base import BaseModel, BaseSpec
+from ivy_models.dino.layers import DINOBackbone, DINOHead
+from ivy_models.vit.layers import partial, ConvStemConfig
+
+
+class DINOConfig(BaseSpec):
+    def __init__(
+        self,
+        img_size: int,
+        patch_size: int,
+        num_layers: int,
+        num_heads: int,
+        hidden_dim: int,
+        mlp_dim: int,
+        in_dim: int = 0,
+        dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        num_classes: int = 1000,
+        representation_size: Optional[int] = None,
+        norm_layer: Callable[..., ivy.Module] = partial(ivy.LayerNorm, eps=1e-6),
+        conv_stem_configs: Optional[List[ConvStemConfig]] = None,
+        out_dim: int = 65536,
+        use_bn: bool = False,
+        norm_last_layer: bool = True,
+        nlayers: int = 1,
+        hidden_dim_: int = 2048,
+        bottleneck_dim: int = 256,
+        _weight_init: Initializer = GlorotUniform(),
+        _bias_init: Initializer = Zeros(),
+        with_bias: bool = True,
+        device=None,
+        dtype=None,
+    ):
+        super(DINOConfig, self).__init__()
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.mlp_dim = mlp_dim
+        self.in_dim = in_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.num_classes = num_classes
+        self.representation_size = representation_size
+        self.norm_layer = norm_layer
+        self.conv_stem_configs = conv_stem_configs
+        self.out_dim = out_dim
+        self.use_bn = use_bn
+        self.norm_last_layer = norm_last_layer
+        self.nlayers = nlayers
+        self.hidden_dim_ = hidden_dim_
+        self.bottleneck_dim = bottleneck_dim
+        self._weight_init = _weight_init
+        self._bias_init = _bias_init
+        self.with_bias = with_bias
+        self.device = device
+        self.dtype = dtype
+
+    def get(self, *attr_names):
+        new_dict = {}
+        for name in attr_names:
+            new_dict[name] = getattr(self, name)
+        return new_dict
+
+    def get_vit_attrs(self):
+        return self.get(
+            "img_size",
+            "patch_size",
+            "num_layers",
+            "num_heads",
+            "hidden_dim",
+            "mlp_dim",
+            "dropout",
+            "attention_dropout",
+            "num_classes",
+            "representation_size",
+            "norm_layer",
+            "conv_stem_configs",
+        )
+
+    def get_head_attrs(self):
+        return self.get(
+            "in_dim",
+            "out_dim",
+            "use_bn",
+            "norm_last_layer",
+            "nlayers",
+            "hidden_dim_",
+            "bottleneck_dim",
+            "_weight_init",
+            "_bias_init",
+            "with_bias",
+        )
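+
+# A minimal usage sketch (assuming the defaults above): the config splits its
+# attributes between the ViT backbone and the projection head, e.g.
+#
+#     cfg = DINOConfig(img_size=224, patch_size=16, num_layers=12,
+#                      num_heads=12, hidden_dim=768, mlp_dim=3072)
+#     backbone = DINOBackbone(**cfg.get_vit_attrs())
+#     head = DINOHead(**cfg.get_head_attrs())
+#
+# Note that `in_dim` defaults to 0 and is overwritten with `num_classes`
+# inside DINONet._build before the heads are constructed.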
+
+
+class DINONet(BaseModel):
+    def __init__(
+        self,
+        config: DINOConfig,
+        v: ivy.Container = None,
+    ) -> None:
+        self.config = config
+        super(DINONet, self).__init__(v=v)
+
+    @classmethod
+    def get_spec_class(cls):
+        return DINOConfig
+
+    def _build(self):
+        self.student = DINOBackbone(**self.config.get_vit_attrs())
+        self.teacher = DINOBackbone(**self.config.get_vit_attrs())
+        self.config.in_dim = self.config.num_classes
+        self.teacher_head = DINOHead(**self.config.get_head_attrs())
+        self.student_head = DINOHead(**self.config.get_head_attrs())
+
+    def _forward(self, x):
+        return {
+            "student_output": self.student_head(self.student(x)),
+            "teacher_output": self.teacher_head(self.teacher(x)),
+        }
+
+
+def dino_base(pretrained=False):
+    # instantiate the hyperparameters same as ViT-Base;
+    # set the dropout rate to 0.0 to avoid stochasticity in the output
+    config = DINOConfig(
+        img_size=224,
+        patch_size=16,
+        num_layers=12,
+        num_heads=12,
+        hidden_dim=768,
+        mlp_dim=3072,
+        out_dim=65536,
+    )
+    model = DINONet(config)
+    return model
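+
+# Usage sketch (no pretrained weights are wired up yet, so `pretrained` is
+# currently unused):
+#
+#     model = dino_base()
+#     out = model(img)  # {"student_output": ..., "teacher_output": ...}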
diff --git a/ivy_models/dino/dino_vit.py b/ivy_models/dino/dino_vit.py
new file mode 100644
index 00000000..99f3df3e
--- /dev/null
+++ b/ivy_models/dino/dino_vit.py
@@ -0,0 +1,283 @@
+import math
+
+import ivy
+from ivy.stateful.initializers import Zeros, GlorotUniform
+
+from ivy_models.dino.utils import trunc_normal_
+from ivy_models.vit.layers import partial
+
+
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    # broadcast over all trailing dims, so this works for tensors of any rank,
+    # not just 2D ConvNet inputs
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+    random_tensor = keep_prob + ivy.random_uniform(
+        low=0, high=1, shape=shape, dtype=x.dtype, device=x.device
+    )
+    random_tensor = ivy.floor(random_tensor)  # binarize
+    output = x / keep_prob * random_tensor
+    return output
+
+
+class DropPath(ivy.Module):
+    """Drop paths (stochastic depth) per sample, applied in the main path of
+    residual blocks."""
+
+    def __init__(self, drop_prob=None):
+        self.drop_prob = drop_prob
+        super(DropPath, self).__init__()
+
+    def _forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+
+
+class Mlp(ivy.Module):
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=ivy.GELU,
+        drop=0.0,
+    ):
+        self.in_features = in_features
+        self.out_features = out_features or in_features
+        self.hidden_features = hidden_features or in_features
+        self.act_layer = act_layer
+        self.drop_rate = drop
+        super(Mlp, self).__init__()
+
+    def _build(self, *args, **kwargs):
+        self.fc1 = ivy.Linear(self.in_features, self.hidden_features)
+        self.act = self.act_layer()
+        self.fc2 = ivy.Linear(self.hidden_features, self.out_features)
+        self.drop = ivy.Dropout(self.drop_rate)
+
+    def _forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class Attention(ivy.Module):
+    def __init__(
+        self, dim, num_heads=8, qkv_bias=False, qk_scale=None,
+        attn_drop=0.0, proj_drop=0.0,
+    ):
+        self.dim = dim
+        self.num_heads = num_heads
+        self.qkv_bias = qkv_bias
+        self.attn_drop_rate = attn_drop
+        self.proj_drop_rate = proj_drop
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        super(Attention, self).__init__()
+
+    def _build(self, *args, **kwargs):
+        self.qkv = ivy.Linear(self.dim, self.dim * 3, with_bias=self.qkv_bias)
+        self.attn_drop = ivy.Dropout(self.attn_drop_rate)
+        self.proj = ivy.Linear(self.dim, self.dim)
+        self.proj_drop = ivy.Dropout(self.proj_drop_rate)
+
+    def _forward(self, x):
+        B, N, C = x.shape
+        qkv = ivy.permute_dims(
+            ivy.reshape(self.qkv(x), (B, N, 3, self.num_heads, C // self.num_heads)),
+            (2, 0, 3, 1, 4),
+        )
+        q, k, v = qkv[0], qkv[1], qkv[2]
+
+        attn = ivy.matmul(q, ivy.swapaxes(k, -2, -1)) * self.scale
+        attn = ivy.softmax(attn, axis=-1)
+        attn = self.attn_drop(attn)
+
+        x = ivy.reshape(ivy.swapaxes(ivy.matmul(attn, v), 1, 2), (B, N, C))
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x, attn
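+
+# Shape walkthrough for the attention above (a sketch, assuming an input of
+# shape (B, N, C)): qkv(x) has shape (B, N, 3C); after the reshape/permute it
+# is (3, B, num_heads, N, C // num_heads), so q @ k^T gives attention maps of
+# shape (B, num_heads, N, N), and the weighted values are merged back to
+# (B, N, C) before the output projection.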
+
+
+class Block(ivy.Module):
+    def __init__(
+        self, dim, num_heads, mlp_ratio=4.0, qkv_bias=False, qk_scale=None,
+        drop=0.0, attn_drop=0.0, drop_path=0.0, act_layer=ivy.GELU,
+        norm_layer=ivy.LayerNorm,
+    ):
+        self.dim = dim
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.qk_scale = qk_scale
+        self.drop = drop
+        self.attn_drop = attn_drop
+        self.drop_path_rate = drop_path
+        self.act_layer = act_layer
+        self.norm_layer = norm_layer
+        super(Block, self).__init__()
+
+    def _build(self, *args, **kwargs):
+        self.norm1 = self.norm_layer(self.dim)
+        self.attn = Attention(
+            self.dim,
+            num_heads=self.num_heads,
+            qkv_bias=self.qkv_bias,
+            qk_scale=self.qk_scale,
+            attn_drop=self.attn_drop,
+            proj_drop=self.drop,
+        )
+        self.drop_path = (
+            DropPath(self.drop_path_rate)
+            if self.drop_path_rate > 0.0
+            else ivy.Identity()
+        )
+        self.norm2 = self.norm_layer(self.dim)
+        mlp_hidden_dim = int(self.dim * self.mlp_ratio)
+        self.mlp = Mlp(
+            in_features=self.dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=self.act_layer,
+            drop=self.drop,
+        )
+
+    def _forward(self, x, return_attention=False):
+        y, attn = self.attn(self.norm1(x))
+        if return_attention:
+            return attn
+        x = x + self.drop_path(y)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class PatchEmbed(ivy.Module):
+    """Image to patch embedding."""
+
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+        num_patches = (img_size // patch_size) * (img_size // patch_size)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.num_patches = num_patches
+        super(PatchEmbed, self).__init__()
+
+    def _build(self, *args, **kwargs):
+        self.proj = ivy.Conv2D(
+            self.in_chans,
+            self.embed_dim,
+            [self.patch_size, self.patch_size],
+            self.patch_size,
+            0,
+        )
+
+    def _forward(self, x):
+        B, C, H, W = x.shape
+        # ivy.Conv2D consumes NHWC inputs; flatten the patch grid into a
+        # token sequence of shape (B, num_patches, embed_dim)
+        x = self.proj(x)
+        x = ivy.reshape(x, (B, -1, self.embed_dim))
+        return x
+
+
+class VisionTransformer(ivy.Module):
+    """Vision Transformer."""
+
+    def __init__(
+        self, img_size=224, patch_size=16, in_chans=3, num_classes=0,
+        embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, qkv_bias=False,
+        qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0,
+        norm_layer=ivy.LayerNorm, device=None, dtype=None,
+        v: ivy.Container = None, **kwargs,
+    ) -> None:
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.in_chans = in_chans
+        self.num_classes = num_classes
+        self.embed_dim = embed_dim
+        self.depth = depth
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.qk_scale = qk_scale
+        self.drop_rate = drop_rate
+        self.attn_drop_rate = attn_drop_rate
+        self.drop_path_rate = drop_path_rate
+        self.norm_layer = norm_layer
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+        self.num_patches = self.patch_embed.num_patches
+        self.cls_token_shape = (1, 1, embed_dim)
+        self.cls_token = Zeros()
+        self.pos_embed_shape = (1, self.num_patches + 1, embed_dim)
+        self.pos_embed = Zeros()
+        self._weight_init = GlorotUniform()
+        self._bias_init = Zeros()
+        self.num_features = self.embed_dim = embed_dim
+        self._w_shape = (embed_dim,)
+        self._b_shape = (embed_dim,)
+        super(VisionTransformer, self).__init__(v=v, device=device, dtype=dtype)
+
+    def _build(self, *args, **kwargs):
+        self.pos_drop = ivy.Dropout(prob=self.drop_rate)
+        # stochastic depth decay rule
+        dpr = [x.item() for x in ivy.linspace(0, self.drop_path_rate, self.depth)]
+        self.blocks = [
+            Block(
+                dim=self.embed_dim,
+                num_heads=self.num_heads,
+                mlp_ratio=self.mlp_ratio,
+                qkv_bias=self.qkv_bias,
+                qk_scale=self.qk_scale,
+                drop=self.drop_rate,
+                attn_drop=self.attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=self.norm_layer,
+            )
+            for i in range(self.depth)
+        ]
+        self.norm = self.norm_layer(self.embed_dim)
+
+        # classifier head
+        self.head = (
+            ivy.Linear(self.embed_dim, self.num_classes)
+            if self.num_classes > 0
+            else ivy.Identity()
+        )
+
+    def _create_variables(self, *, device=None, dtype=None):
+        # TODO: also trunc-normal initialise pos_embed and class_token with
+        # std=.02 (see trunc_normal_ in ivy_models/dino/utils.py)
+        v = {
+            "class_token": self.cls_token.create_variables(
+                self.cls_token_shape, device, dtype=dtype
+            ),
+            "pos_embed": self.pos_embed.create_variables(
+                self.pos_embed_shape, device, dtype=dtype
+            ),
+        }
+        return v
+
+    def interpolate_pos_encoding(self, x, w, h):
+        npatch = x.shape[1] - 1
+        N = self.v.pos_embed.shape[1] - 1
+        if npatch == N and w == h:
+            return self.v.pos_embed
+        class_pos_embed = self.v.pos_embed[:, 0]
+        patch_pos_embed = self.v.pos_embed[:, 1:]
+        dim = x.shape[-1]
+        w0 = w // self.patch_embed.patch_size
+        h0 = h // self.patch_embed.patch_size
+        # we add a small number to avoid floating point error in the
+        # interpolation, see https://github.com/facebookresearch/dino/issues/8
+        w0, h0 = w0 + 0.1, h0 + 0.1
+        patch_pos_embed = ivy.interpolate(
+            ivy.permute_dims(
+                ivy.reshape(
+                    patch_pos_embed, (1, int(math.sqrt(N)), int(math.sqrt(N)), dim)
+                ),
+                (0, 3, 1, 2),
+            ),
+            scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
+            mode="bicubic",
+        )
+        assert (
+            int(w0) == patch_pos_embed.shape[-2]
+            and int(h0) == patch_pos_embed.shape[-1]
+        )
+        patch_pos_embed = ivy.reshape(
+            ivy.permute_dims(patch_pos_embed, (0, 2, 3, 1)), (1, -1, dim)
+        )
+        return ivy.concat(
+            (ivy.expand_dims(class_pos_embed, axis=0), patch_pos_embed), axis=1
+        )
+
+    def prepare_tokens(self, x):
+        B, nc, w, h = x.shape
+        x = self.patch_embed(x)  # patch linear embedding
+
+        # add the [CLS] token to the embedded patch tokens
+        cls_tokens = ivy.expand(self.v.class_token, (B, -1, -1))
+        x = ivy.concat((cls_tokens, x), axis=1)
+
+        # add positional encoding to each token
+        x = x + self.interpolate_pos_encoding(x, w, h)
+
+        return self.pos_drop(x)
+
+    def _forward(self, x):
+        x = self.prepare_tokens(x)
+        for blk in self.blocks:
+            x = blk(x)
+        x = self.norm(x)
+        return x[:, 0]
+
+    def get_last_selfattention(self, x):
+        x = self.prepare_tokens(x)
+        for i, blk in enumerate(self.blocks):
+            if i < len(self.blocks) - 1:
+                x = blk(x)
+            else:
+                # return attention of the last block
+                return blk(x, return_attention=True)
+
+    def get_intermediate_layers(self, x, n=1):
+        x = self.prepare_tokens(x)
+        # return the output tokens from the `n` last blocks
+        output = []
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if len(self.blocks) - i <= n:
+                output.append(self.norm(x))
+        return output
+
+
+def vit_tiny(patch_size=16, **kwargs):
+    model = VisionTransformer(
+        patch_size=patch_size, embed_dim=192, depth=12, num_heads=3,
+        mlp_ratio=4, qkv_bias=True,
+        norm_layer=partial(ivy.LayerNorm, eps=1e-6), **kwargs,
+    )
+    return model
+
+
+if __name__ == "__main__":
+    model = vit_tiny()
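+
+# Usage sketch (a rough guide; ivy.Conv2D consumes NHWC inputs, so the exact
+# input layout depends on how PatchEmbed is finally wired up):
+#
+#     model = vit_tiny(patch_size=16)
+#     x = ivy.random_uniform(shape=(1, 3, 224, 224))
+#     cls_emb = model(x)                       # per-image CLS embedding, (1, 192)
+#     attn = model.get_last_selfattention(x)   # attention maps of the last block
+#     feats = model.get_intermediate_layers(x, n=4)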
diff --git a/ivy_models/dino/layers.py b/ivy_models/dino/layers.py
new file mode 100644
index 00000000..af3f619c
--- /dev/null
+++ b/ivy_models/dino/layers.py
@@ -0,0 +1,237 @@
+from typing import Callable, List, Optional
+
+import ivy
+from ivy.stateful.initializers import Initializer, GlorotUniform, Zeros
+from PIL import Image
+from torchvision import transforms
+
+from ivy_models.dino.utils import trunc_normal_
+from ivy_models.vit.vit import VisionTransformer
+from ivy_models.vit.layers import partial, ConvStemConfig
+from ivy_models_tests.helpers import image_helpers
+
+
+class DINOBackbone(ivy.Module):
+    def __init__(
+        self,
+        img_size: int,
+        patch_size: int,
+        num_layers: int,
+        num_heads: int,
+        hidden_dim: int,
+        mlp_dim: int,
+        dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        num_classes: int = 1000,
+        representation_size: Optional[int] = None,
+        norm_layer: Callable[..., ivy.Module] = partial(ivy.LayerNorm, eps=1e-6),
+        conv_stem_configs: Optional[List[ConvStemConfig]] = None,
+        spec=None,
+        v: ivy.Container = None,
+    ):
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.mlp_dim = mlp_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.num_classes = num_classes
+        self.representation_size = representation_size
+        self.norm_layer = norm_layer
+        self.conv_stem_configs = conv_stem_configs
+        super(DINOBackbone, self).__init__(v=v)
+
+    def _build(self, *args, **kwargs):
+        self.backbone = VisionTransformer(
+            image_size=self.img_size,
+            patch_size=self.patch_size,
+            num_layers=self.num_layers,
+            num_heads=self.num_heads,
+            hidden_dim=self.hidden_dim,
+            mlp_dim=self.mlp_dim,
+        )
+
+    def _forward(self, x):
+        # TODO: restore the multi-crop forward pass (group the crops by
+        # resolution and concatenate the per-group outputs); for now a single
+        # crop tensor is passed straight through the backbone.
+        output = self.backbone(x)
+        return output
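+
+# Sketch of the intended multi-crop pass (assumes `x` is a list of crop
+# batches sorted by resolution, as produced by DataAugmentationDINO below):
+#
+#     sizes = ivy.array([inp.shape[-1] for inp in x])
+#     idx_crops = ivy.cumsum(ivy.unique_consecutive(sizes)[2], 0)
+#     start, outs = 0, []
+#     for end in [int(i) for i in idx_crops]:
+#         outs.append(self.backbone(ivy.concat(x[start:end], axis=0)))
+#         start = end
+#     output = ivy.concat(outs, axis=0)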
+
+
+class DINOHead(ivy.Module):
+    """DINO projection head."""
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        use_bn: bool = False,
+        norm_last_layer: bool = True,
+        nlayers: int = 3,
+        hidden_dim_: int = 2048,
+        bottleneck_dim: int = 256,
+        _weight_init: Initializer = GlorotUniform(),
+        _bias_init: Initializer = Zeros(),
+        with_bias: bool = True,
+        device=None,
+        dtype=None,
+        v: ivy.Container = None,
+    ) -> None:
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.use_bn = use_bn
+        self.norm_last_layer = norm_last_layer
+        self.nlayers = nlayers
+        self.hidden_dim_ = hidden_dim_
+        self.bottleneck_dim = bottleneck_dim
+        self._w_shape = (out_dim, in_dim)
+        self._b_shape = (out_dim,)
+        self._weight_init = _weight_init
+        self._b_init = _bias_init
+        self.with_bias = with_bias
+        super(DINOHead, self).__init__(v=v, device=device, dtype=dtype)
+
+    def _create_variables(self, device, dtype=None):
+        w = self._weight_init.create_variables(
+            self._w_shape, device, self.out_dim, self.in_dim, dtype
+        )
+        v = {
+            "w": trunc_normal_(w, std=0.02),
+        }
+        v = dict(
+            **v,
+            b=self._b_init.create_variables(
+                self._b_shape,
+                device,
+                self.out_dim,
+                self.in_dim,
+                dtype=dtype,
+            ),
+        )
+        return v
+
+    def _build(self, *args, **kwargs):
+        nlayers = max(self.nlayers, 1)
+        if nlayers == 1:
+            self.mlp = ivy.Linear(self.in_dim, self.bottleneck_dim)
+        else:
+            layers = [ivy.Linear(self.in_dim, self.hidden_dim_)]
+            # TODO: change back to BatchNorm1D when the ivy changes are merged
+            if self.use_bn:
+                layers.append(ivy.BatchNorm2D(self.hidden_dim_))
+            layers.append(ivy.GELU())
+            for _ in range(nlayers - 2):
+                layers.append(ivy.Linear(self.hidden_dim_, self.hidden_dim_))
+                if self.use_bn:
+                    layers.append(ivy.BatchNorm2D(self.hidden_dim_))
+                layers.append(ivy.GELU())
+            layers.append(ivy.Linear(self.hidden_dim_, self.bottleneck_dim))
+            self.mlp = ivy.Sequential(*layers)
+        # TODO: weight-normalise the last layer as in the reference DINO code
+        self.last_layer = ivy.Linear(self.bottleneck_dim, self.out_dim)
+        self.last_layer.v.w = ivy.full_like(self.last_layer.v.w, 1.0)
+        if self.norm_last_layer:
+            # freeze the last layer's weight scale (torch-style flag, kept as
+            # a placeholder until an ivy equivalent is wired up)
+            self.last_layer.v.w.requires_grad = False
+
+    def _forward(self, x):
+        x = self.mlp(x)
+        x = ivy.lp_normalize(x, p=2.0, axis=1)
+        x = self.last_layer(x)
+        return x
+
+
+class DataAugmentationDINO(object):
+    def __init__(self, global_crops_scale, local_crops_scale, local_crops_number):
+        flip_and_color_jitter = transforms.Compose([
+            transforms.RandomHorizontalFlip(p=0.5),
+            transforms.RandomApply(
+                [transforms.ColorJitter(
+                    brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1
+                )],
+                p=0.8,
+            ),
+            transforms.RandomGrayscale(p=0.2),
+        ])
+        normalize = transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        ])
+
+        # first global crop
+        self.global_crop_1 = transforms.Compose([
+            transforms.RandomResizedCrop(
+                224, scale=global_crops_scale, interpolation=Image.BICUBIC
+            ),
+            flip_and_color_jitter,
+            image_helpers.GaussianBlur(1.0),
+            normalize,
+        ])
+
+        # second global crop
+        self.global_crop_2 = transforms.Compose([
+            transforms.RandomResizedCrop(
+                224, scale=global_crops_scale, interpolation=Image.BICUBIC
+            ),
+            flip_and_color_jitter,
+            image_helpers.GaussianBlur(0.1),
+            image_helpers.Solarization(0.2),
+            normalize,
+        ])
+
+        # smaller, lower-resolution local crops
+        self.local_crops_number = local_crops_number
+        self.local_crop = transforms.Compose([
+            transforms.RandomResizedCrop(
+                96, scale=local_crops_scale, interpolation=Image.BICUBIC
+            ),
+            flip_and_color_jitter,
+            image_helpers.GaussianBlur(p=0.5),
+            normalize,
+        ])
+
+    def __call__(self, image):
+        crops = []
+        crops.append(self.global_crop_1(image))
+        crops.append(self.global_crop_2(image))
+        for _ in range(self.local_crops_number):
+            crops.append(self.local_crop(image))
+        return crops
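+
+# Usage sketch (PIL image in, list of tensors out): two global 224x224 crops
+# followed by `local_crops_number` 96x96 local crops, with example scale
+# ranges shown below:
+#
+#     aug = DataAugmentationDINO(
+#         global_crops_scale=(0.4, 1.0),
+#         local_crops_scale=(0.05, 0.4),
+#         local_crops_number=8,
+#     )
+#     crops = aug(Image.open("cat.jpg"))  # len(crops) == 10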
+
+
+class MultiCropWrapper(ivy.Module):
+    def __init__(self, backbone, head):
+        # discard any classification head carried by the backbone
+        backbone.fc, backbone.head = ivy.Identity(), ivy.Identity()
+        self.backbone = backbone
+        self.head = head
+        super(MultiCropWrapper, self).__init__()
+
+    def _forward(self, x):
+        if not isinstance(x, list):
+            x = [x]
+        # group consecutive crops that share a resolution
+        idx_crops = ivy.cumsum(
+            ivy.unique_consecutive(
+                ivy.array([inp.shape[-1] for inp in x])
+            )[2],
+            0,
+        )
+        start_idx = 0
+        output = ivy.empty((0,), device=x[0].device)
+        for end_idx in idx_crops:
+            _out = self.backbone(ivy.concat(x[start_idx:int(end_idx)], axis=0))
+            # the output is a tuple with the XCiT model, see:
+            # https://github.com/facebookresearch/xcit/blob/master/xcit.py#L404-L405
+            if isinstance(_out, tuple):
+                _out = _out[0]
+            # accumulate outputs
+            output = ivy.concat((output, _out), axis=0)
+            start_idx = int(end_idx)
+        # run the head forward on the concatenated features
+        return self.head(output)
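+
+# Usage sketch: wrap a backbone and head so a list of mixed-resolution crops
+# is processed in as few backbone passes as there are distinct resolutions:
+#
+#     wrapper = MultiCropWrapper(backbone, head)
+#     out = wrapper(crops)  # crops: [2 global + N local views]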
diff --git a/ivy_models/dino/utils.py b/ivy_models/dino/utils.py
new file mode 100644
index 00000000..18df63cb
--- /dev/null
+++ b/ivy_models/dino/utils.py
@@ -0,0 +1,32 @@
+import math
+import warnings
+
+import ivy
+
+
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+    def norm_cdf(x):
+        return (1.0 + ivy.erf(x / math.sqrt(2.0))) / 2.0
+
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+
+    tensor = ivy.stop_gradient(tensor)
+    l = norm_cdf((a - mean) / std)
+    u = norm_cdf((b - mean) / std)
+    ivy.random_uniform(low=2 * l - 1, high=2 * u - 1, shape=tensor.shape, out=tensor)
+    # TODO: apply ivy.erfinv here once it is available
+    tensor = ivy.multiply(tensor, std * math.sqrt(2.0))
+    tensor = ivy.add(tensor, mean)
+    tensor = ivy.clip(tensor, a, b)
+    return tensor
+
+
+# TODO: add this to ivy's public functions
+def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
+    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+
+
+if __name__ == "__main__":
+    x = ivy.randint(-100, 100, shape=(10, 5))
+    truncated_tensor = trunc_normal_(x, std=0.02)
+    assert truncated_tensor.shape == x.shape
diff --git a/ivy_models/mlpmixer/mlpmixer.py b/ivy_models/mlpmixer/mlpmixer.py
index 85ba5ed7..d7cb02af 100644
--- a/ivy_models/mlpmixer/mlpmixer.py
+++ b/ivy_models/mlpmixer/mlpmixer.py
@@ -122,7 +122,7 @@ def get_spec_class(self):
     def _forward(self, x, data_format=None):
         data_format = data_format if data_format else self.spec.data_format
         if data_format == "NCHW":
-            x = ivy.permute_dims(x, (0, 3, 1, 2))
+            x = ivy.permute_dims(x, (0, 2, 3, 1))
         x = self.conv(x)
         x = x.reshape(
             (int(x.shape[0]), int(x.shape[1]) * int(x.shape[2]), int(x.shape[3]))
diff --git a/ivy_models/unet/unet.py b/ivy_models/unet/unet.py
index 44234b31..06fb2ae7 100644
--- a/ivy_models/unet/unet.py
+++ b/ivy_models/unet/unet.py
@@ -42,6 +42,7 @@ def get_spec_class(self):
         return UNetSpec
 
     def _forward(self, x, data_format="NHWC"):
+        data_format = data_format if data_format else self.spec.data_format
         if data_format == "NCHW":
             x = ivy.permute_dims(x, (0, 2, 3, 1))
         x1 = self.inc(x)
diff --git a/ivy_models/vit/layers.py b/ivy_models/vit/layers.py
index d7885c3f..c9b60c7a 100644
--- a/ivy_models/vit/layers.py
+++ b/ivy_models/vit/layers.py
@@ -1,7 +1,6 @@
 from typing import Any, Callable, List, NamedTuple, Optional, Tuple, Union, Sequence
 import collections
 from itertools import repeat
-from collections import OrderedDict
 from functools import partial
 from ivy.stateful.initializers import Zeros
 import ivy
@@ -210,7 +209,7 @@ def __init__(
         self.num_heads = num_heads
         self.hidden_dim = hidden_dim
         self.mlp_dim = mlp_dim
-        self.dropout = dropout
+        self.dropout_p = dropout
         self.attention_dropout = attention_dropout
         self.norm_layer = norm_layer
 
@@ -224,19 +223,19 @@ def _build(self, *args, **kwargs) -> bool:
             num_heads=self.num_heads,
             dropout_rate=self.attention_dropout,
         )
-        self.dropout = ivy.Dropout(self.dropout)
+        self.dropout = ivy.Dropout(self.dropout_p)
 
         # MLP block
         self.ln_2 = self.norm_layer(self.hidden_dim)
-        self.mlp = VIT_MLPBlock(self.hidden_dim, self.mlp_dim, self.dropout)
+        self.mlp = VIT_MLPBlock(self.hidden_dim, self.mlp_dim, self.dropout_p)
 
     def _forward(self, input):
         ivy.utils.assertions.check_true(
-            input.dim() == 3,
+            input.get_num_dims() == 3,
             f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}",
         )
         x = self.ln_1(input)
-        x, _ = self.self_attention(x, x, x, need_weights=False)
+        x = self.self_attention(x, x, x)
         x = self.dropout(x)
         x = x + input
 
@@ -264,31 +263,33 @@ def __init__(
         self._pos_embedding_shape = (1, seq_length, hidden_dim)
         self.pos_embedding = Zeros()  # from BERT
         self.dropout = ivy.Dropout(dropout)
-        layers: OrderedDict[str, ivy.Module] = OrderedDict()
+        layers = []
         for i in range(num_layers):
-            layers[f"encoder_layer_{i}"] = VIT_EncoderBlock(
-                num_heads,
-                hidden_dim,
-                mlp_dim,
-                dropout,
-                attention_dropout,
-                norm_layer,
+            layers.append(
+                VIT_EncoderBlock(
+                    num_heads,
+                    hidden_dim,
+                    mlp_dim,
+                    dropout,
+                    attention_dropout,
+                    norm_layer,
+                )
             )
-        self.layers = ivy.Sequential(layers)
+        self.layers = ivy.Sequential(*layers)
         self.ln = norm_layer(hidden_dim)
         super().__init__()
 
     def _create_variables(self, device, dtype=None):
         return {
-            "pos_embeddin": self.pos_embedding.create_variables(
+            "pos_embedding": self.pos_embedding.create_variables(
                 self._pos_embedding_shape, device, dtype=dtype
             )
         }
 
     def _forward(self, input):
         ivy.utils.assertions.check_true(
-            input.dim() == 3,
+            input.get_num_dims() == 3,
             f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}",
         )
-        input = input + self.pos_embedding
+        input = input + self.v.pos_embedding
         return self.ln(self.layers(self.dropout(input)))
diff --git a/ivy_models/vit/vit.py b/ivy_models/vit/vit.py
index 4e7c488f..d94ed1c7 100644
--- a/ivy_models/vit/vit.py
+++ b/ivy_models/vit/vit.py
@@ -5,7 +5,6 @@
     ConvStemConfig,
     List,
     Optional,
-    OrderedDict,
     VIT_Encoder,
     Zeros,
     ivy,
@@ -28,6 +27,7 @@ def __init__(
         num_classes: int = 1000,
         representation_size: Optional[int] = None,
         norm_layer: Callable[..., ivy.Module] = partial(ivy.LayerNorm, eps=1e-6),
+        data_format: str = "NHWC",
         conv_stem_configs: Optional[List[ConvStemConfig]] = None,
     ):
         ivy.utils.assertions.check_true(
@@ -46,6 +46,7 @@ def __init__(
             num_classes=num_classes,
             representation_size=representation_size,
             norm_layer=norm_layer,
+            data_format=data_format,
             conv_stem_configs=conv_stem_configs,
         )
 
@@ -67,6 +68,7 @@ def __init__(
         representation_size: Optional[int] = None,
         norm_layer: Callable[..., ivy.Module] = partial(ivy.LayerNorm, eps=1e-6),
         conv_stem_configs: Optional[List[ConvStemConfig]] = None,
+        data_format: str = "NHWC",
         spec=None,
         v=None,
     ):
@@ -85,6 +87,7 @@ def __init__(
                 num_classes=num_classes,
                 representation_size=representation_size,
                 norm_layer=norm_layer,
+                data_format=data_format,
                 conv_stem_configs=conv_stem_configs,
             )
         )
@@ -93,22 +96,24 @@ def _build(self, *args, **kwargs):
         if self.spec.conv_stem_configs is not None:
             # As per https://arxiv.org/abs/2106.14881
-            seq_proj = OrderedDict()
+            seq_proj = []
             prev_channels = 3
             for i, conv_stem_layer_config in enumerate(self.spec.conv_stem_configs):
-                seq_proj[f"conv_bn_relu_{i}"] = Conv2dNormActivation(
-                    in_channels=prev_channels,
-                    out_channels=conv_stem_layer_config.out_channels,
-                    kernel_size=conv_stem_layer_config.kernel_size,
-                    stride=conv_stem_layer_config.stride,
-                    norm_layer=conv_stem_layer_config.norm_layer,
-                    activation_layer=conv_stem_layer_config.activation_layer,
+                seq_proj.append(
+                    Conv2dNormActivation(
+                        in_channels=prev_channels,
+                        out_channels=conv_stem_layer_config.out_channels,
+                        kernel_size=conv_stem_layer_config.kernel_size,
+                        stride=conv_stem_layer_config.stride,
+                        norm_layer=conv_stem_layer_config.norm_layer,
+                        activation_layer=conv_stem_layer_config.activation_layer,
+                    )
                 )
                 prev_channels = conv_stem_layer_config.out_channels
-            seq_proj["conv_last"] = ivy.Conv2D(
-                prev_channels, self.spec.hidden_dim, [1, 1], 1, 0
+            seq_proj.append(
+                ivy.Conv2D(prev_channels, self.spec.hidden_dim, [1, 1], 1, 0)
             )
-            self.conv_proj: ivy.Module = ivy.Sequential(seq_proj)
+            self.conv_proj: ivy.Module = ivy.Sequential(*seq_proj)
         else:
             self.conv_proj = ivy.Conv2D(
                 3,
@@ -137,21 +142,19 @@ def _build(self, *args, **kwargs):
         )
         self.seq_length = seq_length
 
-        heads_layers: OrderedDict[str, ivy.Module] = OrderedDict()
+        heads_layers = []
         if self.spec.representation_size is None:
-            heads_layers["head"] = ivy.Linear(
-                self.spec.hidden_dim, self.spec.num_classes
-            )
+            heads_layers.append(ivy.Linear(self.spec.hidden_dim, self.spec.num_classes))
         else:
-            heads_layers["pre_logits"] = ivy.Linear(
-                self.spec.hidden_dim, self.spec.representation_size
+            heads_layers.append(
+                ivy.Linear(self.spec.hidden_dim, self.spec.representation_size)
             )
-            heads_layers["act"] = ivy.tanh()
-            heads_layers["head"] = ivy.Linear(
-                self.spec.representation_size, self.spec.num_classes
+            heads_layers.append(ivy.tanh())
+            heads_layers.append(
+                ivy.Linear(self.spec.representation_size, self.spec.num_classes)
             )
 
-        self.heads = ivy.Sequential(heads_layers)
+        self.heads = ivy.Sequential(*heads_layers)
 
     def _create_variables(self, device, dtype=None):
         return {
@@ -161,7 +164,7 @@ def _create_variables(self, device, dtype=None):
         }
 
     def _process_input(self, x):
-        n, c, h, w = x.shape
+        n, h, w, c = x.shape
         p = self.spec.patch_size
         ivy.utils.assertions.check_true(
             h == self.spec.image_size,
@@ -174,16 +177,10 @@ def _process_input(self, x):
         n_h = h // p
         n_w = w // p
 
-        # (n, c, h, w) -> (n, self.hidden_dim, n_h, n_w)
+        # (n, h, w, c) -> (n, n_h, n_w, self.hidden_dim)
         x = self.conv_proj(x)
-        # (n, self.hidden_dim, n_h, n_w) -> (n, self.hidden_dim, (n_h * n_w))
-        x = x.reshape(n, self.spec.hidden_dim, n_h * n_w)
-
-        # (n, self.hidden_dim, (n_h * n_w)) -> (n, (n_h * n_w), self.hidden_dim)
-        # The self attention layer expects inputs in the format (N, S, E)
-        # where S is the source sequence length, N is the batch size, E is the
-        # embedding dimension
-        x = x.permute(0, 2, 1)
+        # (n, n_h, n_w, self.hidden_dim) -> (n, (n_h * n_w), self.hidden_dim)
+        x = x.reshape(shape=(n, n_h * n_w, self.spec.hidden_dim))
 
         return x
 
@@ -191,13 +188,17 @@ def _process_input(self, x):
     def get_spec_class(self):
         return VisionTransformerSpec
 
-    def _forward(self, x):
+    def _forward(self, x, data_format: str = "NHWC"):
+        data_format = data_format if data_format else self.spec.data_format
+        if data_format == "NCHW":
+            x = ivy.permute_dims(x, (0, 2, 3, 1))
         # Reshape and permute the input tensor
         x = self._process_input(x)
         n = x.shape[0]
 
         # Expand the class token to the full batch
-        batch_class_token = self.class_token.expand(n, -1, -1)
+        batch_class_token = ivy.expand(self.v.class_token, (n, -1, -1))
         x = ivy.concat([batch_class_token, x], axis=1)
 
         x = self.encoder(x)
@@ -227,6 +228,7 @@ def _vision_transformer(
     num_heads: int,
     hidden_dim: int,
     mlp_dim: int,
+    data_format: str = "NHWC",
     v=None,
 ) -> VisionTransformer:
     model = VisionTransformer(
@@ -236,15 +238,21 @@ def _vision_transformer(
         num_heads=num_heads,
         hidden_dim=hidden_dim,
         mlp_dim=mlp_dim,
+        data_format=data_format,
         v=v,
     )
     return model
 
 
-def vit_b_16(pretrained=True) -> VisionTransformer:
+def vit_b_16(data_format="NHWC", pretrained=True) -> VisionTransformer:
     model = _vision_transformer(
-        patch_size=16, num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072
+        patch_size=16,
+        num_layers=12,
+        num_heads=12,
+        hidden_dim=768,
+        mlp_dim=3072,
+        data_format=data_format,
     )
     if pretrained:
         url = "https://download.pytorch.org/models/vit_b_16-c867db91.pth"
@@ -258,9 +266,14 @@ def vit_b_16(pretrained=True) -> VisionTransformer:
     return model
 
 
-def vit_b_32(pretrained=True) -> VisionTransformer:
+def vit_b_32(data_format="NHWC", pretrained=True) -> VisionTransformer:
     ref_model = _vision_transformer(
-        patch_size=32, num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072
+        patch_size=32,
+        num_layers=12,
+        num_heads=12,
+        hidden_dim=768,
+        mlp_dim=3072,
+        data_format=data_format,
     )
     if pretrained:
         url = "https://download.pytorch.org/models/vit_b_32-d86f8d99.pth"
@@ -274,9 +287,14 @@ def vit_b_32(pretrained=True) -> VisionTransformer:
     return ref_model
 
 
-def vit_l_16(pretrained=True) -> VisionTransformer:
+def vit_l_16(data_format="NHWC", pretrained=True) -> VisionTransformer:
     ref_model = _vision_transformer(
-        patch_size=16, num_layers=24, num_heads=16, hidden_dim=1024, mlp_dim=4096
+        patch_size=16,
+        num_layers=24,
+        num_heads=16,
+        hidden_dim=1024,
+        mlp_dim=4096,
+        data_format=data_format,
    )
     if pretrained:
         url = "https://download.pytorch.org/models/vit_l_16-852ce7e3.pth"
@@ -290,9 +308,14 @@ def vit_l_16(pretrained=True) -> VisionTransformer:
     return ref_model
 
 
-def vit_l_32(pretrained=True) -> VisionTransformer:
+def vit_l_32(data_format="NHWC", pretrained=True) -> VisionTransformer:
     ref_model = _vision_transformer(
-        patch_size=32, num_layers=24, num_heads=16, hidden_dim=1024, mlp_dim=4096
+        patch_size=32,
+        num_layers=24,
+        num_heads=16,
+        hidden_dim=1024,
+        mlp_dim=4096,
+        data_format=data_format,
     )
     if pretrained:
         url = "https://download.pytorch.org/models/vit_l_32-c7638314.pth"
@@ -306,12 +329,17 @@ def vit_l_32(pretrained=True) -> VisionTransformer:
     return ref_model
 
 
-def vit_h_14(pretrained=True) -> VisionTransformer:
+def vit_h_14(data_format="NHWC", pretrained=True) -> VisionTransformer:
     ref_model = _vision_transformer(
-        patch_size=14, num_layers=12, num_heads=14, hidden_dim=768, mlp_dim=3072
+        patch_size=14,
+        num_layers=32,
+        num_heads=16,
+        hidden_dim=1280,
+        mlp_dim=5120,
+        data_format=data_format,
     )
     if pretrained:
-        url = "https://download.pytorch.org/models/vit_h_14_swag-80465313.pth"
+        url = "https://download.pytorch.org/models/vit_h_14_lc_swag-c1eb923e.pth"
         w_clean = load_torch_weights(
             url,
             ref_model,
diff --git a/ivy_models_tests/dino/test_dinonet.py b/ivy_models_tests/dino/test_dinonet.py
new file mode 100644
index 00000000..7de1fe41
--- /dev/null
+++ b/ivy_models_tests/dino/test_dinonet.py
@@ -0,0 +1,57 @@
+import os
+import sys
+import traceback
+
+import ivy
+
+from ivy_models_tests import helpers
+from ivy_models.dino.dino import dino_base
+
+
+# TODO: enable as a parametrised pytest once the forward pass is stable:
+#
+# @pytest.mark.parametrize("data_format", ["NHWC", "NCHW"])
+# def test_dino_classification(device, fw, data_format):
+#     """Test DINO image classification."""
+#     this_dir = os.path.dirname(os.path.realpath(__file__))
+#     img = helpers.load_and_preprocess_img(
+#         os.path.join(this_dir, "..", "..", "images", "cat.jpg"),
+#         256,
+#         224,
+#         data_format=data_format,
+#         to_ivy=True,
+#     )
+#     model = dino_base()
+#     ...
+
+
+def run_model():
+    this_dir = os.path.dirname(os.path.realpath(__file__))
+
+    # load image
+    img = helpers.load_and_preprocess_img(
+        os.path.join(this_dir, "..", "..", "images", "cat.jpg"),
+        256,
+        224,
+        data_format="NHWC",
+        to_ivy=True,
+    )
+
+    model = dino_base()
+
+    try:
+        model.v = ivy.asarray(model.v)
+        logits = model(img)
+        print("LOGITS")
+        print(logits)
+    except Exception:
+        print(traceback.format_exc())
+        print(sys.exc_info()[2])
+
+
+run_model()
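+
+# NOTE: this is a temporary smoke-test harness; the module runs the forward
+# pass on import. A hedged pytest version would assert on the output keys:
+#
+#     out = model(img)
+#     assert set(out.keys()) == {"student_output", "teacher_output"}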
diff --git a/ivy_models_tests/helpers/image_helpers.py b/ivy_models_tests/helpers/image_helpers.py
index 9cea853f..54a8b10c 100644
--- a/ivy_models_tests/helpers/image_helpers.py
+++ b/ivy_models_tests/helpers/image_helpers.py
@@ -1,6 +1,7 @@
 import ivy
 import numpy as np
-from PIL import Image
+import random
+from PIL import Image, ImageFilter, ImageOps
 from torchvision import transforms
 
 
@@ -50,3 +51,39 @@ def load_and_preprocess_img(
     if data_format == "NHWC":
         img = img.permute((0, 2, 3, 1))
     return ivy.array(img.numpy()) if to_ivy else img.numpy()
+
+
+class GaussianBlur(object):
+    """
+    Apply Gaussian blur to the PIL image.
+    """
+
+    def __init__(self, p=0.5, radius_min=0.1, radius_max=2.0):
+        self.prob = p
+        self.radius_min = radius_min
+        self.radius_max = radius_max
+
+    def __call__(self, img):
+        do_it = random.random() <= self.prob
+        if not do_it:
+            return img
+
+        return img.filter(
+            ImageFilter.GaussianBlur(
+                radius=random.uniform(self.radius_min, self.radius_max)
+            )
+        )
+
+
+class Solarization(object):
+    """
+    Apply solarization to the PIL image.
+    """
+
+    def __init__(self, p):
+        self.p = p
+
+    def __call__(self, img):
+        if random.random() < self.p:
+            return ImageOps.solarize(img)
+        else:
+            return img
diff --git a/ivy_models_tests/mlpmixer/test_mlpmixer.py b/ivy_models_tests/mlpmixer/test_mlpmixer.py
new file mode 100644
index 00000000..7826f508
--- /dev/null
+++ b/ivy_models_tests/mlpmixer/test_mlpmixer.py
@@ -0,0 +1,73 @@
+import os
+import ivy
+import pytest
+import numpy as np
+
+from ivy_models.mlpmixer import mlpmixer
+from ivy_models_tests import helpers
+
+import tensorflow as tf
+from tensorflow import keras
+from keras import layers
+import jax
+
+jax.config.update("jax_enable_x64", False)
+
+load_weights = True
+model = mlpmixer(pretrained=load_weights)
+v = ivy.to_numpy(model.v)
+
+
+@pytest.mark.parametrize("data_format", ["NHWC", "NCHW"])
+def test_mlpmixer_tiny_img_classification(device, fw, data_format):
+    """Test MLPMixer image classification."""
+    num_classes = 10
+    batch_shape = [1]
+    this_dir = os.path.dirname(os.path.realpath(__file__))
+
+    # Load image
+    img = helpers.load_image_in_np(
+        os.path.join(this_dir, "..", "..", "images", "car.jpg")
+    )
+
+    # Preprocess the image
+    def get_augmentation_layers():
+        data_augmentation = keras.Sequential(
+            [
+                layers.experimental.preprocessing.Normalization(
+                    mean=(0.5, 0.5, 0.5), variance=(0.25, 0.25, 0.25)
+                ),
+                layers.experimental.preprocessing.Resizing(72, 72),
+                layers.experimental.preprocessing.RandomFlip("horizontal"),
+                layers.experimental.preprocessing.RandomRotation(factor=0.02),
+                layers.experimental.preprocessing.RandomZoom(
+                    height_factor=0.2, width_factor=0.2
+                ),
+            ],
+            name="data_augmentation",
+        )
+        return data_augmentation
+
+    data_augmentation = get_augmentation_layers()
+    img = data_augmentation(img)
+    img = tf.expand_dims(img, 0).numpy()
+    img = ivy.asarray(img)
+    if data_format == "NCHW":
+        img = ivy.permute_dims(img, (0, 3, 1, 2))
+
+    model.v = ivy.asarray(v)
+    logits = model(img, data_format=data_format)
+
+    # Cardinality test
+    assert logits.shape == tuple([ivy.to_scalar(batch_shape), num_classes])
+
+    # Value test
+    if load_weights:
+        np_out = ivy.to_numpy(logits)
+        true_indices = np.array([4, 7, 2, 9])
+        calc_indices = np.argsort(np_out[0])[-4:][::-1]
+        assert np.array_equal(np.sort(true_indices), np.sort(calc_indices))
+
+        true_logits = np.array([0.4022081, 0.24405026, 0.14345096, 0.12923254])
+        calc_logits = np.take(np_out, calc_indices)
+        assert np.allclose(true_logits, calc_logits, rtol=1e-2, atol=1e-1)
diff --git a/ivy_models_tests/squeezenet/test_squeezenet.py b/ivy_models_tests/squeezenet/test_squeezenet.py
index af77641c..ddef8a47 100644
--- a/ivy_models_tests/squeezenet/test_squeezenet.py
+++ b/ivy_models_tests/squeezenet/test_squeezenet.py
@@ -12,10 +12,9 @@
     "squeezenet1_1": squeezenet1_1,
 }
 
-ivy.seed(seed_value=42)
 load_weights = random.choice([False, True])
 model_var = random.choice(list(VARIANTS.keys()))
-model = VARIANTS[model_var](pretrained=load_weights)
+model = VARIANTS[model_var](dropout=0, pretrained=load_weights)
 v = ivy.to_numpy(model.v)
diff --git a/ivy_models_tests/vit/test_vit.py b/ivy_models_tests/vit/test_vit.py
index 381385d3..4263e4a7 100644
--- a/ivy_models_tests/vit/test_vit.py
+++ b/ivy_models_tests/vit/test_vit.py
@@ -1,31 +1,60 @@
 import os
-import ivy
-import pytest
 import numpy as np
-
-from ivy_models import vit_b_16
+import pytest
+import random
+import ivy
 from ivy_models_tests import helpers
+from ivy_models.vit import (
+    vit_b_16,
+    vit_b_32,
+    vit_l_16,
+    vit_l_32,
+)
 
-import jax
 
-jax.config.update("jax_enable_x64", False)
+VARIANTS = {
+    "vit_b_16": vit_b_16,
+    "vit_b_32": vit_b_32,
+    "vit_l_16": vit_l_16,
+    "vit_l_32": vit_l_32,
+}
 
-@pytest.mark.parametrize("batch_shape", [[1]])
-@pytest.mark.parametrize("load_weights", [False, True])
-def test_alexnet_tiny_img_classification(device, fw, batch_shape, load_weights):
+LOGITS = {
+    "vit_b_16": [282, 281, 285, 287, 292],
+    "vit_b_32": [282, 281, 285, 287, 292],
+    "vit_l_16": [255, 281, 282, 285, 292],
+    "vit_l_32": [282, 281, 285, 287, 292],
+}
+
+
+load_weights = random.choice([False, True])
+model_var = random.choice(list(VARIANTS.keys()))
+model = VARIANTS[model_var](pretrained=load_weights)
+v = ivy.to_numpy(model.v)
+
+
+@pytest.mark.parametrize("data_format", ["NHWC", "NCHW"])
+def test_vit_img_classification(device, fw, data_format):
     """Test ViT image classification."""
     num_classes = 1000
+    batch_shape = [1]
     this_dir = os.path.dirname(os.path.realpath(__file__))
 
     # Load image
-    img = helpers.load_and_preprocess_img(
-        os.path.join(this_dir, "..", "..", "images", "cat.jpg"), 256, 224
+    img = ivy.asarray(
+        helpers.load_and_preprocess_img(
+            os.path.join(this_dir, "..", "..", "images", "cat.jpg"),
+            256,
+            224,
+            data_format=data_format,
+            to_ivy=True,
+        ),
     )
-    img = ivy.permute_dims(img, (0, 3, 1, 2))
 
-    model = vit_b_16(pretrained=load_weights)
-    logits = model(img)
+    # Create model
+    model.v = ivy.asarray(v)
+    logits = model(img, data_format=data_format)
 
     # Cardinality test
     assert logits.shape == tuple([ivy.to_scalar(batch_shape), num_classes])
@@ -33,10 +62,6 @@ def test_alexnet_tiny_img_classification(device, fw, batch_shape, load_weights):
     # Value test
     if load_weights:
         np_out = ivy.to_numpy(logits[0])
-        true_indices = np.array([282, 281, 285, 287, 896])
-        calc_indices = np.argsort(np_out)[-5:][::-1]
+        true_indices = np.sort(np.array(LOGITS[model_var]))
+        calc_indices = np.sort(np.argsort(np_out)[-5:][::-1])
         assert np.array_equal(true_indices, calc_indices)
-
-        true_logits = np.array([23.5786, 22.791977, 20.917543, 19.49762, 16.102253])
-        calc_logits = np.take(np_out, calc_indices)
-        assert np.allclose(true_logits, calc_logits, rtol=1e-3)
diff --git a/requirements.txt b/requirements.txt
index 309e21c3..c6780027 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-git+https://github.com/unifyai/ivy.git@master
+git+https://github.com/unifyai/ivy.git@main
diff --git a/setup.py b/setup.py
index fa690915..4276877a 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,7 @@ def _replace_logos_html(txt):
     backends_chunk = chunks[2]
     bc = backends_chunk.split("\n\n")
     img_str = (
-        ".. image:: https://github.com/unifyai/unifyai.github.io/blob/master/img/externally_linked/logos/supported/frameworks.png?raw=true\n"  # noqa
+        ".. image:: https://github.com/unifyai/unifyai.github.io/blob/main/img/externally_linked/logos/supported/frameworks.png?raw=true\n"  # noqa
        "   :width: 100%"
     )
     backends_chunk = "\n\n".join(bc[0:1] + [img_str] + bc[2:])
@@ -39,7 +39,7 @@ def _replace_logos_html(txt):
     libraries_chunk = chunks[3]
     lc = libraries_chunk.split("\n\n")
     img_str = (
-        ".. image:: https://github.com/unifyai/unifyai.github.io/blob/master/img/externally_linked/ivy_libraries.png?raw=true\n"  # noqa
+        ".. image:: https://github.com/unifyai/unifyai.github.io/blob/main/img/externally_linked/ivy_libraries.png?raw=true\n"  # noqa
        "   :width: 100%"
     )
     libraries_chunk = "\n\n".join(lc[0:1] + [img_str] + lc[2:])