Skip to content

Commit 0c3d873

Browse files
committed
API tweaks and added docstrings for utility layers
1 parent 9ec00dc commit 0c3d873

File tree

2 files changed

+25
-28
lines changed

2 files changed

+25
-28
lines changed

src/layers.jl

+22-17
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,11 @@ function mlpblock(planes, hidden_planes, dropout = 0., dense = Dense; activation
107107
dense(hidden_planes, planes, activation), Dropout(dropout))
108108
end
109109

110-
# Patching layer used by many vision transformer-like models
110+
"""
111+
Patching{T <: Integer}
112+
Patching layer used by many vision transformer-like models to split the input image into patches.
113+
Can be instantiated with a tuple `(patch_height, patch_width)` or a single value `patch_size`.
114+
"""
111115
struct Patching{T <: Integer}
112116
patch_height::T
113117
patch_width::T
@@ -125,32 +129,33 @@ end
125129

126130
@functor Patching
127131

128-
# Positional embedding layer used by many vision transformer-like models
129-
struct PosEmbedding
130-
embedding_vector
132+
"""
133+
PosEmbedding{T}
134+
135+
Positional embedding layer used by many vision transformer-like models. Instantiated with an
136+
embedding vector which is a learnable parameter.
137+
"""
138+
struct PosEmbedding{T}
139+
embedding_vector::T
131140
end
132141

133142
(p::PosEmbedding)(x) = x .+ p.embedding_vector[:, 1:size(x)[2], :]
134143

135144
@functor PosEmbedding
136145

137-
# Class tokens used by many vision transformer-like models
138-
struct CLSTokens
139-
cls_token
146+
"""
147+
CLSTokens{T}
148+
149+
Appends class tokens to the input that are used for classification by many vision
150+
transformer-like models. Instantiated with a class token vector which is a learnable parameter.
151+
"""
152+
struct CLSTokens{T}
153+
cls_token::T
140154
end
141155

142156
function(m::CLSTokens)(x)
143157
cls_tokens = repeat(m.cls_token, 1, 1, size(x)[3])
144-
x = cat(cls_tokens, x; dims = 2)
158+
return cat(cls_tokens, x; dims = 2)
145159
end
146160

147161
@functor CLSTokens
148-
149-
# Utility function to decide if mean pooling happens inside the model
150-
struct CLSPooling
151-
mode
152-
end
153-
154-
(m::CLSPooling)(x) = (m.mode == "cls") ? x[:, 1, :] : _seconddimmean(x)
155-
156-
@functor CLSPooling

src/vit-based/vit.jl

+3-11
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,6 @@ end
4646

4747
@functor MHAttention
4848

49-
struct Transformer
50-
layers
51-
end
52-
5349
"""
5450
Transformer(planes, depth, heads, headplanes, mlppanes, dropout = 0.)
5551
@@ -69,13 +65,9 @@ function Transformer(planes, depth, heads, headplanes, mlpplanes, dropout = 0.)
6965
SkipConnection(prenorm(planes, mlpblock(planes, mlpplanes, dropout)), +))
7066
for _ in 1:depth]
7167

72-
Transformer(Chain(layers...))
68+
Chain(layers...)
7369
end
7470

75-
(m::Transformer)(x) = m.layers(x)
76-
77-
@functor Transformer
78-
7971
"""
8072
vit(imsize::NTuple{2} = (256, 256); inchannels = 3, patch_size = (16, 16), planes = 1024,
8173
depth = 6, heads = 16, mlppanes = 2048, headplanes = 64, dropout = 0.1, emb_dropout = 0.1,
@@ -120,7 +112,7 @@ function vit(imsize::NTuple{2} = (256, 256); inchannels = 3, patch_size = (16, 1
120112
PosEmbedding(rand(Float32, (planes, num_patches + 1, 1))),
121113
Dropout(emb_dropout),
122114
Transformer(planes, depth, heads, headplanes, mlppanes, dropout),
123-
CLSPooling(pool),
115+
(pool == "cls") ? x -> x[:, 1, :] : x -> _seconddimmean(x),
124116
Chain(LayerNorm(planes), Dense(planes, nclasses)))
125117
end
126118

@@ -164,6 +156,6 @@ end
164156
(m::ViT)(x) = m.layers(x)
165157

166158
backbone(m::ViT) = m.layers[1:end-1]
167-
classifier(m::MLPMixer) = m.layers[end]
159+
classifier(m::ViT) = m.layers[end]
168160

169161
@functor ViT

0 commit comments

Comments (0)