From f73f5529a3e81852e18c0f79eea7518809442a45 Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Thu, 10 Nov 2022 15:01:08 +0800
Subject: [PATCH 01/16] add vit-adapter and align backbone forward

---
 configs/vit_adapter/README.md                 |  15 +
 ...rnet_deit_adapter_tiny_512_160k_ade20k.yml |  77 +++
 paddleseg/models/backbones/__init__.py        |   1 +
 paddleseg/models/backbones/vit_adapter.py     | 458 +++++++++++++++++
 paddleseg/models/layers/vit_adapter_layers.py | 462 ++++++++++++++++++
 5 files changed, 1013 insertions(+)
 create mode 100644 configs/vit_adapter/README.md
 create mode 100644 configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
 create mode 100644 paddleseg/models/backbones/vit_adapter.py
 create mode 100644 paddleseg/models/layers/vit_adapter_layers.py

diff --git a/configs/vit_adapter/README.md b/configs/vit_adapter/README.md
new file mode 100644
index 0000000000..d7a15b0ca3
--- /dev/null
+++ b/configs/vit_adapter/README.md
@@ -0,0 +1,15 @@
+# Vision Transformer Adapter for Dense Predictions
+
+## Reference
+
+> Zhe Chen, Yuchen Duan, Wenhai Wang, Junjun He, Tong Lu, Jifeng Dai, Yu Qiao:
+Vision Transformer Adapter for Dense Predictions. arXiv preprint arXiv:2205.08534 (2022).
+
+## Performance
+
+### ADE20K
+
+| Model | Backbone | Resolution | Training Iters | mIoU | mIoU (flip) | mIoU (ms+flip) | Links |
+|-|-|-|-|-|-|-|-|
+|UPerNet|ViT-Adapter-Tiny|512x512|160000|-|-|-|-|
+<!-- Results and download links will be added once training is aligned with the reference implementation. -->
diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
new file mode 100644
index 0000000000..32bfe32140
--- /dev/null
+++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
@@ -0,0 +1,77 @@
+_base_: '../_base_/ade20k.yml'
+
+batch_size: 4  # total batch size is 16
+iters: 160000
+
+train_dataset:
+  transforms:
+    - type: ResizeStepScaling
+      min_scale_factor: 0.5
+      max_scale_factor: 2.0
+      scale_step_size: 0.25
+    - type: RandomPaddingCrop
+      crop_size: [512, 512]
+    - type: RandomHorizontalFlip
+    - type: RandomDistort
+      brightness_range: 0.4
+      contrast_range: 0.4
+      saturation_range: 0.4
+    - type: Normalize
+      mean: [0.485, 0.456, 0.406]
+      std: [0.229, 0.224, 0.225]
+
+val_dataset:
+  transforms:
+    - type: Resize
+      target_size: [2048, 512]
+      keep_ratio: True
+      size_divisor: 32
+    - type: Normalize
+      mean: [0.485, 0.456, 0.406]
+      std: [0.229, 0.224, 0.225]
+
+export:
+  transforms:
+    - type: Resize
+      target_size: [2048, 512]
+      keep_ratio: True
+      size_divisor: 32
+    - type: Normalize
+      mean: [0.485, 0.456, 0.406]
+      std: [0.229, 0.224, 0.225]
+
+optimizer:
+  _inherited_: False
+  type: AdamW
+  weight_decay: 0.01
+
+lr_scheduler:
+  type: PolynomialDecay
+  learning_rate: 0.0012
+  end_lr: 0
+  power: 1.0
+  warmup_iters: 1500
+  warmup_start_lr: 1.0e-6
+
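+# Note: with power 1.0, PolynomialDecay lowers the learning rate linearly from
+# learning_rate down to end_lr over the course of training, after a linear
+# warmup from warmup_start_lr during the first warmup_iters steps.
+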
+loss: + types: + - type: CrossEntropyLoss + coef: [1] + +model: + type: TopFormer + backbone: + type: ViTAdapter + num_heads: 3 + patch_size: 16 + embed_dim: 192 + depth: 12 + mlp_ratio: 4 + drop_path_rate: 0.1 + conv_inplane: 64 + n_points: 4 + deform_num_heads: 6 + cffn_ratio: 0.25 + deform_ratio: 1.0 + interaction_indexes: [[0, 2], [3, 5], [6, 8], [9, 11]] + pretrained: pretrained_model/upernet_deit_adapter_tiny_512_160_ade20k_from_torch.pdparams \ No newline at end of file diff --git a/paddleseg/models/backbones/__init__.py b/paddleseg/models/backbones/__init__.py index 2241aaf77a..d5088b42c5 100644 --- a/paddleseg/models/backbones/__init__.py +++ b/paddleseg/models/backbones/__init__.py @@ -26,3 +26,4 @@ from .ghostnet import * from .top_transformer import * from .uhrnet import * +from .vit_adapter import * diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py new file mode 100644 index 0000000000..3e4269423b --- /dev/null +++ b/paddleseg/models/backbones/vit_adapter.py @@ -0,0 +1,458 @@ +# The ViT-Adapter code was heavily based on https://github.com/czczup/ViT-Adapter + +import math +from functools import partial + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Uniform, KaimingNormal +from paddle.nn import Conv2D, BatchNorm, AdaptiveAvgPool2D, Linear + +from paddleseg.cvlibs import manager +from paddleseg.cvlibs.param_init import normal_init, trunc_normal_init, constant_init +from paddleseg.utils import utils, logger +from paddleseg.models.backbones.transformer_utils import to_2tuple, DropPath + +from paddleseg.models.layers.vit_adapter_layers import SpatialPriorModule, InteractionBlock, deform_inputs + +__all__ = ['ViTAdapter'] + + +class PatchEmbed(nn.Layer): + """2D Image to Patch Embedding.""" + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], + img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + if self.flatten: + x = x.flatten(2).transpose([0, 2, 1]) # BCHW -> BNC + x = self.norm(x) + return x, H, W + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + 
self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, H, W): + x_shape = paddle.shape(x) + N, C = x_shape[1], x_shape[2] + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // + self.num_heads)).transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + layer_scale=False): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity( + ) + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.layer_scale = layer_scale + if layer_scale: + self.gamma1 = self.create_parameter( + shape=(dim, ), + default_initializer=paddle.nn.initializer.Constant(value=1.)) + self.gamma2 = self.create_parameter( + shape=(dim, ), + default_initializer=paddle.nn.initializer.Constant(value=1.)) + + def forward(self, x, H, W): + if self.layer_scale: + x = x + self.drop_path(self.gamma1 * self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class VisionTransformer(nn.Layer): + """Vision Transformer. 
+ + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + + Includes distillation token & head support for `DeiT: Data-efficient Image Transformers` + - https://arxiv.org/abs/2012.12877 + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + layer_scale=True, + embed_layer=PatchEmbed, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + act_layer=nn.GELU, + pretrained=None): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_channels (int): number of input channels + num_classes (int): number of classes for classification head + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + pretrained: (str): pretrained path + """ + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + norm_layer = norm_layer or partial(nn.LayerNorm, epsilon=1e-6) + act_layer = act_layer or nn.GELU + self.norm_layer = norm_layer + self.act_layer = act_layer + self.pretrain_size = img_size + self.drop_path_rate = drop_path_rate + self.drop_rate = drop_rate + + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_channels, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.pos_embed = self.create_parameter( + shape=(1, num_patches + self.num_tokens, embed_dim), + default_initializer=paddle.nn.initializer.Constant(value=0.)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = np.linspace(0, drop_path_rate, + depth) # stochastic depth decay rule + self.blocks = nn.Sequential(*[ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + layer_scale=layer_scale) for i in range(depth) + ]) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + utils.load_pretrained_model(self, self.pretrained) + + def forward_features(self, x): + x, H, W = self.patch_embed(x) + cls_token = self.cls_token.expand( + x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = paddle.concat([cls_tokens, x], axis=1) + x = self.pos_drop(x + self.pos_embed) + for blk in self.blocks: + x = blk(x, H, W) + return x + + def forward(self, x): + x = self.forward_features(x) + return x + + +@manager.BACKBONES.add_component +class ViTAdapter(VisionTransformer): + """ The ViT-Adapter + """ + + def __init__(self, + pretrain_size=224, + num_heads=12, + conv_inplane=64, + n_points=4, + deform_num_heads=6, + init_values=0., + interaction_indexes=None, + with_cffn=True, + cffn_ratio=0.25, + deform_ratio=1.0, + add_vit_feature=True, + pretrained=None, + use_extra_extractor=True, + *args, + **kwargs): + + super().__init__( + num_heads=num_heads, pretrained=pretrained, *args, **kwargs) + + 
self.cls_token = None + self.num_block = len(self.blocks) + self.pretrain_size = (pretrain_size, pretrain_size) + self.interaction_indexes = interaction_indexes + self.add_vit_feature = add_vit_feature + embed_dim = self.embed_dim + + self.level_embed = self.create_parameter( + shape=(3, embed_dim), + default_initializer=paddle.nn.initializer.Constant(value=0.)) + self.spm = SpatialPriorModule( + inplanes=conv_inplane, embed_dim=embed_dim) + self.interactions = nn.Sequential(*[ + InteractionBlock( + dim=embed_dim, + num_heads=deform_num_heads, + n_points=n_points, + init_values=init_values, + drop_path=self.drop_path_rate, + norm_layer=self.norm_layer, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + deform_ratio=deform_ratio, + extra_extractor=((True if i == len(interaction_indexes) - 1 else + False) and use_extra_extractor)) + for i in range(len(interaction_indexes)) + ]) + self.up = nn.Conv2DTranspose(embed_dim, embed_dim, 2, 2) + self.norm1 = nn.SyncBatchNorm(embed_dim) + self.norm2 = nn.SyncBatchNorm(embed_dim) + self.norm3 = nn.SyncBatchNorm(embed_dim) + self.norm4 = nn.SyncBatchNorm(embed_dim) + + self.up.apply(self._init_weights) + self.spm.apply(self._init_weights) + self.interactions.apply(self._init_weights) + self.apply(self._init_deform_weights) + normal_init(self.level_embed) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_init(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + constant_init(m.bias, value=0) + elif isinstance(m, nn.LayerNorm) or isinstance(m, (nn.BatchNorm2D, + nn.SyncBatchNorm)): + constant_init(m.bias, value=0) + constant_init(m.weight, value=1.0) + elif isinstance(m, nn.Conv2D) or isinstance(m, nn.Conv2DTranspose): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + normal_init(m.weight, std=math.sqrt(2.0 / fan_out)) + if m.bias is not None: + constant_init(m.bias, value=0) + + def _get_pos_embed(self, pos_embed, H, W): + pos_embed = pos_embed.reshape( + [1, self.pretrain_size[0] // 16, self.pretrain_size[1] // 16, + -1]).transpose([0, 3, 1, 2]) + pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False).\ + reshape([1, -1, H * W]).transpose([0, 2, 1]) + return pos_embed + + def _init_deform_weights(self, m): + ''' + if isinstance(m, MSDeformAttn): + m._reset_parameters() + ''' + pass + + def _add_level_embed(self, c2, c3, c4): + c2 = c2 + self.level_embed[0] + c3 = c3 + self.level_embed[1] + c4 = c4 + self.level_embed[2] + return c2, c3, c4 + + def forward(self, x): + debug = True + if debug: + import random + import numpy as np + random.seed(0) + np.random.seed(0) + x = np.random.rand(1, 3, 512, 512).astype("float32") + x = paddle.to_tensor(x, dtype='float32') + print('x0:', x.numpy().mean()) + + deform_inputs1, deform_inputs2 = deform_inputs(x) + + # SPM forward + c1, c2, c3, c4 = self.spm(x) + c2, c3, c4 = self._add_level_embed(c2, c3, c4) + c = paddle.concat([c2, c3, c4], axis=1) + + if debug: + print('----2----') + for i in deform_inputs1: + print(i.numpy().mean()) + for i in deform_inputs2: + print(i.numpy().mean()) + + # Patch Embedding forward + x, H, W = self.patch_embed(x) + bs, n, dim = x.shape + pos_embed = self._get_pos_embed(self.pos_embed[:, 1:], H, W) + x = self.pos_drop(x + pos_embed) + + if debug: + print('-------3----') + print(x.numpy().mean()) + + # Interaction + outs = list() + for i, layer in enumerate(self.interactions): + indexes = self.interaction_indexes[i] + x, c = layer(x, c, 
self.blocks[indexes[0]:indexes[-1] + 1], + deform_inputs1, deform_inputs2, H, W) + outs.append(x.transpose([0, 2, 1]).reshape([bs, dim, H, W])) + if debug: + print('-----4-{}------'.format(i)) + print(x.numpy().mean()) + print(c.numpy().mean()) + +# Split & Reshape + c2 = c[:, 0:c2.shape[1], :] + c3 = c[:, c2.shape[1]:c2.shape[1] + c3.shape[1], :] + c4 = c[:, c2.shape[1] + c3.shape[1]:, :] + + c2 = c2.transpose([0, 2, 1]).reshape([bs, dim, H * 2, W * 2]) + c3 = c3.transpose([0, 2, 1]).reshape([bs, dim, H, W]) + c4 = c4.transpose([0, 2, 1]).reshape([bs, dim, H // 2, W // 2]) + c1 = self.up(c2) + c1 + + if self.add_vit_feature: + x1, x2, x3, x4 = outs + x1 = F.interpolate( + x1, scale_factor=4, mode='bilinear', align_corners=False) + x2 = F.interpolate( + x2, scale_factor=2, mode='bilinear', align_corners=False) + x4 = F.interpolate( + x4, scale_factor=0.5, mode='bilinear', align_corners=False) + c1, c2, c3, c4 = c1 + x1, c2 + x2, c3 + x3, c4 + x4 + + # Final Norm + f1 = self.norm1(c1) + f2 = self.norm2(c2) + f3 = self.norm3(c3) + f4 = self.norm4(c4) + if debug: + print('-----5------') + print(f1.cpu().numpy().mean()) + print(f2.cpu().numpy().mean()) + print(f3.cpu().numpy().mean()) + print(f4.cpu().numpy().mean()) + # f1 = f1.cpu().numpy().mean() + # with msdeformatt + #assert np.allclose(f1, -0.03254774, rtol=0.0, atol=1e-6) + # without msdeformatt + #assert np.allclose(f1, -0.024487903, rtol=0.0, atol=1e-6) + exit() + return [f1, f2, f3, f4] diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py new file mode 100644 index 0000000000..a73581890c --- /dev/null +++ b/paddleseg/models/layers/vit_adapter_layers.py @@ -0,0 +1,462 @@ +# The ViT-Adapter code was heavily based on https://github.com/czczup/ViT-Adapter + +from functools import partial + +import paddle +import paddle.nn as nn +from paddleseg.models.backbones.transformer_utils import DropPath + + +def get_reference_points(spatial_shapes): + reference_points_list = [] + for _, (H_, W_) in enumerate(spatial_shapes): + ref_y, ref_x = paddle.meshgrid( + paddle.linspace( + 0.5, H_ - 0.5, H_, dtype='float32'), + paddle.linspace( + 0.5, W_ - 0.5, W_, dtype='float32')) + ref_y = ref_y.reshape([1, -1]) / H_ + ref_x = ref_x.reshape([1, -1]) / W_ + ref = paddle.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = paddle.concat(reference_points_list, 1) + reference_points = paddle.unsqueeze(reference_points, axis=2) + return reference_points + + +def deform_inputs(x): + bs, c, h, w = x.shape + spatial_shapes = paddle.to_tensor( + [(h // 8, w // 8), (h // 16, w // 16), (h // 32, w // 32)], + dtype='int64') + level_start_index = paddle.concat((paddle.zeros( + (1, ), dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = get_reference_points([(h // 16, w // 16)]) + deform_inputs1 = [reference_points, spatial_shapes, level_start_index] + + spatial_shapes = paddle.to_tensor([(h // 16, w // 16)], dtype='int64') + level_start_index = paddle.concat((paddle.zeros( + (1, ), dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = get_reference_points( + [(h // 8, w // 8), (h // 16, w // 16), (h // 32, w // 32)]) + deform_inputs2 = [reference_points, spatial_shapes, level_start_index] + + return deform_inputs1, deform_inputs2 + + +class ConvFFN(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + 
hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class DWConv(nn.Layer): + def __init__(self, dim=768): + super().__init__() + self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + n = N // 21 + x1 = x[:, 0:16 * n, :].transpose([0, 2, 1]).reshape( + [B, C, H * 2, W * 2]) + x2 = x[:, 16 * n:20 * n, :].transpose([0, 2, 1]).reshape([B, C, H, W]) + x3 = x[:, 20 * n:, :].transpose([0, 2, 1]).reshape( + [B, C, H // 2, W // 2]) + x1 = self.dwconv(x1).flatten(2).transpose([0, 2, 1]) + x2 = self.dwconv(x2).flatten(2).transpose([0, 2, 1]) + x3 = self.dwconv(x3).flatten(2).transpose([0, 2, 1]) + x = paddle.concat([x1, x2, x3], axis=1) + return x + + +class Extractor(nn.Layer): + def __init__(self, + dim, + num_heads=6, + n_points=4, + n_levels=1, + deform_ratio=1.0, + with_cffn=True, + cffn_ratio=0.25, + drop=0., + drop_path=0., + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6)): + super().__init__() + self.query_norm = norm_layer(dim) + self.feat_norm = norm_layer(dim) + ''' + self.attn = MSDeformAttn(d_model=dim, n_levels=n_levels, n_heads=num_heads, + n_points=n_points, ratio=deform_ratio) + ''' + self.with_cffn = with_cffn + if with_cffn: + self.ffn = ConvFFN( + in_features=dim, + hidden_features=int(dim * cffn_ratio), + drop=drop) + self.ffn_norm = norm_layer(dim) + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + + def forward(self, query, reference_points, feat, spatial_shapes, + level_start_index, H, W): + def _inner_forward(query, feat): + ''' + attn = self.attn(self.query_norm(query), reference_points, + self.feat_norm(feat), spatial_shapes, + level_start_index, None) + query = query + attn + ''' + + if self.with_cffn: + query = query + self.drop_path( + self.ffn(self.ffn_norm(query), H, W)) + return query + + query = _inner_forward(query, feat) + + return query + + +class Injector(nn.Layer): + def __init__(self, + dim, + num_heads=6, + n_points=4, + n_levels=1, + deform_ratio=1.0, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + init_values=0.): + super().__init__() + self.query_norm = norm_layer(dim) + self.feat_norm = norm_layer(dim) + ''' + self.attn = MSDeformAttn(d_model=dim, n_levels=n_levels, n_heads=num_heads, + n_points=n_points, ratio=deform_ratio) + ''' + self.gamma = self.create_parameter( + shape=(dim, ), + default_initializer=paddle.nn.initializer.Constant( + value=init_values)) + + def forward(self, query, reference_points, feat, spatial_shapes, + level_start_index): + def _inner_forward(query, feat): + ''' + attn = self.attn(self.query_norm(query), reference_points, + self.feat_norm(feat), spatial_shapes, + level_start_index, None) + return query + self.gamma * attn + ''' + return query + + query = _inner_forward(query, feat) + + return query + + +class InteractionBlock(nn.Layer): + def __init__(self, + dim, + num_heads=6, + n_points=4, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + drop=0., + drop_path=0., + with_cffn=True, + cffn_ratio=0.25, + init_values=0., + deform_ratio=1.0, + extra_extractor=False): + super().__init__() + + self.injector = Injector( + dim=dim, + n_levels=3, + num_heads=num_heads, + init_values=init_values, + n_points=n_points, + norm_layer=norm_layer, + deform_ratio=deform_ratio) + self.extractor = Extractor( + dim=dim, + n_levels=1, + num_heads=num_heads, + n_points=n_points, + norm_layer=norm_layer, + deform_ratio=deform_ratio, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + drop=drop, + drop_path=drop_path) + if extra_extractor: + self.extra_extractors = nn.Sequential(*[ + Extractor( + dim=dim, + num_heads=num_heads, + n_points=n_points, + norm_layer=norm_layer, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + deform_ratio=deform_ratio, + drop=drop, + drop_path=drop_path) for _ in range(2) + ]) + else: + self.extra_extractors = None + + def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W): + debug = False + x = self.injector( + query=x, + reference_points=deform_inputs1[0], + feat=c, + spatial_shapes=deform_inputs1[1], + level_start_index=deform_inputs1[2]) + if debug: + print('x', x.cpu().numpy().mean()) + + for idx, blk in enumerate(blocks): + x = blk(x, H, W) + if debug: + print('x block_{}'.format(idx), x.cpu().numpy().mean()) + + c = self.extractor( + query=c, + reference_points=deform_inputs2[0], + feat=x, + spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], + H=H, + W=W) + if debug: + print('c', c.cpu().numpy().mean()) + + if self.extra_extractors is not None: + for extractor in self.extra_extractors: + c = extractor( + query=c, + reference_points=deform_inputs2[0], + feat=x, + spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], + H=H, + W=W) + if debug: + print('c', c.cpu().numpy().mean()) + + return x, c + + +class InteractionBlockWithCls(nn.Layer): + def __init__(self, + dim, + num_heads=6, + n_points=4, + norm_layer=partial( + nn.LayerNorm, 
epsilon=1e-6),
+                 drop=0.,
+                 drop_path=0.,
+                 with_cffn=True,
+                 cffn_ratio=0.25,
+                 init_values=0.,
+                 deform_ratio=1.0,
+                 extra_extractor=False):
+        super().__init__()
+
+        self.injector = Injector(
+            dim=dim,
+            n_levels=3,
+            num_heads=num_heads,
+            init_values=init_values,
+            n_points=n_points,
+            norm_layer=norm_layer,
+            deform_ratio=deform_ratio)
+        self.extractor = Extractor(
+            dim=dim,
+            n_levels=1,
+            num_heads=num_heads,
+            n_points=n_points,
+            norm_layer=norm_layer,
+            deform_ratio=deform_ratio,
+            with_cffn=with_cffn,
+            cffn_ratio=cffn_ratio,
+            drop=drop,
+            drop_path=drop_path)
+        if extra_extractor:
+            self.extra_extractors = nn.Sequential(*[
+                Extractor(
+                    dim=dim,
+                    num_heads=num_heads,
+                    n_points=n_points,
+                    norm_layer=norm_layer,
+                    with_cffn=with_cffn,
+                    cffn_ratio=cffn_ratio,
+                    deform_ratio=deform_ratio,
+                    drop=drop,
+                    drop_path=drop_path) for _ in range(2)
+            ])
+        else:
+            self.extra_extractors = None
+
+    def forward(self, x, c, cls, blocks, deform_inputs1, deform_inputs2, H, W):
+        x = self.injector(
+            query=x,
+            reference_points=deform_inputs1[0],
+            feat=c,
+            spatial_shapes=deform_inputs1[1],
+            level_start_index=deform_inputs1[2])
+        x = paddle.concat((cls, x), axis=1)
+        for idx, blk in enumerate(blocks):
+            x = blk(x, H, W)
+        cls, x = x[:, :1, ], x[:, 1:, ]
+        c = self.extractor(
+            query=c,
+            reference_points=deform_inputs2[0],
+            feat=x,
+            spatial_shapes=deform_inputs2[1],
+            level_start_index=deform_inputs2[2],
+            H=H,
+            W=W)
+        if self.extra_extractors is not None:
+            for extractor in self.extra_extractors:
+                c = extractor(
+                    query=c,
+                    reference_points=deform_inputs2[0],
+                    feat=x,
+                    spatial_shapes=deform_inputs2[1],
+                    level_start_index=deform_inputs2[2],
+                    H=H,
+                    W=W)
+        return x, c, cls
+
+
+class SpatialPriorModule(nn.Layer):
+    def __init__(self, inplanes=64, embed_dim=384):
+        super().__init__()
+
+        self.stem = nn.Sequential(*[
+            nn.Conv2D(
+                3,
+                inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias_attr=False), nn.SyncBatchNorm(inplanes), nn.ReLU(),
+            nn.Conv2D(
+                inplanes,
+                inplanes,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias_attr=False), nn.SyncBatchNorm(inplanes), nn.ReLU(),
+            nn.Conv2D(
+                inplanes,
+                inplanes,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias_attr=False), nn.SyncBatchNorm(inplanes), nn.ReLU(),
+            nn.MaxPool2D(
+                kernel_size=3, stride=2, padding=1)
+        ])
+        self.conv2 = nn.Sequential(*[
+            nn.Conv2D(
+                inplanes,
+                2 * inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias_attr=False), nn.SyncBatchNorm(2 * inplanes), nn.ReLU()
+        ])
+        self.conv3 = nn.Sequential(*[
+            nn.Conv2D(
+                2 * inplanes,
+                4 * inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias_attr=False), nn.SyncBatchNorm(4 * inplanes), nn.ReLU()
+        ])
+        self.conv4 = nn.Sequential(*[
+            nn.Conv2D(
+                4 * inplanes,
+                4 * inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias_attr=False), nn.SyncBatchNorm(4 * inplanes), nn.ReLU()
+        ])
+        self.fc1 = nn.Conv2D(
+            inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias_attr=True)
+        self.fc2 = nn.Conv2D(
+            2 * inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias_attr=True)
+        self.fc3 = nn.Conv2D(
+            4 * inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias_attr=True)
+        self.fc4 = nn.Conv2D(
+            4 * inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias_attr=True)
+
+    def forward(self, x):
+        def _inner_forward(x):
+            c1 = self.stem(x)
+            c2 = self.conv2(c1)
+            c3 = self.conv3(c2)
+            c4 = self.conv4(c3)
+            c1 = self.fc1(c1)
+            c2 = self.fc2(c2)
+            c3 = self.fc3(c3)
+            c4 = self.fc4(c4)
+
+            bs, dim, _, 
_ = c1.shape + c2 = c2.reshape([bs, dim, -1]).transpose([0, 2, 1]) # 8s + c3 = c3.reshape([bs, dim, -1]).transpose([0, 2, 1]) # 16s + c4 = c4.reshape([bs, dim, -1]).transpose([0, 2, 1]) # 32s + + return c1, c2, c3, c4 + + outs = _inner_forward(x) + return outs From 3fee1ed6c8f91244a32fb9363f8d8d8b23aa62d8 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 14 Nov 2022 15:24:24 +0800 Subject: [PATCH 02/16] align head infer forward --- ...rnet_deit_adapter_tiny_512_160k_ade20k.yml | 4 +- paddleseg/models/__init__.py | 1 + paddleseg/models/backbones/vit_adapter.py | 4 +- paddleseg/models/upernet_vit_adapter.py | 297 ++++++++++++++++++ 4 files changed, 303 insertions(+), 3 deletions(-) create mode 100644 paddleseg/models/upernet_vit_adapter.py diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml index 32bfe32140..b1d4ac74b3 100644 --- a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml +++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml @@ -59,7 +59,7 @@ loss: coef: [1] model: - type: TopFormer + type: UPerNetViTAdapter backbone: type: ViTAdapter num_heads: 3 @@ -74,4 +74,6 @@ model: cffn_ratio: 0.25 deform_ratio: 1.0 interaction_indexes: [[0, 2], [3, 5], [6, 8], [9, 11]] + backbone_indices: [0, 1, 2, 3] + aux_loss: True pretrained: pretrained_model/upernet_deit_adapter_tiny_512_160_ade20k_from_torch.pdparams \ No newline at end of file diff --git a/paddleseg/models/__init__.py b/paddleseg/models/__init__.py index 1943bc5c86..bd9c3034ba 100644 --- a/paddleseg/models/__init__.py +++ b/paddleseg/models/__init__.py @@ -66,3 +66,4 @@ from .mscale_ocrnet import MscaleOCRNet from .topformer import TopFormer from .rtformer import RTFormer +from .upernet_vit_adapter import UPerNetViTAdapter diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index 3e4269423b..fcdc0cf757 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -301,6 +301,7 @@ def __init__(self, self.interaction_indexes = interaction_indexes self.add_vit_feature = add_vit_feature embed_dim = self.embed_dim + self.feat_channels = [embed_dim] * 4 self.level_embed = self.create_parameter( shape=(3, embed_dim), @@ -418,7 +419,7 @@ def forward(self, x): print(x.numpy().mean()) print(c.numpy().mean()) -# Split & Reshape + # Split & Reshape c2 = c[:, 0:c2.shape[1], :] c3 = c[:, c2.shape[1]:c2.shape[1] + c3.shape[1], :] c4 = c[:, c2.shape[1] + c3.shape[1]:, :] @@ -454,5 +455,4 @@ def forward(self, x): #assert np.allclose(f1, -0.03254774, rtol=0.0, atol=1e-6) # without msdeformatt #assert np.allclose(f1, -0.024487903, rtol=0.0, atol=1e-6) - exit() return [f1, f2, f3, f4] diff --git a/paddleseg/models/upernet_vit_adapter.py b/paddleseg/models/upernet_vit_adapter.py new file mode 100644 index 0000000000..117dbdf426 --- /dev/null +++ b/paddleseg/models/upernet_vit_adapter.py @@ -0,0 +1,297 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+
+
+@manager.MODELS.add_component
+class UPerNetViTAdapter(nn.Layer):
+    """
+    The UPerNet implementation based on PaddlePaddle.
+
+    The original article refers to
+    Tete Xiao, et al. "Unified Perceptual Parsing for Scene Understanding"
+    (https://arxiv.org/abs/1807.10221).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently supporting ViTAdapter.
+        backbone_indices (tuple): Four values in the tuple indicate the indices of output of backbone.
+        channels (int): The channels of intermediate layers in the head. Default: 512.
+        pool_scales (list, optional): The pooling scales of the PPM module. Default: [1, 2, 3, 6].
+        dropout_ratio (float): The dropout ratio of the upernet head. Default: 0.1.
+        aux_loss (bool, optional): Whether to add an auxiliary segmentation head. Default: True.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices,
+                 channels=512,
+                 pool_scales=[1, 2, 3, 6],
+                 dropout_ratio=0.1,
+                 aux_loss=True,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        self.align_corners = align_corners
+
+        in_channels = [self.backbone.feat_channels[i] for i in backbone_indices]
+        self.head = UPerNetHead(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            channels=channels,
+            pool_scales=pool_scales,
+            dropout_ratio=dropout_ratio,
+            aux_loss=aux_loss,
+            align_corners=align_corners)
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        feats = self.backbone(x)
+        feats = [feats[i] for i in self.backbone_indices]
+        logit_list = self.head(feats)
+        logit_list = [
+            F.interpolate(
+                logit,
+                paddle.shape(x)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+        return logit_list
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class ConvBNReLU(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 bias_attr=False,
+                 **kwargs):
+        super().__init__()
+        self.conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size,
+            bias_attr=bias_attr,
+            **kwargs)
+        self.bn = nn.BatchNorm2D(out_channels)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        return x
+
+
+class PPM(nn.Layer):
+    """Pooling Pyramid Module used in PSPNet.
+
+    Args:
+        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
+            Module.
+        in_channels (int): Input channels.
+        channels (int): Channels after modules, before conv_seg.
+        align_corners (bool): align_corners argument of F.interpolate.
+    
+ """ + + def __init__(self, pool_scales, in_channels, channels, align_corners): + super().__init__() + self.pool_scales = pool_scales + self.in_channels = in_channels + self.channels = channels + self.align_corners = align_corners + self.stages = nn.LayerList() + for pool_scale in pool_scales: + self.stages.append( + nn.Sequential( + nn.AdaptiveAvgPool2D(output_size=(pool_scale, pool_scale)), + ConvBNReLU( + in_channels=in_channels, + out_channels=channels, + kernel_size=1))) + + def forward(self, x): + """Forward function.""" + ppm_outs = [] + for ppm in self.stages: + ppm_out = ppm(x) + upsampled_ppm_out = F.interpolate( + ppm_out, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) + ppm_outs.append(upsampled_ppm_out) + return ppm_outs + + +class UPerNetHead(nn.Layer): + """Unified Perceptual Parsing for Scene Understanding. + + This head is the implementation of `UPerNet + `_. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module applied on the last feature. Default: (1, 2, 3, 6). + """ + + def __init__(self, + num_classes, + in_channels, + channels, + pool_scales=[1, 2, 3, 6], + dropout_ratio=0.1, + aux_loss=False, + aux_channels=256, + align_corners=False): + super().__init__() + self.align_corners = align_corners + + # PSP Module + self.psp_modules = PPM(pool_scales, + in_channels[-1], + channels, + align_corners=align_corners) + self.bottleneck = ConvBNReLU( + in_channels[-1] + len(pool_scales) * channels, + channels, + 3, + padding=1) + # FPN Module + self.lateral_convs = nn.LayerList() + self.fpn_convs = nn.LayerList() + for ch in in_channels[:-1]: # skip the top layer + l_conv = ConvBNReLU(ch, channels, 1) + fpn_conv = ConvBNReLU(channels, channels, 3, padding=1) + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + self.fpn_bottleneck = ConvBNReLU( + len(in_channels) * channels, channels, 3, padding=1) + + if dropout_ratio > 0: + self.dropout = nn.Dropout2D(dropout_ratio) + else: + self.dropout = None + self.conv_seg = nn.Conv2D(channels, num_classes, kernel_size=1) + + self.aux_loss = aux_loss + if self.aux_loss: + self.aux_conv = ConvBNReLU( + in_channels[2], aux_channels, 3, padding=1) + self.aux_conv_seg = nn.Conv2D( + aux_channels, num_classes, kernel_size=1) + + def psp_forward(self, inputs): + """Forward function of PSP module.""" + x = inputs[-1] + psp_outs = [x] + psp_outs.extend(self.psp_modules(x)) + psp_outs = paddle.concat(psp_outs, axis=1) + output = self.bottleneck(psp_outs) + return output + + def forward(self, inputs): + """Forward function.""" + debug = True + + if debug: + print('-------head 1----') + for x in inputs: + print(x.shape, x.numpy().mean()) + + # build laterals + laterals = [ + lateral_conv(inputs[i]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + laterals.append(self.psp_forward(inputs)) + + if debug: + print('-------head 2----') + for x in laterals: + print(x.shape, x.numpy().mean()) + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + upsampled = F.interpolate( + laterals[i], + paddle.shape(laterals[i - 1])[2:], + mode='bilinear', + align_corners=self.align_corners) + laterals[i - 1] = laterals[i - 1] + upsampled + + # build outputs + fpn_outs = [ + self.fpn_convs[i](laterals[i]) + for i in range(used_backbone_levels - 1) + ] + fpn_outs.append(laterals[-1]) # append psp feature + + if debug: + print('-------head 3----') + for x in fpn_outs: + print(x.shape, x.numpy().mean()) + + for i in 
range(used_backbone_levels - 1, 0, -1): + fpn_outs[i] = F.interpolate( + fpn_outs[i], + size=paddle.shape(fpn_outs[0])[2:], + mode='bilinear', + align_corners=self.align_corners) + fpn_outs = paddle.concat(fpn_outs, axis=1) + output = self.fpn_bottleneck(fpn_outs) + + if debug: + print('-------head 4----') + print(output.shape, output.numpy().mean()) + + if self.dropout is not None: + output = self.dropout(output) + output = self.conv_seg(output) + logits_list = [output] + + if self.aux_loss: + aux_output = self.aux_conv(inputs[2]) + aux_output = self.aux_conv_seg(aux_output) + logits_list.append(aux_output) + + if debug: + print('-------head 5----') + for x in logits_list: + print(x.shape, x.numpy().mean()) + exit() + return output From 79e597ac4c3dbe91f7a5d2f6e56632a44e2926f6 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Tue, 15 Nov 2022 15:11:01 +0800 Subject: [PATCH 03/16] aling model infer forward with ms_deform_attn --- paddleseg/models/backbones/vit_adapter.py | 9 +- paddleseg/models/layers/vit_adapter_layers.py | 181 ++++++++++++++++-- paddleseg/models/upernet_vit_adapter.py | 2 + 3 files changed, 167 insertions(+), 25 deletions(-) diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index fcdc0cf757..c002467b61 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -1,4 +1,4 @@ -# The ViT-Adapter code was heavily based on https://github.com/czczup/ViT-Adapter +# This is heavily based on https://github.com/czczup/ViT-Adapter import math from functools import partial @@ -18,7 +18,8 @@ from paddleseg.utils import utils, logger from paddleseg.models.backbones.transformer_utils import to_2tuple, DropPath -from paddleseg.models.layers.vit_adapter_layers import SpatialPriorModule, InteractionBlock, deform_inputs +from paddleseg.models.layers.vit_adapter_layers import ( + SpatialPriorModule, InteractionBlock, deform_inputs, MSDeformAttn) __all__ = ['ViTAdapter'] @@ -360,11 +361,8 @@ def _get_pos_embed(self, pos_embed, H, W): return pos_embed def _init_deform_weights(self, m): - ''' if isinstance(m, MSDeformAttn): m._reset_parameters() - ''' - pass def _add_level_embed(self, c2, c3, c4): c2 = c2 + self.level_embed[0] @@ -450,6 +448,7 @@ def forward(self, x): print(f2.cpu().numpy().mean()) print(f3.cpu().numpy().mean()) print(f4.cpu().numpy().mean()) + exit() # f1 = f1.cpu().numpy().mean() # with msdeformatt #assert np.allclose(f1, -0.03254774, rtol=0.0, atol=1e-6) diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py index a73581890c..3f596bf7e9 100644 --- a/paddleseg/models/layers/vit_adapter_layers.py +++ b/paddleseg/models/layers/vit_adapter_layers.py @@ -1,10 +1,16 @@ -# The ViT-Adapter code was heavily based on https://github.com/czczup/ViT-Adapter +# This is heavily based on https://github.com/czczup/ViT-Adapter +import math +import warnings from functools import partial import paddle import paddle.nn as nn +import paddle.nn.functional as F from paddleseg.models.backbones.transformer_utils import DropPath +from paddleseg.cvlibs.param_init import constant_init, xavier_uniform + +import ms_deform_attn as msda # first install ms_deform_attn def get_reference_points(spatial_shapes): @@ -44,6 +50,13 @@ def deform_inputs(x): return deform_inputs1, deform_inputs2 +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError('invalid input for _is_power_of_2: {} (type: {})'. 
+                         format(n, type(n)))
+    return (n & (n - 1) == 0) and n != 0
+
+
 class ConvFFN(nn.Layer):
     def __init__(self,
                  in_features,
@@ -90,6 +103,135 @@ def forward(self, x, H, W):
         return x
 
 
+class MSDeformAttn(nn.Layer):
+    def __init__(self,
+                 d_model=256,
+                 n_levels=4,
+                 n_heads=8,
+                 n_points=4,
+                 ratio=1.0):
+        """Multi-Scale Deformable Attention Module.
+
+        :param d_model hidden dimension
+        :param n_levels number of feature levels
+        :param n_heads number of attention heads
+        :param n_points number of sampling points per attention head per feature level
+        """
+        super().__init__()
+        if d_model % n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads, '
+                             'but got {} and {}'.format(d_model, n_heads))
+        _d_per_head = d_model // n_heads
+        # you'd better set _d_per_head to a power of 2
+        # which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_head):
+            warnings.warn("You'd better set d_model in MSDeformAttn to make "
+                          'the dimension of each attention head a power of 2 '
+                          'which is more efficient in our CUDA implementation.')
+
+        self.im2col_step = 64
+
+        self.d_model = d_model
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+        self.ratio = ratio
+        self.sampling_offsets = nn.Linear(d_model,
+                                          n_heads * n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(d_model,
+                                           n_heads * n_levels * n_points)
+        self.value_proj = nn.Linear(d_model, int(d_model * ratio))
+        self.output_proj = nn.Linear(int(d_model * ratio), d_model)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        constant_init(self.sampling_offsets.weight, value=0.)
+        thetas = paddle.arange(
+            self.n_heads, dtype='float32') * (2.0 * math.pi / self.n_heads)
+        grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init / grid_init.abs().max(
+            -1, keepdim=True)).reshape([self.n_heads, 1, 1, 2]).tile(
+                [1, self.n_levels, self.n_points, 1])
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1
+
+        with paddle.no_grad():
+            grid_init = grid_init.reshape([-1])
+            self.sampling_offsets.bias = self.create_parameter(
+                shape=grid_init.shape,
+                default_initializer=paddle.nn.initializer.Assign(grid_init))
+
+        constant_init(self.attention_weights.weight, value=0.)
+        constant_init(self.attention_weights.bias, value=0.)
+        xavier_uniform(self.value_proj.weight)
+        constant_init(self.value_proj.bias, value=0.)
+        xavier_uniform(self.output_proj.weight)
+        constant_init(self.output_proj.bias, value=0.)
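+        # Net effect of the init above: attention logits start at zero (so the
+        # softmax weights are uniform), sampling offsets have zero weight, and
+        # each head's offset bias points along its own angle (2*pi*h/n_heads)
+        # with the radius growing with the point index, i.e. the first forward
+        # pass samples a small ring of points around each reference point.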
+ + def forward(self, + query, + reference_points, + input_flatten, + input_spatial_shapes, + input_level_start_index, + input_padding_mask=None): + """ + :param query (N, Length_{query}, C) + :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area + or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes + :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) + :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] + :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements + + :return output (N, Length_{query}, C) + """ + + def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + N, Len_q, _ = query.shape + N, Len_in, _ = input_flatten.shape + assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1] + ).sum() == Len_in + + value = self.value_proj(input_flatten) + if input_padding_mask is not None: + value = masked_fill(value, input_padding_mask[..., None], float(0)) + + value = value.reshape([ + N, Len_in, self.n_heads, + int(self.ratio * self.d_model) // self.n_heads + ]) + sampling_offsets = self.sampling_offsets(query).reshape( + [N, Len_q, self.n_heads, self.n_levels, self.n_points, 2]) + attention_weights = self.attention_weights(query).reshape( + [N, Len_q, self.n_heads, self.n_levels * self.n_points]) + attention_weights = F.softmax(attention_weights, -1).\ + reshape([N, Len_q, self.n_heads, self.n_levels, self.n_points]) + + if reference_points.shape[-1] == 2: + offset_normalizer = paddle.stack( + [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], + -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + else: + raise ValueError( + 'Last dim of reference_points must be 2 or 4, but get {} instead.' 
+ .format(reference_points.shape[-1])) + output = msda.ms_deform_attn( + value, input_spatial_shapes, input_level_start_index, + sampling_locations, attention_weights, self.im2col_step) + output = self.output_proj(output) + return output + + class Extractor(nn.Layer): def __init__(self, dim, @@ -106,10 +248,12 @@ def __init__(self, super().__init__() self.query_norm = norm_layer(dim) self.feat_norm = norm_layer(dim) - ''' - self.attn = MSDeformAttn(d_model=dim, n_levels=n_levels, n_heads=num_heads, - n_points=n_points, ratio=deform_ratio) - ''' + self.attn = MSDeformAttn( + d_model=dim, + n_levels=n_levels, + n_heads=num_heads, + n_points=n_points, + ratio=deform_ratio) self.with_cffn = with_cffn if with_cffn: self.ffn = ConvFFN( @@ -123,12 +267,10 @@ def __init__(self, def forward(self, query, reference_points, feat, spatial_shapes, level_start_index, H, W): def _inner_forward(query, feat): - ''' - attn = self.attn(self.query_norm(query), reference_points, - self.feat_norm(feat), spatial_shapes, - level_start_index, None) + attn = self.attn( + self.query_norm(query), reference_points, + self.feat_norm(feat), spatial_shapes, level_start_index, None) query = query + attn - ''' if self.with_cffn: query = query + self.drop_path( @@ -153,10 +295,12 @@ def __init__(self, super().__init__() self.query_norm = norm_layer(dim) self.feat_norm = norm_layer(dim) - ''' - self.attn = MSDeformAttn(d_model=dim, n_levels=n_levels, n_heads=num_heads, - n_points=n_points, ratio=deform_ratio) - ''' + self.attn = MSDeformAttn( + d_model=dim, + n_levels=n_levels, + n_heads=num_heads, + n_points=n_points, + ratio=deform_ratio) self.gamma = self.create_parameter( shape=(dim, ), default_initializer=paddle.nn.initializer.Constant( @@ -165,13 +309,10 @@ def __init__(self, def forward(self, query, reference_points, feat, spatial_shapes, level_start_index): def _inner_forward(query, feat): - ''' - attn = self.attn(self.query_norm(query), reference_points, - self.feat_norm(feat), spatial_shapes, - level_start_index, None) + attn = self.attn( + self.query_norm(query), reference_points, + self.feat_norm(feat), spatial_shapes, level_start_index, None) return query + self.gamma * attn - ''' - return query query = _inner_forward(query, feat) diff --git a/paddleseg/models/upernet_vit_adapter.py b/paddleseg/models/upernet_vit_adapter.py index 117dbdf426..ca6336dd1a 100644 --- a/paddleseg/models/upernet_vit_adapter.py +++ b/paddleseg/models/upernet_vit_adapter.py @@ -162,6 +162,8 @@ class UPerNetHead(nn.Layer): This head is the implementation of `UPerNet `_. + This is heavily based on https://github.com/czczup/ViT-Adapter + Args: pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid Module applied on the last feature. Default: (1, 2, 3, 6). 
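A note for readers of PATCH 03: `msda.ms_deform_attn` is a separately compiled CUDA custom op. If you want to trace the numbers without building it, the sampling it performs can be reproduced in pure Paddle with `F.grid_sample`. The sketch below is a naive reference only, not the optimized kernel; the function name `ms_deform_attn_naive` and the list-typed `spatial_shapes` are illustrative assumptions, and the shapes follow the `MSDeformAttn.forward` docstring:

import paddle
import paddle.nn.functional as F


def ms_deform_attn_naive(value, spatial_shapes, sampling_locations,
                         attention_weights):
    # value:              (N, Len_in, n_heads, d_head)
    # spatial_shapes:     Python list of (H_l, W_l), sum(H_l * W_l) == Len_in
    # sampling_locations: (N, Len_q, n_heads, n_levels, n_points, 2), in [0, 1]
    # attention_weights:  (N, Len_q, n_heads, n_levels, n_points), softmaxed
    N, _, n_heads, d_head = value.shape
    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape
    value_list = paddle.split(value, [h * w for h, w in spatial_shapes], axis=1)
    grids = 2 * sampling_locations - 1  # grid_sample expects coords in [-1, 1]
    sampled = []
    for lvl, (h, w) in enumerate(spatial_shapes):
        # (N, H*W, n_heads, d_head) -> (N * n_heads, d_head, H, W)
        v = value_list[lvl].transpose([0, 2, 3, 1]).reshape(
            [N * n_heads, d_head, h, w])
        # (N, Len_q, n_heads, n_points, 2) -> (N * n_heads, Len_q, n_points, 2)
        g = grids[:, :, :, lvl].transpose([0, 2, 1, 3, 4]).reshape(
            [N * n_heads, Len_q, n_points, 2])
        # bilinear read of n_points locations per query:
        # output is (N * n_heads, d_head, Len_q, n_points)
        sampled.append(
            F.grid_sample(
                v, g, mode='bilinear', padding_mode='zeros',
                align_corners=False))
    sampled = paddle.concat(sampled, axis=-1)  # (..., Len_q, n_levels*n_points)
    attn = attention_weights.transpose([0, 2, 1, 3, 4]).reshape(
        [N * n_heads, 1, Len_q, n_levels * n_points])
    out = (sampled * attn).sum(-1)  # weighted sum over every sampled point
    return out.reshape([N, n_heads * d_head, Len_q]).transpose([0, 2, 1])

Comparing this against the custom op on small random tensors is a quick way to validate a local build before running the alignment checks above.
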
From fe4608673c797969a2213eaf1e8443819ae40208 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Tue, 15 Nov 2022 17:27:10 +0800 Subject: [PATCH 04/16] align ade20k inference 1115 --- ...upernet_deit_adapter_tiny_512_160k_ade20k.yml | 16 +++++++++++++--- paddleseg/core/val.py | 1 + paddleseg/models/backbones/vit_adapter.py | 7 ++----- paddleseg/models/upernet_vit_adapter.py | 11 +++++++---- 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml index b1d4ac74b3..fcdcd0f32c 100644 --- a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml +++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml @@ -1,6 +1,6 @@ _base_: '../_base_/ade20k.yml' -batch_size: 4 # total batch size is 16 +batch_size: 2 # total batch size is 16 iters: 160000 train_dataset: @@ -30,6 +30,11 @@ val_dataset: mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] +test_config: + is_slide: True + crop_size: [512, 512] + stride: [341, 341] + export: transforms: - type: Resize @@ -47,7 +52,7 @@ optimizer: lr_scheduler: type: PolynomialDecay - learning_rate: 0.0012 + learning_rate: 1.2e-4 end_lr: 0 power: 1.0 warmup_iters: 1500 @@ -56,7 +61,8 @@ lr_scheduler: loss: types: - type: CrossEntropyLoss - coef: [1] + - type: CrossEntropyLoss + coef: [1, 0.4] model: type: UPerNetViTAdapter @@ -75,5 +81,9 @@ model: deform_ratio: 1.0 interaction_indexes: [[0, 2], [3, 5], [6, 8], [9, 11]] backbone_indices: [0, 1, 2, 3] + channels: 512 + pool_scales: [1, 2, 3, 6] + dropout_ratio: 0.1 aux_loss: True + aux_channels: 256 pretrained: pretrained_model/upernet_deit_adapter_tiny_512_160_ade20k_from_torch.pdparams \ No newline at end of file diff --git a/paddleseg/core/val.py b/paddleseg/core/val.py index 80a820b6bc..958946ab04 100644 --- a/paddleseg/core/val.py +++ b/paddleseg/core/val.py @@ -209,6 +209,7 @@ def evaluate(model, if local_rank == 0 and print_detail: progbar_val.update(iter + 1, [('batch_cost', batch_cost), ('reader cost', reader_cost)]) + print(total_iters, iter + 1) reader_cost_averager.reset() batch_cost_averager.reset() batch_start = time.time() diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index c002467b61..c30c5e35bd 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -371,7 +371,7 @@ def _add_level_embed(self, c2, c3, c4): return c2, c3, c4 def forward(self, x): - debug = True + debug = False if debug: import random import numpy as np @@ -448,10 +448,7 @@ def forward(self, x): print(f2.cpu().numpy().mean()) print(f3.cpu().numpy().mean()) print(f4.cpu().numpy().mean()) - exit() # f1 = f1.cpu().numpy().mean() # with msdeformatt - #assert np.allclose(f1, -0.03254774, rtol=0.0, atol=1e-6) - # without msdeformatt - #assert np.allclose(f1, -0.024487903, rtol=0.0, atol=1e-6) + #assert np.allclose(f1, -0.03252137, rtol=0.0, atol=1e-6) return [f1, f2, f3, f4] diff --git a/paddleseg/models/upernet_vit_adapter.py b/paddleseg/models/upernet_vit_adapter.py index ca6336dd1a..76f649a3c1 100644 --- a/paddleseg/models/upernet_vit_adapter.py +++ b/paddleseg/models/upernet_vit_adapter.py @@ -50,6 +50,7 @@ def __init__(self, pool_scales=[1, 2, 3, 6], dropout_ratio=0.1, aux_loss=True, + aux_channels=256, align_corners=False, pretrained=None): super().__init__() @@ -65,6 +66,7 @@ def __init__(self, pool_scales=pool_scales, dropout_ratio=dropout_ratio, 
aux_loss=aux_loss, + aux_channels=aux_channels, align_corners=align_corners) self.pretrained = pretrained @@ -227,8 +229,7 @@ def psp_forward(self, inputs): def forward(self, inputs): """Forward function.""" - debug = True - + debug = False if debug: print('-------head 1----') for x in inputs: @@ -286,7 +287,7 @@ def forward(self, inputs): output = self.conv_seg(output) logits_list = [output] - if self.aux_loss: + if self.aux_loss and self.training: aux_output = self.aux_conv(inputs[2]) aux_output = self.aux_conv_seg(aux_output) logits_list.append(aux_output) @@ -295,5 +296,7 @@ def forward(self, inputs): print('-------head 5----') for x in logits_list: print(x.shape, x.numpy().mean()) + # -20.250404 -15.875856 exit() - return output + + return logits_list From 7193096d387d9cafc9a04e9e4d30270201f89be5 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Wed, 16 Nov 2022 20:27:00 +0800 Subject: [PATCH 05/16] 1116 0 --- .../upernet_deit_adapter_tiny_512_160k_ade20k.yml | 3 ++- paddleseg/models/backbones/vit_adapter.py | 14 -------------- paddleseg/models/layers/vit_adapter_layers.py | 10 +++++----- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml index fcdcd0f32c..299efd0899 100644 --- a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml +++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml @@ -80,10 +80,11 @@ model: cffn_ratio: 0.25 deform_ratio: 1.0 interaction_indexes: [[0, 2], [3, 5], [6, 8], [9, 11]] + pretrained: pretrained_model/deit_tiny_patch16_224-a1311bcf_from_torch.pdparams backbone_indices: [0, 1, 2, 3] channels: 512 pool_scales: [1, 2, 3, 6] dropout_ratio: 0.1 aux_loss: True aux_channels: 256 - pretrained: pretrained_model/upernet_deit_adapter_tiny_512_160_ade20k_from_torch.pdparams \ No newline at end of file + #pretrained: pretrained_model/upernet_deit_adapter_tiny_512_160_ade20k_from_torch.pdparams \ No newline at end of file diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index c30c5e35bd..9178b25363 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -256,20 +256,6 @@ def __init__(self, def init_weight(self): utils.load_pretrained_model(self, self.pretrained) - def forward_features(self, x): - x, H, W = self.patch_embed(x) - cls_token = self.cls_token.expand( - x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks - x = paddle.concat([cls_tokens, x], axis=1) - x = self.pos_drop(x + self.pos_embed) - for blk in self.blocks: - x = blk(x, H, W) - return x - - def forward(self, x): - x = self.forward_features(x) - return x - @manager.BACKBONES.add_component class ViTAdapter(VisionTransformer): diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py index 3f596bf7e9..4848343027 100644 --- a/paddleseg/models/layers/vit_adapter_layers.py +++ b/paddleseg/models/layers/vit_adapter_layers.py @@ -156,11 +156,11 @@ def _reset_parameters(self): for i in range(self.n_points): grid_init[:, :, i, :] *= i + 1 - with paddle.no_grad(): - grid_init = grid_init.reshape([-1]) - self.sampling_offsets.bias = self.create_parameter( - shape=grid_init.shape, - default_initializer=paddle.nn.initializer.Assign(grid_init)) + grid_init = grid_init.reshape([-1]) + self.sampling_offsets.bias = self.create_parameter( + shape=grid_init.shape, + 
                default_initializer=paddle.nn.initializer.Assign(grid_init))
+        self.sampling_offsets.bias.stop_gradient = True

         constant_init(self.attention_weights.weight, value=0.)
         constant_init(self.attention_weights.bias, value=0.)

From 8263d01871b20d2da5709d8fd2869dd4cd36baf8 Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Tue, 22 Nov 2022 13:22:09 +0800
Subject: [PATCH 06/16] align ce loss by adding avg_non_ignore

---
 .../upernet_deit_adapter_tiny_512_160k_ade20k.yml |  2 +-
 paddleseg/models/backbones/vit_adapter.py         |  4 +++-
 paddleseg/models/losses/cross_entropy_loss.py     | 11 ++++++++---
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
index 299efd0899..332c2a39f0 100644
--- a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
+++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
@@ -61,7 +61,7 @@ lr_scheduler:
 loss:
   types:
     - type: CrossEntropyLoss
-    - type: CrossEntropyLoss
+      avg_non_ignore: False
   coef: [1, 0.4]

 model:
diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py
index 9178b25363..a993e742ac 100644
--- a/paddleseg/models/backbones/vit_adapter.py
+++ b/paddleseg/models/backbones/vit_adapter.py
@@ -358,7 +358,7 @@ def _add_level_embed(self, c2, c3, c4):

     def forward(self, x):
         debug = False
-        if debug:
+        if False:
             import random
             import numpy as np
             random.seed(0)
@@ -380,6 +380,8 @@ def forward(self, x):
                 print(i.numpy().mean())
             for i in deform_inputs2:
                 print(i.numpy().mean())
+            print(x.numpy().mean())
+            print(c.numpy().mean())

         # Patch Embedding forward
         x, H, W = self.patch_embed(x)
diff --git a/paddleseg/models/losses/cross_entropy_loss.py b/paddleseg/models/losses/cross_entropy_loss.py
index c934a0a5b4..b1cfb3a624 100644
--- a/paddleseg/models/losses/cross_entropy_loss.py
+++ b/paddleseg/models/losses/cross_entropy_loss.py
@@ -33,6 +33,7 @@ class CrossEntropyLoss(nn.Layer):
         top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0].
             When its value < 1.0, only compute the loss for the top k percent pixels
             (e.g., the top 20% pixels). This is useful for hard pixel mining. Default ``1.0``.
+        avg_non_ignore (bool, optional): Whether the loss is averaged only over the non-ignored pixels. Default: True.
         data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'. Default ``'NCHW'``.
     """

@@ -40,10 +41,12 @@ def __init__(self,
                  weight=None,
                  ignore_index=255,
                  top_k_percent_pixels=1.0,
+                 avg_non_ignore=True,
                  data_format='NCHW'):
         super(CrossEntropyLoss, self).__init__()
         self.ignore_index = ignore_index
         self.top_k_percent_pixels = top_k_percent_pixels
+        self.avg_non_ignore = avg_non_ignore
         self.EPS = 1e-8
         self.data_format = data_format
         if weight is not None:
@@ -107,10 +110,12 @@ def _post_process_loss(self, logit, label, semantic_weights, loss):
         Returns:
             (Tensor): The average loss.
         """
-        mask = label != self.ignore_index
-        mask = paddle.cast(mask, 'float32')
-        label.stop_gradient = True
+        if self.avg_non_ignore:
+            mask = paddle.cast(label != self.ignore_index, dtype='float32')
+        else:
+            mask = paddle.ones(label.shape, dtype='float32')
         mask.stop_gradient = True
+        label.stop_gradient = True
         if loss.ndim > mask.ndim:
             loss = paddle.squeeze(loss, axis=-1)

From d011c2bb942c718c28119429948151c499baef3b Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Tue, 22 Nov 2022 16:07:46 +0800
Subject: [PATCH 07/16] change yml for real train

---
 ...rnet_deit_adapter_tiny_512_160k_ade20k.yml |  2 +-
 paddleseg/core/train.py                       | 26 +++++++++++++++++++
 paddleseg/models/backbones/vit_adapter.py     | 11 --------
 paddleseg/models/upernet_vit_adapter.py       |  1 -
 4 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
index 332c2a39f0..b2a15c2004 100644
--- a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
+++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
@@ -52,7 +52,7 @@ optimizer:

 lr_scheduler:
   type: PolynomialDecay
-  learning_rate: 1.2e-4
+  learning_rate: 6.0e-5 # this config previously used 1.2e-4, but the lr actually used in vit_adapter is 6e-5
   end_lr: 0
   power: 1.0
   warmup_iters: 1500
diff --git a/paddleseg/core/train.py b/paddleseg/core/train.py
index 0da04b01d4..01cd1cc054 100644
--- a/paddleseg/core/train.py
+++ b/paddleseg/core/train.py
@@ -173,6 +173,18 @@ def train(model,
             reader_cost_averager.record(time.time() - batch_start)
             images = data['img']
             labels = data['label'].astype('int64')
+
+            debug = False  # to debug, set this to True, change the yml to load pretrained weights, and set dropout to 0
+            if debug:
+                import numpy as np
+                images = paddle.to_tensor(np.load('img.npy'))
+                labels = paddle.to_tensor(np.load(
+                    'gt_semantic_seg.npy')).squeeze()
+                '''
+                print('img', images.detach().cpu().numpy().mean())
+                print('gt_semantic_seg', labels.detach().cpu().numpy().mean())
+                '''
+
             edges = None
             if 'edge' in data.keys():
                 edges = data['edge'].astype('int64')
@@ -211,6 +223,20 @@ def train(model,
                         losses=losses)
                 loss = sum(loss_list)
                 loss.backward()
+
+                if debug:
+                    print(loss_list)
+                    '''
+                    loss = sum(loss_list) * 1e3
+                    loss.backward()
+
+                    print(loss)
+                    for name, tensor in model.named_parameters():
+                        if tensor.grad is not None:
+                            print(name, tensor.grad.numpy().mean())
+                    exit()
+                    '''
+
                 # if the optimizer is ReduceOnPlateau, the loss is the one which has been passed into step.
if isinstance(optimizer, paddle.optimizer.lr.ReduceOnPlateau): optimizer.step(loss) diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index a993e742ac..a9824884d5 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -358,14 +358,6 @@ def _add_level_embed(self, c2, c3, c4): def forward(self, x): debug = False - if False: - import random - import numpy as np - random.seed(0) - np.random.seed(0) - x = np.random.rand(1, 3, 512, 512).astype("float32") - x = paddle.to_tensor(x, dtype='float32') - print('x0:', x.numpy().mean()) deform_inputs1, deform_inputs2 = deform_inputs(x) @@ -436,7 +428,4 @@ def forward(self, x): print(f2.cpu().numpy().mean()) print(f3.cpu().numpy().mean()) print(f4.cpu().numpy().mean()) - # f1 = f1.cpu().numpy().mean() - # with msdeformatt - #assert np.allclose(f1, -0.03252137, rtol=0.0, atol=1e-6) return [f1, f2, f3, f4] diff --git a/paddleseg/models/upernet_vit_adapter.py b/paddleseg/models/upernet_vit_adapter.py index 76f649a3c1..7e56eb71f4 100644 --- a/paddleseg/models/upernet_vit_adapter.py +++ b/paddleseg/models/upernet_vit_adapter.py @@ -296,7 +296,6 @@ def forward(self, inputs): print('-------head 5----') for x in logits_list: print(x.shape, x.numpy().mean()) - # -20.250404 -15.875856 exit() return logits_list From ca688f2cebd04a151f9f1363cda292068909d54d Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Fri, 25 Nov 2022 11:14:45 +0800 Subject: [PATCH 08/16] refine for merge --- .../upernet_deit_adapter_tiny_512_160k_ade20k.yml | 8 +++----- paddleseg/models/backbones/vit_adapter.py | 2 +- paddleseg/models/layers/vit_adapter_layers.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml index b2a15c2004..ab0f257346 100644 --- a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml +++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml @@ -1,6 +1,6 @@ _base_: '../_base_/ade20k.yml' -batch_size: 2 # total batch size is 16 +batch_size: 4 # total batch size is 16 iters: 160000 train_dataset: @@ -8,7 +8,6 @@ train_dataset: - type: ResizeStepScaling min_scale_factor: 0.5 max_scale_factor: 2.0 - scale_step_size: 0.25 - type: RandomPaddingCrop crop_size: [512, 512] - type: RandomHorizontalFlip @@ -52,7 +51,7 @@ optimizer: lr_scheduler: type: PolynomialDecay - learning_rate: 6.0e-5 # the origin lr is 1.2e-6, but the real used lr in vit_adapter is 6e-5 + learning_rate: 6.0e-5 end_lr: 0 power: 1.0 warmup_iters: 1500 @@ -86,5 +85,4 @@ model: pool_scales: [1, 2, 3, 6] dropout_ratio: 0.1 aux_loss: True - aux_channels: 256 - #pretrained: pretrained_model/upernet_deit_adapter_tiny_512_160_ade20k_from_torch.pdparams \ No newline at end of file + aux_channels: 256 \ No newline at end of file diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index a9824884d5..48e4e374a6 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -1,4 +1,4 @@ -# This is heavily based on https://github.com/czczup/ViT-Adapter +# This file is heavily based on https://github.com/czczup/ViT-Adapter import math from functools import partial diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py index 4848343027..eaafc5c1d2 100644 --- 
a/paddleseg/models/layers/vit_adapter_layers.py
+++ b/paddleseg/models/layers/vit_adapter_layers.py
@@ -1,4 +1,4 @@
-# This is heavily based on https://github.com/czczup/ViT-Adapter
+# This file is heavily based on https://github.com/czczup/ViT-Adapter

 import math
 import warnings

From 466493bbe6e3ccc5188b8214f56f1594850f4b60 Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Fri, 25 Nov 2022 11:30:40 +0800
Subject: [PATCH 09/16] refine for merge 1

---
 paddleseg/core/train.py                       | 24 -------------------
 paddleseg/core/val.py                         |  1 -
 paddleseg/models/backbones/vit_adapter.py     |  7 +-----
 paddleseg/models/layers/vit_adapter_layers.py |  8 ++++++-
 4 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/paddleseg/core/train.py b/paddleseg/core/train.py
index 01cd1cc054..635edf9ffa 100644
--- a/paddleseg/core/train.py
+++ b/paddleseg/core/train.py
@@ -174,17 +174,6 @@ def train(model,
             images = data['img']
             labels = data['label'].astype('int64')

-            debug = False  # to debug, set this to True, change the yml to load pretrained weights, and set dropout to 0
-            if debug:
-                import numpy as np
-                images = paddle.to_tensor(np.load('img.npy'))
-                labels = paddle.to_tensor(np.load(
-                    'gt_semantic_seg.npy')).squeeze()
-                '''
-                print('img', images.detach().cpu().numpy().mean())
-                print('gt_semantic_seg', labels.detach().cpu().numpy().mean())
-                '''
-
             edges = None
             if 'edge' in data.keys():
                 edges = data['edge'].astype('int64')
@@ -224,19 +213,6 @@ def train(model,
                 loss = sum(loss_list)
                 loss.backward()

-                if debug:
-                    print(loss_list)
-                    '''
-                    loss = sum(loss_list) * 1e3
-                    loss.backward()
-
-                    print(loss)
-                    for name, tensor in model.named_parameters():
-                        if tensor.grad is not None:
-                            print(name, tensor.grad.numpy().mean())
-                    exit()
-                    '''
-
                 # if the optimizer is ReduceOnPlateau, the loss is the one which has been passed into step.
                 if isinstance(optimizer, paddle.optimizer.lr.ReduceOnPlateau):
                     optimizer.step(loss)
diff --git a/paddleseg/core/val.py b/paddleseg/core/val.py
index 958946ab04..80a820b6bc 100644
--- a/paddleseg/core/val.py
+++ b/paddleseg/core/val.py
@@ -209,7 +209,6 @@ def evaluate(model,
             if local_rank == 0 and print_detail:
                 progbar_val.update(iter + 1, [('batch_cost', batch_cost),
                                               ('reader cost', reader_cost)])
-            print(total_iters, iter + 1)
             reader_cost_averager.reset()
             batch_cost_averager.reset()
             batch_start = time.time()
diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py
index 48e4e374a6..333b4afe35 100644
--- a/paddleseg/models/backbones/vit_adapter.py
+++ b/paddleseg/models/backbones/vit_adapter.py
@@ -8,16 +8,11 @@
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
-from paddle import ParamAttr
-from paddle.regularizer import L2Decay
-from paddle.nn.initializer import Uniform, KaimingNormal
-from paddle.nn import Conv2D, BatchNorm, AdaptiveAvgPool2D, Linear

 from paddleseg.cvlibs import manager
-from paddleseg.cvlibs.param_init import normal_init, trunc_normal_init, constant_init
 from paddleseg.utils import utils, logger
+from paddleseg.cvlibs.param_init import normal_init, trunc_normal_init, constant_init
 from paddleseg.models.backbones.transformer_utils import to_2tuple, DropPath
-
 from paddleseg.models.layers.vit_adapter_layers import (
     SpatialPriorModule, InteractionBlock, deform_inputs, MSDeformAttn)

diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py
index eaafc5c1d2..37dcb2d3e3 100644
--- a/paddleseg/models/layers/vit_adapter_layers.py
+++ b/paddleseg/models/layers/vit_adapter_layers.py
@@ -10,7 +10,13 @@
 from paddleseg.models.backbones.transformer_utils import DropPath
 from paddleseg.cvlibs.param_init import constant_init, xavier_uniform

-import ms_deform_attn as msda  # first install ms_deform_attn
+try:
+    import ms_deform_attn as msda
+except:
+    print(
+        "Import ms_deform_attn failed. Please refer to the following doc to install the ms_deform_attn lib: "
+        "https://github.com/PaddlePaddle/PaddleSeg/tree/develop/configs/upernet_vit_adapter"
+    )


 def get_reference_points(spatial_shapes):

From cc7aa0aec3eb91815dfc4d6b858dc130973dc461 Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Fri, 25 Nov 2022 12:56:18 +0800
Subject: [PATCH 10/16] refine for merge 2

---
 configs/vit_adapter/README.md                 | 10 +--
 ..._vit_adapter_tiny_ade20k_512x512_160k.yml} |  0
 paddleseg/core/train.py                       |  2 -
 paddleseg/models/upernet_vit_adapter.py       | 82 +++++++------------
 4 files changed, 32 insertions(+), 62 deletions(-)
 rename configs/vit_adapter/{upernet_deit_adapter_tiny_512_160k_ade20k.yml => upernet_vit_adapter_tiny_ade20k_512x512_160k.yml} (100%)

diff --git a/configs/vit_adapter/README.md b/configs/vit_adapter/README.md
index d7a15b0ca3..d364c3c8dd 100644
--- a/configs/vit_adapter/README.md
+++ b/configs/vit_adapter/README.md
@@ -1,15 +1,13 @@
-# Semantic Flow for Fast and Accurate Scene Parsing
+# Vision Transformer Adapter for Dense Predictions

 ## Reference

-> Xiangtai Li, Ansheng You, Zhen Zhu, Houlong Zhao, Maoke Yang, Kuiyuan Yang, Shaohua Tan, Yunhai Tong:
-Semantic Flow for Fast and Accurate Scene Parsing. ECCV (1) 2020: 775-793 .
+> Chen, Zhe, Yuchen Duan, Wenhai Wang, Junjun He, Tong Lu, Jifeng Dai, and Yu Qiao. "Vision Transformer Adapter for Dense Predictions." arXiv preprint arXiv:2205.08534 (2022).
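The ADE20K result reported under Performance below is evaluated with the sliding-window test_config added in patch 04 (crop_size [512, 512], stride [341, 341]). As a rough illustration of how such a window grid tiles a resized validation image — `slide_windows` is a hypothetical helper written for this note, not PaddleSeg API:

```python
import math

def slide_windows(h, w, crop=512, stride=341):
    """Yield (y0, x0, y1, x1) crops covering an h x w image; the last
    row/column of windows snaps to the image border, as slide inference does."""
    rows = max(math.ceil((h - crop) / stride) + 1, 1)
    cols = max(math.ceil((w - crop) / stride) + 1, 1)
    for r in range(rows):
        for c in range(cols):
            y0 = min(r * stride, max(h - crop, 0))
            x0 = min(c * stride, max(w - crop, 0))
            yield y0, x0, min(y0 + crop, h), min(x0 + crop, w)

# A 512x683 image (short side resized to 512) is covered by two windows:
print(list(slide_windows(512, 683)))  # [(0, 0, 512, 512), (0, 171, 512, 683)]
```

With a 512-pixel crop and a 341-pixel stride, adjacent windows overlap by 171 pixels, so logits can be averaged in the overlap instead of leaving seams at window borders.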
## Performance -### Cityscapes +### ADE20K | Model | Backbone | Resolution | Training Iters | mIoU | mIoU (flip) | mIoU (ms+flip) | Links | |-|-|-|-|-|-|-|-| -|SFNet|ResNet18_OS8|1024x1024|80000|78.72%|79.11%|79.28%|[model](https://bj.bcebos.com/paddleseg/dygraph/cityscapes/sfnet_resnet18_os8_cityscapes_1024x1024_80k/model.pdparams) \| [log](https://bj.bcebos.com/paddleseg/dygraph/cityscapes/sfnet_resnet18_os8_cityscapes_1024x1024_80k/train.log) \| [vdl](https://www.paddlepaddle.org.cn/paddle/visualdl/service/app/scalar?id=0d790ad96282048b136342fcebb08d14)| -|SFNet|ResNet50_OS8|1024x1024|80000|81.49%|81.63%|81.85%|[model](https://bj.bcebos.com/paddleseg/dygraph/cityscapes/sfnet_resnet50_os8_cityscapes_1024x1024_80k/model.pdparams) \| [log](https://bj.bcebos.com/paddleseg/dygraph/cityscapes/sfnet_resnet50_os8_cityscapes_1024x1024_80k/train.log) \| [vdl](https://paddlepaddle.org.cn/paddle/visualdl/service/app?id=d458349ec63ea8ccd6fae84afa8ea981)| +|UPerNetViTAdapter|ViT-Adapter-Tiny|512x512|160000|%|%|%|[model]() \| [log]() \| [vdl]()| diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml similarity index 100% rename from configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml rename to configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml diff --git a/paddleseg/core/train.py b/paddleseg/core/train.py index 635edf9ffa..0da04b01d4 100644 --- a/paddleseg/core/train.py +++ b/paddleseg/core/train.py @@ -173,7 +173,6 @@ def train(model, reader_cost_averager.record(time.time() - batch_start) images = data['img'] labels = data['label'].astype('int64') - edges = None if 'edge' in data.keys(): edges = data['edge'].astype('int64') @@ -212,7 +211,6 @@ def train(model, losses=losses) loss = sum(loss_list) loss.backward() - # if the optimizer is ReduceOnPlateau, the loss is the one which has been pass into step. if isinstance(optimizer, paddle.optimizer.lr.ReduceOnPlateau): optimizer.step(loss) diff --git a/paddleseg/models/upernet_vit_adapter.py b/paddleseg/models/upernet_vit_adapter.py index 7e56eb71f4..b158beef9e 100644 --- a/paddleseg/models/upernet_vit_adapter.py +++ b/paddleseg/models/upernet_vit_adapter.py @@ -24,21 +24,24 @@ @manager.MODELS.add_component class UPerNetViTAdapter(nn.Layer): """ - The UPerNet implementation based on PaddlePaddle. + The UPerNetViTAdapter implementation based on PaddlePaddle. The original article refers to - Tete Xiao, et, al. "Unified Perceptual Parsing for Scene Understanding" - (https://arxiv.org/abs/1807.10221). + Chen, Zhe, Yuchen Duan, Wenhai Wang, Junjun He, Tong Lu, Jifeng Dai, and Yu Qiao. + "Vision Transformer Adapter for Dense Predictions." + (https://arxiv.org/abs/2205.08534). Args: num_classes (int): The unique number of target classes. - backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101. - backbone_indices (tuple): Four values in the tuple indicate the indices of output of backbone. - channels (int): The channels of inter layers. Default: 512. - aux_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: False. + backbone (nn.Layer): The backbone network. + backbone_indices (tuple | list): The values indicate the indices of output of backbone. + channels (int, optional): The channels of inter layers in upernet head. Default: 512. + pool_scales (list, optional): The scales in PPM. Default: [1, 2, 3, 6]. + dropout_ratio (float, optional): The dropout ratio for upernet head. 
Default: 0.1. + aux_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + aux_channels (int, optional): The channels of inter layers in auxiliary head. Default: 256. align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. - dropout_ratio (float): Dropout ratio for upernet head. Default: 0.1. pretrained (str, optional): The path or url of pretrained model. Default: None. """ @@ -72,6 +75,10 @@ def __init__(self, self.pretrained = pretrained self.init_weight() + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + def forward(self, x): feats = self.backbone(x) feats = [feats[i] for i in self.backbone_indices] @@ -85,10 +92,6 @@ def forward(self, x): ] return logit_list - def init_weight(self): - if self.pretrained is not None: - utils.load_entire_model(self, self.pretrained) - class ConvBNReLU(nn.Layer): def __init__(self, @@ -118,12 +121,9 @@ class PPM(nn.Layer): """Pooling Pyramid Module used in PSPNet. Args: - pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid - Module. + pool_scales (tuple | list): Pooling scales used in PPM. in_channels (int): Input channels. - channels (int): Channels after modules, before conv_seg. - conv_cfg (dict|None): Config of conv layers. - norm_cfg (dict|None): Config of norm layers. + channels (int): Output Channels after modules, before conv_seg. act_cfg (dict): Config of activation layers. align_corners (bool): align_corners argument of F.interpolate. """ @@ -145,7 +145,6 @@ def __init__(self, pool_scales, in_channels, channels, align_corners): kernel_size=1))) def forward(self, x): - """Forward function.""" ppm_outs = [] for ppm in self.stages: ppm_out = ppm(x) @@ -159,16 +158,20 @@ def forward(self, x): class UPerNetHead(nn.Layer): - """Unified Perceptual Parsing for Scene Understanding. - - This head is the implementation of `UPerNet - `_. - + """ + This head is the implementation of "Unified Perceptual Parsing for Scene Understanding". This is heavily based on https://github.com/czczup/ViT-Adapter Args: - pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid - Module applied on the last feature. Default: (1, 2, 3, 6). + num_classes (int): The unique number of target classes. + in_channels (list[int]): The channels of input features. + channels (int, optional): The channels of inter layers in upernet head. Default: 512. + pool_scales (list, optional): The scales in PPM. Default: [1, 2, 3, 6]. + dropout_ratio (float, optional): The dropout ratio for upernet head. Default: 0.1. + aux_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + aux_channels (int, optional): The channels of inter layers in auxiliary head. Default: 256. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. 
""" def __init__(self, @@ -204,7 +207,6 @@ def __init__(self, self.fpn_bottleneck = ConvBNReLU( len(in_channels) * channels, channels, 3, padding=1) - if dropout_ratio > 0: self.dropout = nn.Dropout2D(dropout_ratio) else: @@ -219,7 +221,6 @@ def __init__(self, aux_channels, num_classes, kernel_size=1) def psp_forward(self, inputs): - """Forward function of PSP module.""" x = inputs[-1] psp_outs = [x] psp_outs.extend(self.psp_modules(x)) @@ -228,13 +229,6 @@ def psp_forward(self, inputs): return output def forward(self, inputs): - """Forward function.""" - debug = False - if debug: - print('-------head 1----') - for x in inputs: - print(x.shape, x.numpy().mean()) - # build laterals laterals = [ lateral_conv(inputs[i]) @@ -242,11 +236,6 @@ def forward(self, inputs): ] laterals.append(self.psp_forward(inputs)) - if debug: - print('-------head 2----') - for x in laterals: - print(x.shape, x.numpy().mean()) - # build top-down path used_backbone_levels = len(laterals) for i in range(used_backbone_levels - 1, 0, -1): @@ -264,11 +253,6 @@ def forward(self, inputs): ] fpn_outs.append(laterals[-1]) # append psp feature - if debug: - print('-------head 3----') - for x in fpn_outs: - print(x.shape, x.numpy().mean()) - for i in range(used_backbone_levels - 1, 0, -1): fpn_outs[i] = F.interpolate( fpn_outs[i], @@ -278,10 +262,6 @@ def forward(self, inputs): fpn_outs = paddle.concat(fpn_outs, axis=1) output = self.fpn_bottleneck(fpn_outs) - if debug: - print('-------head 4----') - print(output.shape, output.numpy().mean()) - if self.dropout is not None: output = self.dropout(output) output = self.conv_seg(output) @@ -292,10 +272,4 @@ def forward(self, inputs): aux_output = self.aux_conv_seg(aux_output) logits_list.append(aux_output) - if debug: - print('-------head 5----') - for x in logits_list: - print(x.shape, x.numpy().mean()) - exit() - return logits_list From 50ea8d125d944102d72c1318331e58407e0e6f10 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Fri, 25 Nov 2022 14:58:43 +0800 Subject: [PATCH 11/16] refine for merge 3 --- ...t_vit_adapter_tiny_ade20k_512x512_160k.yml | 16 +- paddleseg/models/backbones/vit_adapter.py | 45 ++- .../models/layers/ms_deformable_attention.py | 159 ++++++++++ paddleseg/models/layers/vit_adapter_layers.py | 278 ++++-------------- paddleseg/models/upernet_vit_adapter.py | 2 + 5 files changed, 247 insertions(+), 253 deletions(-) create mode 100644 paddleseg/models/layers/ms_deformable_attention.py diff --git a/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml b/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml index ab0f257346..1ebcab40b3 100644 --- a/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml +++ b/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml @@ -66,20 +66,8 @@ loss: model: type: UPerNetViTAdapter backbone: - type: ViTAdapter - num_heads: 3 - patch_size: 16 - embed_dim: 192 - depth: 12 - mlp_ratio: 4 - drop_path_rate: 0.1 - conv_inplane: 64 - n_points: 4 - deform_num_heads: 6 - cffn_ratio: 0.25 - deform_ratio: 1.0 - interaction_indexes: [[0, 2], [3, 5], [6, 8], [9, 11]] - pretrained: pretrained_model/deit_tiny_patch16_224-a1311bcf_from_torch.pdparams + type: ViTAdapter_Tiny + pretrained: https://paddleseg.bj.bcebos.com/dygraph/backbone/deit_tiny_patch16_224.zip backbone_indices: [0, 1, 2, 3] channels: 512 pool_scales: [1, 2, 3, 6] diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index 333b4afe35..649e89c9d5 100644 
--- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -16,7 +16,7 @@ from paddleseg.models.layers.vit_adapter_layers import ( SpatialPriorModule, InteractionBlock, deform_inputs, MSDeformAttn) -__all__ = ['ViTAdapter'] +__all__ = ['ViTAdapter', 'ViTAdapter_Tiny'] class PatchEmbed(nn.Layer): @@ -352,8 +352,6 @@ def _add_level_embed(self, c2, c3, c4): return c2, c3, c4 def forward(self, x): - debug = False - deform_inputs1, deform_inputs2 = deform_inputs(x) # SPM forward @@ -361,25 +359,12 @@ def forward(self, x): c2, c3, c4 = self._add_level_embed(c2, c3, c4) c = paddle.concat([c2, c3, c4], axis=1) - if debug: - print('----2----') - for i in deform_inputs1: - print(i.numpy().mean()) - for i in deform_inputs2: - print(i.numpy().mean()) - print(x.numpy().mean()) - print(c.numpy().mean()) - # Patch Embedding forward x, H, W = self.patch_embed(x) bs, n, dim = x.shape pos_embed = self._get_pos_embed(self.pos_embed[:, 1:], H, W) x = self.pos_drop(x + pos_embed) - if debug: - print('-------3----') - print(x.numpy().mean()) - # Interaction outs = list() for i, layer in enumerate(self.interactions): @@ -387,10 +372,6 @@ def forward(self, x): x, c = layer(x, c, self.blocks[indexes[0]:indexes[-1] + 1], deform_inputs1, deform_inputs2, H, W) outs.append(x.transpose([0, 2, 1]).reshape([bs, dim, H, W])) - if debug: - print('-----4-{}------'.format(i)) - print(x.numpy().mean()) - print(c.numpy().mean()) # Split & Reshape c2 = c[:, 0:c2.shape[1], :] @@ -417,10 +398,22 @@ def forward(self, x): f2 = self.norm2(c2) f3 = self.norm3(c3) f4 = self.norm4(c4) - if debug: - print('-----5------') - print(f1.cpu().numpy().mean()) - print(f2.cpu().numpy().mean()) - print(f3.cpu().numpy().mean()) - print(f4.cpu().numpy().mean()) return [f1, f2, f3, f4] + + +@manager.BACKBONES.add_component +def ViTAdapter_Tiny(**kwargs): + return ViTAdapter( + num_heads=3, + patch_size=16, + embed_dim=192, + depth=12, + mlp_ratio=4, + drop_path_rate=0.1, + conv_inplane=64, + n_points=4, + deform_num_heads=6, + cffn_ratio=0.25, + deform_ratio=1.0, + interaction_indexes=[[0, 2], [3, 5], [6, 8], [9, 11]], + **kwargs) \ No newline at end of file diff --git a/paddleseg/models/layers/ms_deformable_attention.py b/paddleseg/models/layers/ms_deformable_attention.py new file mode 100644 index 0000000000..37e18c22ae --- /dev/null +++ b/paddleseg/models/layers/ms_deformable_attention.py @@ -0,0 +1,159 @@ +# This file is heavily based on https://github.com/czczup/ViT-Adapter +import math +import warnings + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import param_init +from paddleseg.cvlibs.param_init import constant_init, xavier_uniform + +try: + import ms_deform_attn as msda +except: + print( + "Import ms_deform_attn failed. Please first refer to the following document to install " + "ms_deform_attn lib, and then use multi-scale deformable attention module: " + "https://github.com/PaddlePaddle/PaddleSeg/tree/develop/configs/upernet_vit_adapter" + ) + + +class MSDeformAttn(nn.Layer): + def __init__(self, + d_model=256, + n_levels=4, + n_heads=8, + n_points=4, + ratio=1.0): + """Multi-Scale Deformable Attention Module. + + Args: + d_model(int, optional): The hidden dimension. Default: 256 + n_levels(int, optional): The number of feature levels. Default: 4 + n_heads(int, optional): The number of attention heads. Default: 8 + n_points(int, optional): The number of sampling points per attention head per feature level. 
Default: 4 + ratio (float, optional): The ratio of channels for Linear. Default: 1.0 + """ + super().__init__() + if d_model % n_heads != 0: + raise ValueError('d_model must be divisible by n_heads, ' + 'but got {} and {}'.format(d_model, n_heads)) + _d_per_head = d_model // n_heads + # you'd better set _d_per_head to a power of 2 + # which is more efficient in our CUDA implementation + if not self._is_power_of_2(_d_per_head): + warnings.warn("You'd better set d_model in MSDeformAttn to make " + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = 64 + self.d_model = d_model + self.n_levels = n_levels + self.n_heads = n_heads + self.n_points = n_points + self.ratio = ratio + + self.sampling_offsets = nn.Linear(d_model, + n_heads * n_levels * n_points * 2) + self.attention_weights = nn.Linear(d_model, + n_heads * n_levels * n_points) + self.value_proj = nn.Linear(d_model, int(d_model * ratio)) + self.output_proj = nn.Linear(int(d_model * ratio), d_model) + + self._reset_parameters() + + @staticmethod + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError('invalid input for _is_power_of_2: {} (type: {})'. + format(n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + def _reset_parameters(self): + constant_init(self.sampling_offsets.weight, value=0.) + thetas = paddle.arange( + self.n_heads, dtype='float32') * (2.0 * math.pi / self.n_heads) + grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / grid_init.abs().max( + -1, keepdim=True)[0]).reshape([self.n_heads, 1, 1, 2]).tile( + [1, self.n_levels, self.n_points, 1]) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + + grid_init = grid_init.reshape([-1]) + self.sampling_offsets.bias = self.create_parameter( + shape=grid_init.shape, + default_initializer=paddle.nn.initializer.Assign(grid_init)) + self.sampling_offsets.bias.stop_gradient = True + + constant_init(self.attention_weights.weight, value=0.) + constant_init(self.attention_weights.bias, value=0.) + xavier_uniform(self.value_proj.weight) + constant_init(self.value_proj.bias, value=0.) + xavier_uniform(self.output_proj.weight) + constant_init(self.output_proj.bias, value=0.) 
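The direction grid built in `_reset_parameters` above is easier to see numerically. A standalone numpy sketch of the same sampling-offsets bias, using the defaults from this file (illustrative only, not part of the patch):

```python
import numpy as np

n_heads, n_levels, n_points = 8, 4, 4
thetas = np.arange(n_heads, dtype=np.float32) * (2.0 * np.pi / n_heads)
grid = np.stack([np.cos(thetas), np.sin(thetas)], -1)  # one unit direction per head
grid /= np.abs(grid).max(-1, keepdims=True)            # scale so the max component is +-1
grid = np.tile(grid.reshape(n_heads, 1, 1, 2), (1, n_levels, n_points, 1))
for i in range(n_points):
    grid[:, :, i, :] *= i + 1                          # points step outward along the ray

print(grid.shape)  # (8, 4, 4, 2): a 2D offset per head / level / point
print(grid[0, 0])  # head 0 starts at [1, 0], [2, 0], [3, 0], [4, 0]
```

Each head therefore begins by sampling along its own compass direction, with the points stepping 1x to 4x outward; training then learns offsets on top of this prior.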
+ + def forward(self, + query, + reference_points, + input_flatten, + input_spatial_shapes, + input_level_start_index, + input_padding_mask=None): + """ + Args: + query: (N, Length_{query}, C) + reference_points: (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area + or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes + input_flatten: (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) + input_spatial_shapes: (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + input_level_start_index: (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] + input_padding_mask: (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements + + Returns: + output (N, Length_{query}, C) + """ + + def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + N, Len_q, _ = query.shape + N, Len_in, _ = input_flatten.shape + assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1] + ).sum() == Len_in + + value = self.value_proj(input_flatten) + if input_padding_mask is not None: + value = masked_fill(value, input_padding_mask[..., None], float(0)) + + value = value.reshape([ + N, Len_in, self.n_heads, + int(self.ratio * self.d_model) // self.n_heads + ]) + sampling_offsets = self.sampling_offsets(query).reshape( + [N, Len_q, self.n_heads, self.n_levels, self.n_points, 2]) + attention_weights = self.attention_weights(query).reshape( + [N, Len_q, self.n_heads, self.n_levels * self.n_points]) + attention_weights = F.softmax(attention_weights, -1).\ + reshape([N, Len_q, self.n_heads, self.n_levels, self.n_points]) + + if reference_points.shape[-1] == 2: + offset_normalizer = paddle.stack( + [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], + -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + else: + raise ValueError( + 'Last dim of reference_points must be 2 or 4, but get {} instead.' 
+                .format(reference_points.shape[-1]))
+        output = msda.ms_deform_attn(
+            value, input_spatial_shapes, input_level_start_index,
+            sampling_locations, attention_weights, self.im2col_step)
+        output = self.output_proj(output)
+        return output
diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py
index 37dcb2d3e3..fef897bfa6 100644
--- a/paddleseg/models/layers/vit_adapter_layers.py
+++ b/paddleseg/models/layers/vit_adapter_layers.py
@@ -8,15 +8,7 @@
 import paddle.nn as nn
 import paddle.nn.functional as F
 from paddleseg.models.backbones.transformer_utils import DropPath
-from paddleseg.cvlibs.param_init import constant_init, xavier_uniform
-
-try:
-    import ms_deform_attn as msda
-except:
-    print(
-        "Import ms_deform_attn failed. Please refer to the following doc to install the ms_deform_attn lib: "
-        "https://github.com/PaddlePaddle/PaddleSeg/tree/develop/configs/upernet_vit_adapter"
-    )
+from paddleseg.models.layers.ms_deformable_attention import MSDeformAttn


 def get_reference_points(spatial_shapes):
@@ -37,7 +29,7 @@ def get_reference_points(spatial_shapes):


 def deform_inputs(x):
-    bs, c, h, w = x.shape
+    _, _, h, w = x.shape
     spatial_shapes = paddle.to_tensor(
         [(h // 8, w // 8), (h // 16, w // 16), (h // 32, w // 32)],
         dtype='int64')
@@ -56,14 +48,35 @@ def deform_inputs(x):
     return deform_inputs1, deform_inputs2


-def _is_power_of_2(n):
-    if (not isinstance(n, int)) or (n < 0):
-        raise ValueError('invalid input for _is_power_of_2: {} (type: {})'.
-                         format(n, type(n)))
-    return (n & (n - 1) == 0) and n != 0
+class DWConv(nn.Layer):
+    """
+    The depthwise convolution used in ConvFFN.
+    """
+
+    def __init__(self, dim=768):
+        super().__init__()
+        self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim)
+
+    def forward(self, x, H, W):
+        B, N, C = x.shape
+        n = N // 21
+        x1 = x[:, 0:16 * n, :].transpose([0, 2, 1]).reshape(
+            [B, C, H * 2, W * 2])
+        x2 = x[:, 16 * n:20 * n, :].transpose([0, 2, 1]).reshape([B, C, H, W])
+        x3 = x[:, 20 * n:, :].transpose([0, 2, 1]).reshape(
+            [B, C, H // 2, W // 2])
+        x1 = self.dwconv(x1).flatten(2).transpose([0, 2, 1])
+        x2 = self.dwconv(x2).flatten(2).transpose([0, 2, 1])
+        x3 = self.dwconv(x3).flatten(2).transpose([0, 2, 1])
+        x = paddle.concat([x1, x2, x3], axis=1)
+        return x


 class ConvFFN(nn.Layer):
+    """
+    The implementation of ConvFFN used in Extractor.
+    """
+
     def __init__(self,
                  in_features,
                  hidden_features=None,
@@ -89,156 +102,11 @@ def forward(self, x, H, W):
         return x


-class DWConv(nn.Layer):
-    def __init__(self, dim=768):
-        super().__init__()
-        self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim)
-
-    def forward(self, x, H, W):
-        B, N, C = x.shape
-        n = N // 21
-        x1 = x[:, 0:16 * n, :].transpose([0, 2, 1]).reshape(
-            [B, C, H * 2, W * 2])
-        x2 = x[:, 16 * n:20 * n, :].transpose([0, 2, 1]).reshape([B, C, H, W])
-        x3 = x[:, 20 * n:, :].transpose([0, 2, 1]).reshape(
-            [B, C, H // 2, W // 2])
-        x1 = self.dwconv(x1).flatten(2).transpose([0, 2, 1])
-        x2 = self.dwconv(x2).flatten(2).transpose([0, 2, 1])
-        x3 = self.dwconv(x3).flatten(2).transpose([0, 2, 1])
-        x = paddle.concat([x1, x2, x3], axis=1)
-        return x
-
-
-class MSDeformAttn(nn.Layer):
-    def __init__(self,
-                 d_model=256,
-                 n_levels=4,
-                 n_heads=8,
-                 n_points=4,
-                 ratio=1.0):
-        """Multi-Scale Deformable Attention Module.
- - :param d_model hidden dimension - :param n_levels number of feature levels - :param n_heads number of attention heads - :param n_points number of sampling points per attention head per feature level - """ - super().__init__() - if d_model % n_heads != 0: - raise ValueError('d_model must be divisible by n_heads, ' - 'but got {} and {}'.format(d_model, n_heads)) - _d_per_head = d_model // n_heads - # you'd better set _d_per_head to a power of 2 - # which is more efficient in our CUDA implementation - if not _is_power_of_2(_d_per_head): - warnings.warn("You'd better set d_model in MSDeformAttn to make " - 'the dimension of each attention head a power of 2 ' - 'which is more efficient in our CUDA implementation.') - - self.im2col_step = 64 - - self.d_model = d_model - self.n_levels = n_levels - self.n_heads = n_heads - self.n_points = n_points - self.ratio = ratio - self.sampling_offsets = nn.Linear(d_model, - n_heads * n_levels * n_points * 2) - self.attention_weights = nn.Linear(d_model, - n_heads * n_levels * n_points) - self.value_proj = nn.Linear(d_model, int(d_model * ratio)) - self.output_proj = nn.Linear(int(d_model * ratio), d_model) - - self._reset_parameters() - - def _reset_parameters(self): - constant_init(self.sampling_offsets.weight, value=0.) - thetas = paddle.arange( - self.n_heads, dtype='float32') * (2.0 * math.pi / self.n_heads) - grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) - grid_init = (grid_init / grid_init.abs().max( - -1, keepdim=True)[0]).reshape([self.n_heads, 1, 1, 2]).tile( - [1, self.n_levels, self.n_points, 1]) - for i in range(self.n_points): - grid_init[:, :, i, :] *= i + 1 - - grid_init = grid_init.reshape([-1]) - self.sampling_offsets.bias = self.create_parameter( - shape=grid_init.shape, - default_initializer=paddle.nn.initializer.Assign(grid_init)) - self.sampling_offsets.bias.stop_gradient = True - - constant_init(self.attention_weights.weight, value=0.) - constant_init(self.attention_weights.bias, value=0.) - xavier_uniform(self.value_proj.weight) - constant_init(self.value_proj.bias, value=0.) - xavier_uniform(self.output_proj.weight) - constant_init(self.output_proj.bias, value=0.) 
- - def forward(self, - query, - reference_points, - input_flatten, - input_spatial_shapes, - input_level_start_index, - input_padding_mask=None): - """ - :param query (N, Length_{query}, C) - :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area - or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes - :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) - :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] - :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] - :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements - - :return output (N, Length_{query}, C) - """ - - def masked_fill(x, mask, value): - y = paddle.full(x.shape, value, x.dtype) - return paddle.where(mask, y, x) - - N, Len_q, _ = query.shape - N, Len_in, _ = input_flatten.shape - assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1] - ).sum() == Len_in - - value = self.value_proj(input_flatten) - if input_padding_mask is not None: - value = masked_fill(value, input_padding_mask[..., None], float(0)) - - value = value.reshape([ - N, Len_in, self.n_heads, - int(self.ratio * self.d_model) // self.n_heads - ]) - sampling_offsets = self.sampling_offsets(query).reshape( - [N, Len_q, self.n_heads, self.n_levels, self.n_points, 2]) - attention_weights = self.attention_weights(query).reshape( - [N, Len_q, self.n_heads, self.n_levels * self.n_points]) - attention_weights = F.softmax(attention_weights, -1).\ - reshape([N, Len_q, self.n_heads, self.n_levels, self.n_points]) - - if reference_points.shape[-1] == 2: - offset_normalizer = paddle.stack( - [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], - -1) - sampling_locations = reference_points[:, :, None, :, None, :] \ - + sampling_offsets / offset_normalizer[None, None, None, :, None, :] - elif reference_points.shape[-1] == 4: - sampling_locations = reference_points[:, :, None, :, None, :2] \ - + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 - else: - raise ValueError( - 'Last dim of reference_points must be 2 or 4, but get {} instead.' - .format(reference_points.shape[-1])) - output = msda.ms_deform_attn( - value, input_spatial_shapes, input_level_start_index, - sampling_locations, attention_weights, self.im2col_step) - output = self.output_proj(output) - return output - - class Extractor(nn.Layer): + """ + The Extractor module in ViT-Adapter. + """ + def __init__(self, dim, num_heads=6, @@ -272,23 +140,21 @@ def __init__(self, def forward(self, query, reference_points, feat, spatial_shapes, level_start_index, H, W): - def _inner_forward(query, feat): - attn = self.attn( - self.query_norm(query), reference_points, - self.feat_norm(feat), spatial_shapes, level_start_index, None) - query = query + attn - - if self.with_cffn: - query = query + self.drop_path( - self.ffn(self.ffn_norm(query), H, W)) - return query - - query = _inner_forward(query, feat) + attn = self.attn( + self.query_norm(query), reference_points, + self.feat_norm(feat), spatial_shapes, level_start_index, None) + query = query + attn + if self.with_cffn: + query = query + self.drop_path(self.ffn(self.ffn_norm(query), H, W)) return query class Injector(nn.Layer): + """ + The Injector module in ViT-Adapter. 
+    """
+
     def __init__(self,
                  dim,
                  num_heads=6,
@@ -314,18 +180,17 @@ def __init__(self,

     def forward(self, query, reference_points, feat, spatial_shapes,
                 level_start_index):
-        def _inner_forward(query, feat):
-            attn = self.attn(
-                self.query_norm(query), reference_points,
-                self.feat_norm(feat), spatial_shapes, level_start_index, None)
-            return query + self.gamma * attn
-
-        query = _inner_forward(query, feat)
-
-        return query
+        attn = self.attn(
+            self.query_norm(query), reference_points,
+            self.feat_norm(feat), spatial_shapes, level_start_index, None)
+        return query + self.gamma * attn


 class InteractionBlock(nn.Layer):
+    """
+    Combine the Injector, the Extractor and the ViT blocks.
+    """
+
     def __init__(self,
                  dim,
                  num_heads=6,
@@ -377,20 +242,15 @@ def __init__(self,
             self.extra_extractors = None

     def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W):
-        debug = False
         x = self.injector(
             query=x,
             reference_points=deform_inputs1[0],
             feat=c,
             spatial_shapes=deform_inputs1[1],
             level_start_index=deform_inputs1[2])
-        if debug:
-            print('x', x.cpu().numpy().mean())

         for idx, blk in enumerate(blocks):
             x = blk(x, H, W)
-            if debug:
-                print('x block_{}'.format(idx), x.cpu().numpy().mean())

         c = self.extractor(
             query=c,
             reference_points=deform_inputs2[0],
             feat=x,
             spatial_shapes=deform_inputs2[1],
             level_start_index=deform_inputs2[2],
             H=H,
             W=W)
-        if debug:
-            print('c', c.cpu().numpy().mean())

         if self.extra_extractors is not None:
             for extractor in self.extra_extractors:
                 c = extractor(
                     query=c,
                     reference_points=deform_inputs2[0],
                     feat=x,
                     spatial_shapes=deform_inputs2[1],
                     level_start_index=deform_inputs2[2],
                     H=H,
                     W=W)
-                if debug:
-                    print('c', c.cpu().numpy().mean())

         return x, c
@@ -588,22 +444,18 @@ def __init__(self, inplanes=64, embed_dim=384):
                 bias_attr=True)

     def forward(self, x):
-        def _inner_forward(x):
-            c1 = self.stem(x)
-            c2 = self.conv2(c1)
-            c3 = self.conv3(c2)
-            c4 = self.conv4(c3)
-            c1 = self.fc1(c1)
-            c2 = self.fc2(c2)
-            c3 = self.fc3(c3)
-            c4 = self.fc4(c4)
-
-            bs, dim, _, _ = c1.shape
-            c2 = c2.reshape([bs, dim, -1]).transpose([0, 2, 1])  # 8s
-            c3 = c3.reshape([bs, dim, -1]).transpose([0, 2, 1])  # 16s
-            c4 = c4.reshape([bs, dim, -1]).transpose([0, 2, 1])  # 32s
-
-            return c1, c2, c3, c4
-
-        outs = _inner_forward(x)
-        return outs
+        c1 = self.stem(x)
+        c2 = self.conv2(c1)
+        c3 = self.conv3(c2)
+        c4 = self.conv4(c3)
+        c1 = self.fc1(c1)
+        c2 = self.fc2(c2)
+        c3 = self.fc3(c3)
+        c4 = self.fc4(c4)
+
+        bs, dim, _, _ = c1.shape
+        c2 = c2.reshape([bs, dim, -1]).transpose([0, 2, 1])  # 8s
+        c3 = c3.reshape([bs, dim, -1]).transpose([0, 2, 1])  # 16s
+        c4 = c4.reshape([bs, dim, -1]).transpose([0, 2, 1])  # 32s
+
+        return c1, c2, c3, c4
diff --git a/paddleseg/models/upernet_vit_adapter.py b/paddleseg/models/upernet_vit_adapter.py
index b158beef9e..cb9dcfd28f 100644
--- a/paddleseg/models/upernet_vit_adapter.py
+++ b/paddleseg/models/upernet_vit_adapter.py
@@ -31,6 +31,8 @@ class UPerNetViTAdapter(nn.Layer):
     "Vision Transformer Adapter for Dense Predictions."
     (https://arxiv.org/abs/2205.08534).

+    The implementation is based on https://github.com/czczup/ViT-Adapter
+
     Args:
         num_classes (int): The unique number of target classes.
         backbone (nn.Layer): The backbone network.
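One thing the diffs above make easy to miss is the fixed token budget shared by SpatialPriorModule (the 8s/16s/32s reshapes) and the `N // 21` split inside DWConv. A worked example for a 512x512 input with patch_size 16 — plain arithmetic, nothing here is Paddle API:

```python
H = W = 512 // 16    # 32x32 ViT token grid at 1/16 scale
n = (H * W) // 4     # 256 tokens at 1/32 scale
c2 = 16 * n          # 4096 tokens at 1/8 scale  (64x64)
c3 = 4 * n           # 1024 tokens at 1/16 scale (32x32)
c4 = 1 * n           #  256 tokens at 1/32 scale (16x16)
N = c2 + c3 + c4     # length of the concatenated prior c
assert N == 21 * n   # hence x[:, :16 * n], x[:, 16 * n:20 * n], x[:, 20 * n:] in DWConv
print(N, n)          # 5376 256
```

The 16:4:1 ratio holds for any input whose sides are multiples of 32, which is why the validation transforms resize with size_divisor: 32.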
From 539a1f03cbce17b1726c15f62214c547ffddf90b Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 28 Nov 2022 15:42:16 +0800 Subject: [PATCH 12/16] refine for merge 4 --- configs/vit_adapter/README.md | 2 +- paddleseg/models/layers/ms_deformable_attention.py | 10 +++++----- paddleseg/models/layers/vit_adapter_layers.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/vit_adapter/README.md b/configs/vit_adapter/README.md index d364c3c8dd..aab2e7e458 100644 --- a/configs/vit_adapter/README.md +++ b/configs/vit_adapter/README.md @@ -10,4 +10,4 @@ | Model | Backbone | Resolution | Training Iters | mIoU | mIoU (flip) | mIoU (ms+flip) | Links | |-|-|-|-|-|-|-|-| -|UPerNetViTAdapter|ViT-Adapter-Tiny|512x512|160000|%|%|%|[model]() \| [log]() \| [vdl]()| +|UPerNetViTAdapter|ViT-Adapter-Tiny|512x512|160000|41.90%|-|-|[model](https://paddleseg.bj.bcebos.com/dygraph/ade20k/upernet_vit_adapter_tiny_ade20k_512x512_160k/model.pdparams) \| [log](https://paddleseg.bj.bcebos.com/dygraph/ade20k/upernet_vit_adapter_tiny_ade20k_512x512_160k/train_log.txt) \| [vdl](https://paddlepaddle.org.cn/paddle/visualdl/service/app?id=88173046bd09f61da5f48db66baddd7d)| diff --git a/paddleseg/models/layers/ms_deformable_attention.py b/paddleseg/models/layers/ms_deformable_attention.py index 37e18c22ae..880d12dca2 100644 --- a/paddleseg/models/layers/ms_deformable_attention.py +++ b/paddleseg/models/layers/ms_deformable_attention.py @@ -10,12 +10,12 @@ from paddleseg.cvlibs.param_init import constant_init, xavier_uniform try: - import ms_deform_attn as msda + import ms_deform_attn except: print( - "Import ms_deform_attn failed. Please first refer to the following document to install " - "ms_deform_attn lib, and then use multi-scale deformable attention module: " - "https://github.com/PaddlePaddle/PaddleSeg/tree/develop/configs/upernet_vit_adapter" + "Import ms_deform_attn failed. Please download the following file and refer to " + "the readme to install ms_deform_attn lib: " + "https://paddleseg.bj.bcebos.com/dygraph/customized_ops/ms_deform_attn.zip" ) @@ -152,7 +152,7 @@ def masked_fill(x, mask, value): raise ValueError( 'Last dim of reference_points must be 2 or 4, but get {} instead.' 
.format(reference_points.shape[-1])) - output = msda.ms_deform_attn( + output = ms_deform_attn.ms_deform_attn( value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) output = self.output_proj(output) diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py index fef897bfa6..6735331db9 100644 --- a/paddleseg/models/layers/vit_adapter_layers.py +++ b/paddleseg/models/layers/vit_adapter_layers.py @@ -249,7 +249,7 @@ def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W): spatial_shapes=deform_inputs1[1], level_start_index=deform_inputs1[2]) - for idx, blk in enumerate(blocks): + for _, blk in enumerate(blocks): x = blk(x, H, W) c = self.extractor( @@ -334,7 +334,7 @@ def forward(self, x, c, cls, blocks, deform_inputs1, deform_inputs2, H, W): spatial_shapes=deform_inputs1[1], level_start_index=deform_inputs1[2]) x = paddle.concat((cls, x), axis=1) - for idx, blk in enumerate(blocks): + for _, blk in enumerate(blocks): x = blk(x, H, W) cls, x = x[:, :1, ], x[:, 1:, ] c = self.extractor( From d0b89c6bb221838d623310244cd2e3e3e5c2b54c Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 28 Nov 2022 16:35:24 +0800 Subject: [PATCH 13/16] refine for merge 5 --- paddleseg/core/val.py | 2 ++ paddleseg/models/backbones/vit_adapter.py | 3 ++- paddleseg/models/layers/ms_deformable_attention.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/paddleseg/core/val.py b/paddleseg/core/val.py index 80a820b6bc..828edd5bdb 100644 --- a/paddleseg/core/val.py +++ b/paddleseg/core/val.py @@ -98,6 +98,8 @@ def evaluate(model, batch_start = time.time() with paddle.no_grad(): for iter, data in enumerate(loader): + if iter % 20 == 0: + print('({} / {}'.format(iter, total_iters)) reader_cost_averager.record(time.time() - batch_start) label = data['label'].astype('int64') diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index 649e89c9d5..6d5366d0f2 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -14,7 +14,8 @@ from paddleseg.cvlibs.param_init import normal_init, trunc_normal_init, constant_init from paddleseg.models.backbones.transformer_utils import to_2tuple, DropPath from paddleseg.models.layers.vit_adapter_layers import ( - SpatialPriorModule, InteractionBlock, deform_inputs, MSDeformAttn) + SpatialPriorModule, InteractionBlock, deform_inputs) +from paddleseg.models.layers.ms_deformable_attention import MSDeformAttn __all__ = ['ViTAdapter', 'ViTAdapter_Tiny'] diff --git a/paddleseg/models/layers/ms_deformable_attention.py b/paddleseg/models/layers/ms_deformable_attention.py index 880d12dca2..0df00e44d9 100644 --- a/paddleseg/models/layers/ms_deformable_attention.py +++ b/paddleseg/models/layers/ms_deformable_attention.py @@ -1,4 +1,5 @@ # This file is heavily based on https://github.com/czczup/ViT-Adapter + import math import warnings From e5ba028178bc8cf4c5cfa7997e95ad3bd2fb2922 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 28 Nov 2022 17:03:43 +0800 Subject: [PATCH 14/16] refine for merge 6 --- paddleseg/models/layers/ms_deformable_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddleseg/models/layers/ms_deformable_attention.py b/paddleseg/models/layers/ms_deformable_attention.py index 0df00e44d9..4b7bc143c5 100644 --- a/paddleseg/models/layers/ms_deformable_attention.py +++ 
b/paddleseg/models/layers/ms_deformable_attention.py
@@ -1,5 +1,4 @@
 # This file is heavily based on https://github.com/czczup/ViT-Adapter
-
 import math
 import warnings

@@ -18,6 +17,7 @@
         "the readme to install ms_deform_attn lib: "
         "https://paddleseg.bj.bcebos.com/dygraph/customized_ops/ms_deform_attn.zip"
     )
+    exit()


 class MSDeformAttn(nn.Layer):

From e7cf06533b7d878c3815d54c987c2b9b51f0afce Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Tue, 14 Mar 2023 19:59:43 +0800
Subject: [PATCH 15/16] up

---
 .../upernet_vit_adapter_tiny_ade20k_512x512_160k.yml | 10 ----------
 paddleseg/core/val.py                                |  2 --
 2 files changed, 12 deletions(-)

diff --git a/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml b/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml
index 1ebcab40b3..fbc2110a29 100644
--- a/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml
+++ b/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml
@@ -33,16 +33,6 @@ test_config:
   is_slide: True
   crop_size: [512, 512]
   stride: [341, 341]
-
-export:
-  transforms:
-    - type: Resize
-      target_size: [2048, 512]
-      keep_ratio: True
-      size_divisor: 32
-    - type: Normalize
-      mean: [0.485, 0.456, 0.406]
-      std: [0.229, 0.224, 0.225]

 optimizer:
   _inherited_: False
diff --git a/paddleseg/core/val.py b/paddleseg/core/val.py
index 828edd5bdb..80a820b6bc 100644
--- a/paddleseg/core/val.py
+++ b/paddleseg/core/val.py
@@ -98,8 +98,6 @@ def evaluate(model,
     batch_start = time.time()
     with paddle.no_grad():
         for iter, data in enumerate(loader):
-            if iter % 20 == 0:
-                print('({} / {}'.format(iter, total_iters))
             reader_cost_averager.record(time.time() - batch_start)
             label = data['label'].astype('int64')

From 78bc4b25d40968edc4f17043f025063f0ee754c9 Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Fri, 17 Mar 2023 10:22:54 +0800
Subject: [PATCH 16/16] fix import ms_deform_attn

---
 configs/vit_adapter/README.md                  |  3 +++
 .../models/layers/ms_deformable_attention.py   | 19 +++++++++----------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/configs/vit_adapter/README.md b/configs/vit_adapter/README.md
index aab2e7e458..ff904971fc 100644
--- a/configs/vit_adapter/README.md
+++ b/configs/vit_adapter/README.md
@@ -4,6 +4,9 @@

 > Chen, Zhe, Yuchen Duan, Wenhai Wang, Junjun He, Tong Lu, Jifeng Dai, and Yu Qiao. "Vision Transformer Adapter for Dense Predictions." arXiv preprint arXiv:2205.08534 (2022).

+## Prerequisites
+
+Download ms_deform_attn.zip (https://paddleseg.bj.bcebos.com/dygraph/customized_ops/ms_deform_attn.zip), and then refer to its readme to install the ms_deform_attn lib.

 ## Performance
 ### ADE20K
diff --git a/paddleseg/models/layers/ms_deformable_attention.py b/paddleseg/models/layers/ms_deformable_attention.py
index 4b7bc143c5..8af9f36679 100644
--- a/paddleseg/models/layers/ms_deformable_attention.py
+++ b/paddleseg/models/layers/ms_deformable_attention.py
@@ -9,16 +9,6 @@
 from paddleseg.cvlibs import param_init
 from paddleseg.cvlibs.param_init import constant_init, xavier_uniform

-try:
-    import ms_deform_attn
-except:
-    print(
-        "Import ms_deform_attn failed. Please download the following file and refer to "
-        "the readme to install ms_deform_attn lib: "
-        "https://paddleseg.bj.bcebos.com/dygraph/customized_ops/ms_deform_attn.zip"
-    )
-    exit()
-

 class MSDeformAttn(nn.Layer):
     def __init__(self,
@@ -153,6 +143,15 @@ def masked_fill(x, mask, value):
             raise ValueError(
                 'Last dim of reference_points must be 2 or 4, but get {} instead.'
                .format(reference_points.shape[-1]))
+        try:
+            import ms_deform_attn
+        except ImportError:
+            print(
+                "Import ms_deform_attn failed. Please download the following file and refer to "
+                "the readme to install the ms_deform_attn lib: "
+                "https://paddleseg.bj.bcebos.com/dygraph/customized_ops/ms_deform_attn.zip"
+            )
+            raise
         output = ms_deform_attn.ms_deform_attn(
             value, input_spatial_shapes, input_level_start_index,
             sampling_locations, attention_weights, self.im2col_step)
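With patch 16 applied, the custom op is only required once a forward pass actually runs. A minimal smoke test of the final wiring — the constructor arguments are taken from the yml and docstrings in this series, and it assumes the ms_deform_attn lib from the Prerequisites section is installed:

```python
import paddle
from paddleseg.models.backbones.vit_adapter import ViTAdapter_Tiny
from paddleseg.models.upernet_vit_adapter import UPerNetViTAdapter

backbone = ViTAdapter_Tiny()  # optionally pass pretrained=<path or url>
model = UPerNetViTAdapter(
    num_classes=150,              # ADE20K
    backbone=backbone,
    backbone_indices=[0, 1, 2, 3],
    channels=512,
    pool_scales=[1, 2, 3, 6],
    dropout_ratio=0.1,
    aux_loss=True,
    aux_channels=256)

x = paddle.randn([1, 3, 512, 512])
logits = model(x)                 # list: main logits (+ aux logits in train mode)
print([tuple(l.shape) for l in logits])
```

Because the final head only appends the auxiliary output when `self.training` is true, evaluation and export see a single output tensor while training gets both terms for the [1, 0.4] loss coefficients.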