From f73f5529a3e81852e18c0f79eea7518809442a45 Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Thu, 10 Nov 2022 15:01:08 +0800
Subject: [PATCH 01/16] add vit-adapter and align backbone forward

---
 configs/vit_adapter/README.md                 |  15 +
 ...rnet_deit_adapter_tiny_512_160k_ade20k.yml |  77 +++
 paddleseg/models/backbones/__init__.py        |   1 +
 paddleseg/models/backbones/vit_adapter.py     | 458 +++++++++++++++++
 paddleseg/models/layers/vit_adapter_layers.py | 462 ++++++++++++++++++
 5 files changed, 1013 insertions(+)
 create mode 100644 configs/vit_adapter/README.md
 create mode 100644 configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
 create mode 100644 paddleseg/models/backbones/vit_adapter.py
 create mode 100644 paddleseg/models/layers/vit_adapter_layers.py

diff --git a/configs/vit_adapter/README.md b/configs/vit_adapter/README.md
new file mode 100644
index 0000000000..d7a15b0ca3
--- /dev/null
+++ b/configs/vit_adapter/README.md
@@ -0,0 +1,15 @@
+# Vision Transformer Adapter for Dense Predictions
+
+## Reference
+
+> Zhe Chen, Yuchen Duan, Wenhai Wang, Junjun He, Tong Lu, Jifeng Dai, Yu Qiao:
+Vision Transformer Adapter for Dense Predictions. arXiv preprint arXiv:2205.08534 (2022).
+
+## Performance
+
+### ADE20K
+
+| Model | Backbone | Resolution | Training Iters | mIoU | mIoU (flip) | mIoU (ms+flip) | Links |
+|-|-|-|-|-|-|-|-|
+|UPerNet|ViT-Adapter-Tiny|512x512|160000|-|-|-|-|
+<!-- Results and download links will be added once training is aligned with the reference implementation. -->
diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
new file mode 100644
index 0000000000..32bfe32140
--- /dev/null
+++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
@@ -0,0 +1,77 @@
+_base_: '../_base_/ade20k.yml'
+
+batch_size: 4  # total batch size is 16
+iters: 160000
+
+train_dataset:
+  transforms:
+    - type: ResizeStepScaling
+      min_scale_factor: 0.5
+      max_scale_factor: 2.0
+      scale_step_size: 0.25
+    - type: RandomPaddingCrop
+      crop_size: [512, 512]
+    - type: RandomHorizontalFlip
+    - type: RandomDistort
+      brightness_range: 0.4
+      contrast_range: 0.4
+      saturation_range: 0.4
+    - type: Normalize
+      mean: [0.485, 0.456, 0.406]
+      std: [0.229, 0.224, 0.225]
+
+val_dataset:
+  transforms:
+    - type: Resize
+      target_size: [2048, 512]
+      keep_ratio: True
+      size_divisor: 32
+    - type: Normalize
+      mean: [0.485, 0.456, 0.406]
+      std: [0.229, 0.224, 0.225]
+
+export:
+  transforms:
+    - type: Resize
+      target_size: [2048, 512]
+      keep_ratio: True
+      size_divisor: 32
+    - type: Normalize
+      mean: [0.485, 0.456, 0.406]
+      std: [0.229, 0.224, 0.225]
+
+optimizer:
+  _inherited_: False
+  type: AdamW
+  weight_decay: 0.01
+
+lr_scheduler:
+  type: PolynomialDecay
+  learning_rate: 0.0012
+  end_lr: 0
+  power: 1.0
+  warmup_iters: 1500
+  warmup_start_lr: 1.0e-6
+
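+# Note: with power 1.0, PolynomialDecay lowers the learning rate linearly from
+# learning_rate down to end_lr over the course of training, after a linear
+# warmup from warmup_start_lr during the first warmup_iters steps.
+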
+loss: + types: + - type: CrossEntropyLoss + coef: [1] + +model: + type: TopFormer + backbone: + type: ViTAdapter + num_heads: 3 + patch_size: 16 + embed_dim: 192 + depth: 12 + mlp_ratio: 4 + drop_path_rate: 0.1 + conv_inplane: 64 + n_points: 4 + deform_num_heads: 6 + cffn_ratio: 0.25 + deform_ratio: 1.0 + interaction_indexes: [[0, 2], [3, 5], [6, 8], [9, 11]] + pretrained: pretrained_model/upernet_deit_adapter_tiny_512_160_ade20k_from_torch.pdparams \ No newline at end of file diff --git a/paddleseg/models/backbones/__init__.py b/paddleseg/models/backbones/__init__.py index 2241aaf77a..d5088b42c5 100644 --- a/paddleseg/models/backbones/__init__.py +++ b/paddleseg/models/backbones/__init__.py @@ -26,3 +26,4 @@ from .ghostnet import * from .top_transformer import * from .uhrnet import * +from .vit_adapter import * diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py new file mode 100644 index 0000000000..3e4269423b --- /dev/null +++ b/paddleseg/models/backbones/vit_adapter.py @@ -0,0 +1,458 @@ +# The ViT-Adapter code was heavily based on https://github.com/czczup/ViT-Adapter + +import math +from functools import partial + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Uniform, KaimingNormal +from paddle.nn import Conv2D, BatchNorm, AdaptiveAvgPool2D, Linear + +from paddleseg.cvlibs import manager +from paddleseg.cvlibs.param_init import normal_init, trunc_normal_init, constant_init +from paddleseg.utils import utils, logger +from paddleseg.models.backbones.transformer_utils import to_2tuple, DropPath + +from paddleseg.models.layers.vit_adapter_layers import SpatialPriorModule, InteractionBlock, deform_inputs + +__all__ = ['ViTAdapter'] + + +class PatchEmbed(nn.Layer): + """2D Image to Patch Embedding.""" + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], + img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + if self.flatten: + x = x.flatten(2).transpose([0, 2, 1]) # BCHW -> BNC + x = self.norm(x) + return x, H, W + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + 
self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, H, W): + x_shape = paddle.shape(x) + N, C = x_shape[1], x_shape[2] + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // + self.num_heads)).transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + layer_scale=False): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity( + ) + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.layer_scale = layer_scale + if layer_scale: + self.gamma1 = self.create_parameter( + shape=(dim, ), + default_initializer=paddle.nn.initializer.Constant(value=1.)) + self.gamma2 = self.create_parameter( + shape=(dim, ), + default_initializer=paddle.nn.initializer.Constant(value=1.)) + + def forward(self, x, H, W): + if self.layer_scale: + x = x + self.drop_path(self.gamma1 * self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class VisionTransformer(nn.Layer): + """Vision Transformer. 
+ + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + + Includes distillation token & head support for `DeiT: Data-efficient Image Transformers` + - https://arxiv.org/abs/2012.12877 + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + layer_scale=True, + embed_layer=PatchEmbed, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + act_layer=nn.GELU, + pretrained=None): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_channels (int): number of input channels + num_classes (int): number of classes for classification head + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + pretrained: (str): pretrained path + """ + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + norm_layer = norm_layer or partial(nn.LayerNorm, epsilon=1e-6) + act_layer = act_layer or nn.GELU + self.norm_layer = norm_layer + self.act_layer = act_layer + self.pretrain_size = img_size + self.drop_path_rate = drop_path_rate + self.drop_rate = drop_rate + + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_channels, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.pos_embed = self.create_parameter( + shape=(1, num_patches + self.num_tokens, embed_dim), + default_initializer=paddle.nn.initializer.Constant(value=0.)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = np.linspace(0, drop_path_rate, + depth) # stochastic depth decay rule + self.blocks = nn.Sequential(*[ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + layer_scale=layer_scale) for i in range(depth) + ]) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + utils.load_pretrained_model(self, self.pretrained) + + def forward_features(self, x): + x, H, W = self.patch_embed(x) + cls_token = self.cls_token.expand( + x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = paddle.concat([cls_tokens, x], axis=1) + x = self.pos_drop(x + self.pos_embed) + for blk in self.blocks: + x = blk(x, H, W) + return x + + def forward(self, x): + x = self.forward_features(x) + return x + + +@manager.BACKBONES.add_component +class ViTAdapter(VisionTransformer): + """ The ViT-Adapter + """ + + def __init__(self, + pretrain_size=224, + num_heads=12, + conv_inplane=64, + n_points=4, + deform_num_heads=6, + init_values=0., + interaction_indexes=None, + with_cffn=True, + cffn_ratio=0.25, + deform_ratio=1.0, + add_vit_feature=True, + pretrained=None, + use_extra_extractor=True, + *args, + **kwargs): + + super().__init__( + num_heads=num_heads, pretrained=pretrained, *args, **kwargs) + + 
self.cls_token = None + self.num_block = len(self.blocks) + self.pretrain_size = (pretrain_size, pretrain_size) + self.interaction_indexes = interaction_indexes + self.add_vit_feature = add_vit_feature + embed_dim = self.embed_dim + + self.level_embed = self.create_parameter( + shape=(3, embed_dim), + default_initializer=paddle.nn.initializer.Constant(value=0.)) + self.spm = SpatialPriorModule( + inplanes=conv_inplane, embed_dim=embed_dim) + self.interactions = nn.Sequential(*[ + InteractionBlock( + dim=embed_dim, + num_heads=deform_num_heads, + n_points=n_points, + init_values=init_values, + drop_path=self.drop_path_rate, + norm_layer=self.norm_layer, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + deform_ratio=deform_ratio, + extra_extractor=((True if i == len(interaction_indexes) - 1 else + False) and use_extra_extractor)) + for i in range(len(interaction_indexes)) + ]) + self.up = nn.Conv2DTranspose(embed_dim, embed_dim, 2, 2) + self.norm1 = nn.SyncBatchNorm(embed_dim) + self.norm2 = nn.SyncBatchNorm(embed_dim) + self.norm3 = nn.SyncBatchNorm(embed_dim) + self.norm4 = nn.SyncBatchNorm(embed_dim) + + self.up.apply(self._init_weights) + self.spm.apply(self._init_weights) + self.interactions.apply(self._init_weights) + self.apply(self._init_deform_weights) + normal_init(self.level_embed) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_init(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + constant_init(m.bias, value=0) + elif isinstance(m, nn.LayerNorm) or isinstance(m, (nn.BatchNorm2D, + nn.SyncBatchNorm)): + constant_init(m.bias, value=0) + constant_init(m.weight, value=1.0) + elif isinstance(m, nn.Conv2D) or isinstance(m, nn.Conv2DTranspose): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + normal_init(m.weight, std=math.sqrt(2.0 / fan_out)) + if m.bias is not None: + constant_init(m.bias, value=0) + + def _get_pos_embed(self, pos_embed, H, W): + pos_embed = pos_embed.reshape( + [1, self.pretrain_size[0] // 16, self.pretrain_size[1] // 16, + -1]).transpose([0, 3, 1, 2]) + pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False).\ + reshape([1, -1, H * W]).transpose([0, 2, 1]) + return pos_embed + + def _init_deform_weights(self, m): + ''' + if isinstance(m, MSDeformAttn): + m._reset_parameters() + ''' + pass + + def _add_level_embed(self, c2, c3, c4): + c2 = c2 + self.level_embed[0] + c3 = c3 + self.level_embed[1] + c4 = c4 + self.level_embed[2] + return c2, c3, c4 + + def forward(self, x): + debug = True + if debug: + import random + import numpy as np + random.seed(0) + np.random.seed(0) + x = np.random.rand(1, 3, 512, 512).astype("float32") + x = paddle.to_tensor(x, dtype='float32') + print('x0:', x.numpy().mean()) + + deform_inputs1, deform_inputs2 = deform_inputs(x) + + # SPM forward + c1, c2, c3, c4 = self.spm(x) + c2, c3, c4 = self._add_level_embed(c2, c3, c4) + c = paddle.concat([c2, c3, c4], axis=1) + + if debug: + print('----2----') + for i in deform_inputs1: + print(i.numpy().mean()) + for i in deform_inputs2: + print(i.numpy().mean()) + + # Patch Embedding forward + x, H, W = self.patch_embed(x) + bs, n, dim = x.shape + pos_embed = self._get_pos_embed(self.pos_embed[:, 1:], H, W) + x = self.pos_drop(x + pos_embed) + + if debug: + print('-------3----') + print(x.numpy().mean()) + + # Interaction + outs = list() + for i, layer in enumerate(self.interactions): + indexes = self.interaction_indexes[i] + x, c = layer(x, c, 
self.blocks[indexes[0]:indexes[-1] + 1], + deform_inputs1, deform_inputs2, H, W) + outs.append(x.transpose([0, 2, 1]).reshape([bs, dim, H, W])) + if debug: + print('-----4-{}------'.format(i)) + print(x.numpy().mean()) + print(c.numpy().mean()) + +# Split & Reshape + c2 = c[:, 0:c2.shape[1], :] + c3 = c[:, c2.shape[1]:c2.shape[1] + c3.shape[1], :] + c4 = c[:, c2.shape[1] + c3.shape[1]:, :] + + c2 = c2.transpose([0, 2, 1]).reshape([bs, dim, H * 2, W * 2]) + c3 = c3.transpose([0, 2, 1]).reshape([bs, dim, H, W]) + c4 = c4.transpose([0, 2, 1]).reshape([bs, dim, H // 2, W // 2]) + c1 = self.up(c2) + c1 + + if self.add_vit_feature: + x1, x2, x3, x4 = outs + x1 = F.interpolate( + x1, scale_factor=4, mode='bilinear', align_corners=False) + x2 = F.interpolate( + x2, scale_factor=2, mode='bilinear', align_corners=False) + x4 = F.interpolate( + x4, scale_factor=0.5, mode='bilinear', align_corners=False) + c1, c2, c3, c4 = c1 + x1, c2 + x2, c3 + x3, c4 + x4 + + # Final Norm + f1 = self.norm1(c1) + f2 = self.norm2(c2) + f3 = self.norm3(c3) + f4 = self.norm4(c4) + if debug: + print('-----5------') + print(f1.cpu().numpy().mean()) + print(f2.cpu().numpy().mean()) + print(f3.cpu().numpy().mean()) + print(f4.cpu().numpy().mean()) + # f1 = f1.cpu().numpy().mean() + # with msdeformatt + #assert np.allclose(f1, -0.03254774, rtol=0.0, atol=1e-6) + # without msdeformatt + #assert np.allclose(f1, -0.024487903, rtol=0.0, atol=1e-6) + exit() + return [f1, f2, f3, f4] diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py new file mode 100644 index 0000000000..a73581890c --- /dev/null +++ b/paddleseg/models/layers/vit_adapter_layers.py @@ -0,0 +1,462 @@ +# The ViT-Adapter code was heavily based on https://github.com/czczup/ViT-Adapter + +from functools import partial + +import paddle +import paddle.nn as nn +from paddleseg.models.backbones.transformer_utils import DropPath + + +def get_reference_points(spatial_shapes): + reference_points_list = [] + for _, (H_, W_) in enumerate(spatial_shapes): + ref_y, ref_x = paddle.meshgrid( + paddle.linspace( + 0.5, H_ - 0.5, H_, dtype='float32'), + paddle.linspace( + 0.5, W_ - 0.5, W_, dtype='float32')) + ref_y = ref_y.reshape([1, -1]) / H_ + ref_x = ref_x.reshape([1, -1]) / W_ + ref = paddle.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = paddle.concat(reference_points_list, 1) + reference_points = paddle.unsqueeze(reference_points, axis=2) + return reference_points + + +def deform_inputs(x): + bs, c, h, w = x.shape + spatial_shapes = paddle.to_tensor( + [(h // 8, w // 8), (h // 16, w // 16), (h // 32, w // 32)], + dtype='int64') + level_start_index = paddle.concat((paddle.zeros( + (1, ), dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = get_reference_points([(h // 16, w // 16)]) + deform_inputs1 = [reference_points, spatial_shapes, level_start_index] + + spatial_shapes = paddle.to_tensor([(h // 16, w // 16)], dtype='int64') + level_start_index = paddle.concat((paddle.zeros( + (1, ), dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = get_reference_points( + [(h // 8, w // 8), (h // 16, w // 16), (h // 32, w // 32)]) + deform_inputs2 = [reference_points, spatial_shapes, level_start_index] + + return deform_inputs1, deform_inputs2 + + +class ConvFFN(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + 
hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class DWConv(nn.Layer): + def __init__(self, dim=768): + super().__init__() + self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + n = N // 21 + x1 = x[:, 0:16 * n, :].transpose([0, 2, 1]).reshape( + [B, C, H * 2, W * 2]) + x2 = x[:, 16 * n:20 * n, :].transpose([0, 2, 1]).reshape([B, C, H, W]) + x3 = x[:, 20 * n:, :].transpose([0, 2, 1]).reshape( + [B, C, H // 2, W // 2]) + x1 = self.dwconv(x1).flatten(2).transpose([0, 2, 1]) + x2 = self.dwconv(x2).flatten(2).transpose([0, 2, 1]) + x3 = self.dwconv(x3).flatten(2).transpose([0, 2, 1]) + x = paddle.concat([x1, x2, x3], axis=1) + return x + + +class Extractor(nn.Layer): + def __init__(self, + dim, + num_heads=6, + n_points=4, + n_levels=1, + deform_ratio=1.0, + with_cffn=True, + cffn_ratio=0.25, + drop=0., + drop_path=0., + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6)): + super().__init__() + self.query_norm = norm_layer(dim) + self.feat_norm = norm_layer(dim) + ''' + self.attn = MSDeformAttn(d_model=dim, n_levels=n_levels, n_heads=num_heads, + n_points=n_points, ratio=deform_ratio) + ''' + self.with_cffn = with_cffn + if with_cffn: + self.ffn = ConvFFN( + in_features=dim, + hidden_features=int(dim * cffn_ratio), + drop=drop) + self.ffn_norm = norm_layer(dim) + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + + def forward(self, query, reference_points, feat, spatial_shapes, + level_start_index, H, W): + def _inner_forward(query, feat): + ''' + attn = self.attn(self.query_norm(query), reference_points, + self.feat_norm(feat), spatial_shapes, + level_start_index, None) + query = query + attn + ''' + + if self.with_cffn: + query = query + self.drop_path( + self.ffn(self.ffn_norm(query), H, W)) + return query + + query = _inner_forward(query, feat) + + return query + + +class Injector(nn.Layer): + def __init__(self, + dim, + num_heads=6, + n_points=4, + n_levels=1, + deform_ratio=1.0, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + init_values=0.): + super().__init__() + self.query_norm = norm_layer(dim) + self.feat_norm = norm_layer(dim) + ''' + self.attn = MSDeformAttn(d_model=dim, n_levels=n_levels, n_heads=num_heads, + n_points=n_points, ratio=deform_ratio) + ''' + self.gamma = self.create_parameter( + shape=(dim, ), + default_initializer=paddle.nn.initializer.Constant( + value=init_values)) + + def forward(self, query, reference_points, feat, spatial_shapes, + level_start_index): + def _inner_forward(query, feat): + ''' + attn = self.attn(self.query_norm(query), reference_points, + self.feat_norm(feat), spatial_shapes, + level_start_index, None) + return query + self.gamma * attn + ''' + return query + + query = _inner_forward(query, feat) + + return query + + +class InteractionBlock(nn.Layer): + def __init__(self, + dim, + num_heads=6, + n_points=4, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + drop=0., + drop_path=0., + with_cffn=True, + cffn_ratio=0.25, + init_values=0., + deform_ratio=1.0, + extra_extractor=False): + super().__init__() + + self.injector = Injector( + dim=dim, + n_levels=3, + num_heads=num_heads, + init_values=init_values, + n_points=n_points, + norm_layer=norm_layer, + deform_ratio=deform_ratio) + self.extractor = Extractor( + dim=dim, + n_levels=1, + num_heads=num_heads, + n_points=n_points, + norm_layer=norm_layer, + deform_ratio=deform_ratio, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + drop=drop, + drop_path=drop_path) + if extra_extractor: + self.extra_extractors = nn.Sequential(*[ + Extractor( + dim=dim, + num_heads=num_heads, + n_points=n_points, + norm_layer=norm_layer, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + deform_ratio=deform_ratio, + drop=drop, + drop_path=drop_path) for _ in range(2) + ]) + else: + self.extra_extractors = None + + def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W): + debug = False + x = self.injector( + query=x, + reference_points=deform_inputs1[0], + feat=c, + spatial_shapes=deform_inputs1[1], + level_start_index=deform_inputs1[2]) + if debug: + print('x', x.cpu().numpy().mean()) + + for idx, blk in enumerate(blocks): + x = blk(x, H, W) + if debug: + print('x block_{}'.format(idx), x.cpu().numpy().mean()) + + c = self.extractor( + query=c, + reference_points=deform_inputs2[0], + feat=x, + spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], + H=H, + W=W) + if debug: + print('c', c.cpu().numpy().mean()) + + if self.extra_extractors is not None: + for extractor in self.extra_extractors: + c = extractor( + query=c, + reference_points=deform_inputs2[0], + feat=x, + spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], + H=H, + W=W) + if debug: + print('c', c.cpu().numpy().mean()) + + return x, c + + +class InteractionBlockWithCls(nn.Layer): + def __init__(self, + dim, + num_heads=6, + n_points=4, + norm_layer=partial( + nn.LayerNorm, 
epsilon=1e-6),
+                 drop=0.,
+                 drop_path=0.,
+                 with_cffn=True,
+                 cffn_ratio=0.25,
+                 init_values=0.,
+                 deform_ratio=1.0,
+                 extra_extractor=False):
+        super().__init__()
+
+        self.injector = Injector(
+            dim=dim,
+            n_levels=3,
+            num_heads=num_heads,
+            init_values=init_values,
+            n_points=n_points,
+            norm_layer=norm_layer,
+            deform_ratio=deform_ratio)
+        self.extractor = Extractor(
+            dim=dim,
+            n_levels=1,
+            num_heads=num_heads,
+            n_points=n_points,
+            norm_layer=norm_layer,
+            deform_ratio=deform_ratio,
+            with_cffn=with_cffn,
+            cffn_ratio=cffn_ratio,
+            drop=drop,
+            drop_path=drop_path)
+        if extra_extractor:
+            self.extra_extractors = nn.Sequential(*[
+                Extractor(
+                    dim=dim,
+                    num_heads=num_heads,
+                    n_points=n_points,
+                    norm_layer=norm_layer,
+                    with_cffn=with_cffn,
+                    cffn_ratio=cffn_ratio,
+                    deform_ratio=deform_ratio,
+                    drop=drop,
+                    drop_path=drop_path) for _ in range(2)
+            ])
+        else:
+            self.extra_extractors = None
+
+    def forward(self, x, c, cls, blocks, deform_inputs1, deform_inputs2, H, W):
+        x = self.injector(
+            query=x,
+            reference_points=deform_inputs1[0],
+            feat=c,
+            spatial_shapes=deform_inputs1[1],
+            level_start_index=deform_inputs1[2])
+        x = paddle.concat((cls, x), axis=1)
+        for idx, blk in enumerate(blocks):
+            x = blk(x, H, W)
+        cls, x = x[:, :1, ], x[:, 1:, ]
+        c = self.extractor(
+            query=c,
+            reference_points=deform_inputs2[0],
+            feat=x,
+            spatial_shapes=deform_inputs2[1],
+            level_start_index=deform_inputs2[2],
+            H=H,
+            W=W)
+        if self.extra_extractors is not None:
+            for extractor in self.extra_extractors:
+                c = extractor(
+                    query=c,
+                    reference_points=deform_inputs2[0],
+                    feat=x,
+                    spatial_shapes=deform_inputs2[1],
+                    level_start_index=deform_inputs2[2],
+                    H=H,
+                    W=W)
+        return x, c, cls
+
+
+class SpatialPriorModule(nn.Layer):
+    def __init__(self, inplanes=64, embed_dim=384):
+        super().__init__()
+
+        self.stem = nn.Sequential(*[
+            nn.Conv2D(
+                3,
+                inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias_attr=False), nn.SyncBatchNorm(inplanes), nn.ReLU(),
+            nn.Conv2D(
+                inplanes,
+                inplanes,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias_attr=False), nn.SyncBatchNorm(inplanes), nn.ReLU(),
+            nn.Conv2D(
+                inplanes,
+                inplanes,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias_attr=False), nn.SyncBatchNorm(inplanes), nn.ReLU(),
+            nn.MaxPool2D(
+                kernel_size=3, stride=2, padding=1)
+        ])
+        self.conv2 = nn.Sequential(*[
+            nn.Conv2D(
+                inplanes,
+                2 * inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias_attr=False), nn.SyncBatchNorm(2 * inplanes), nn.ReLU()
+        ])
+        self.conv3 = nn.Sequential(*[
+            nn.Conv2D(
+                2 * inplanes,
+                4 * inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias_attr=False), nn.SyncBatchNorm(4 * inplanes), nn.ReLU()
+        ])
+        self.conv4 = nn.Sequential(*[
+            nn.Conv2D(
+                4 * inplanes,
+                4 * inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias_attr=False), nn.SyncBatchNorm(4 * inplanes), nn.ReLU()
+        ])
+        self.fc1 = nn.Conv2D(
+            inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias_attr=True)
+        self.fc2 = nn.Conv2D(
+            2 * inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias_attr=True)
+        self.fc3 = nn.Conv2D(
+            4 * inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias_attr=True)
+        self.fc4 = nn.Conv2D(
+            4 * inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias_attr=True)
+
+    def forward(self, x):
+        def _inner_forward(x):
+            c1 = self.stem(x)
+            c2 = self.conv2(c1)
+            c3 = self.conv3(c2)
+            c4 = self.conv4(c3)
+            c1 = self.fc1(c1)
+            c2 = self.fc2(c2)
+            c3 = self.fc3(c3)
+            c4 = self.fc4(c4)
+
+            bs, dim, _, 
_ = c1.shape + c2 = c2.reshape([bs, dim, -1]).transpose([0, 2, 1]) # 8s + c3 = c3.reshape([bs, dim, -1]).transpose([0, 2, 1]) # 16s + c4 = c4.reshape([bs, dim, -1]).transpose([0, 2, 1]) # 32s + + return c1, c2, c3, c4 + + outs = _inner_forward(x) + return outs From 3fee1ed6c8f91244a32fb9363f8d8d8b23aa62d8 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 14 Nov 2022 15:24:24 +0800 Subject: [PATCH 02/16] align head infer forward --- ...rnet_deit_adapter_tiny_512_160k_ade20k.yml | 4 +- paddleseg/models/__init__.py | 1 + paddleseg/models/backbones/vit_adapter.py | 4 +- paddleseg/models/upernet_vit_adapter.py | 297 ++++++++++++++++++ 4 files changed, 303 insertions(+), 3 deletions(-) create mode 100644 paddleseg/models/upernet_vit_adapter.py diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml index 32bfe32140..b1d4ac74b3 100644 --- a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml +++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml @@ -59,7 +59,7 @@ loss: coef: [1] model: - type: TopFormer + type: UPerNetViTAdapter backbone: type: ViTAdapter num_heads: 3 @@ -74,4 +74,6 @@ model: cffn_ratio: 0.25 deform_ratio: 1.0 interaction_indexes: [[0, 2], [3, 5], [6, 8], [9, 11]] + backbone_indices: [0, 1, 2, 3] + aux_loss: True pretrained: pretrained_model/upernet_deit_adapter_tiny_512_160_ade20k_from_torch.pdparams \ No newline at end of file diff --git a/paddleseg/models/__init__.py b/paddleseg/models/__init__.py index 1943bc5c86..bd9c3034ba 100644 --- a/paddleseg/models/__init__.py +++ b/paddleseg/models/__init__.py @@ -66,3 +66,4 @@ from .mscale_ocrnet import MscaleOCRNet from .topformer import TopFormer from .rtformer import RTFormer +from .upernet_vit_adapter import UPerNetViTAdapter diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index 3e4269423b..fcdc0cf757 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -301,6 +301,7 @@ def __init__(self, self.interaction_indexes = interaction_indexes self.add_vit_feature = add_vit_feature embed_dim = self.embed_dim + self.feat_channels = [embed_dim] * 4 self.level_embed = self.create_parameter( shape=(3, embed_dim), @@ -418,7 +419,7 @@ def forward(self, x): print(x.numpy().mean()) print(c.numpy().mean()) -# Split & Reshape + # Split & Reshape c2 = c[:, 0:c2.shape[1], :] c3 = c[:, c2.shape[1]:c2.shape[1] + c3.shape[1], :] c4 = c[:, c2.shape[1] + c3.shape[1]:, :] @@ -454,5 +455,4 @@ def forward(self, x): #assert np.allclose(f1, -0.03254774, rtol=0.0, atol=1e-6) # without msdeformatt #assert np.allclose(f1, -0.024487903, rtol=0.0, atol=1e-6) - exit() return [f1, f2, f3, f4] diff --git a/paddleseg/models/upernet_vit_adapter.py b/paddleseg/models/upernet_vit_adapter.py new file mode 100644 index 0000000000..117dbdf426 --- /dev/null +++ b/paddleseg/models/upernet_vit_adapter.py @@ -0,0 +1,297 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+
+
+@manager.MODELS.add_component
+class UPerNetViTAdapter(nn.Layer):
+    """
+    The UPerNet implementation based on PaddlePaddle.
+
+    The original article refers to
+    Tete Xiao, et al. "Unified Perceptual Parsing for Scene Understanding"
+    (https://arxiv.org/abs/1807.10221).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently supporting ViTAdapter.
+        backbone_indices (tuple): Four values in the tuple indicate the indices of output of backbone.
+        channels (int): The channels of intermediate layers in the head. Default: 512.
+        pool_scales (list, optional): The pooling scales of the PPM module. Default: [1, 2, 3, 6].
+        dropout_ratio (float): The dropout ratio of the upernet head. Default: 0.1.
+        aux_loss (bool, optional): Whether to add an auxiliary segmentation head. Default: True.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices,
+                 channels=512,
+                 pool_scales=[1, 2, 3, 6],
+                 dropout_ratio=0.1,
+                 aux_loss=True,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        self.align_corners = align_corners
+
+        in_channels = [self.backbone.feat_channels[i] for i in backbone_indices]
+        self.head = UPerNetHead(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            channels=channels,
+            pool_scales=pool_scales,
+            dropout_ratio=dropout_ratio,
+            aux_loss=aux_loss,
+            align_corners=align_corners)
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        feats = self.backbone(x)
+        feats = [feats[i] for i in self.backbone_indices]
+        logit_list = self.head(feats)
+        logit_list = [
+            F.interpolate(
+                logit,
+                paddle.shape(x)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+        return logit_list
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class ConvBNReLU(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 bias_attr=False,
+                 **kwargs):
+        super().__init__()
+        self.conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size,
+            bias_attr=bias_attr,
+            **kwargs)
+        self.bn = nn.BatchNorm2D(out_channels)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        return x
+
+
+class PPM(nn.Layer):
+    """Pooling Pyramid Module used in PSPNet.
+
+    Args:
+        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
+            Module.
+        in_channels (int): Input channels.
+        channels (int): Channels after modules, before conv_seg.
+        align_corners (bool): align_corners argument of F.interpolate.
+    
+ """ + + def __init__(self, pool_scales, in_channels, channels, align_corners): + super().__init__() + self.pool_scales = pool_scales + self.in_channels = in_channels + self.channels = channels + self.align_corners = align_corners + self.stages = nn.LayerList() + for pool_scale in pool_scales: + self.stages.append( + nn.Sequential( + nn.AdaptiveAvgPool2D(output_size=(pool_scale, pool_scale)), + ConvBNReLU( + in_channels=in_channels, + out_channels=channels, + kernel_size=1))) + + def forward(self, x): + """Forward function.""" + ppm_outs = [] + for ppm in self.stages: + ppm_out = ppm(x) + upsampled_ppm_out = F.interpolate( + ppm_out, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) + ppm_outs.append(upsampled_ppm_out) + return ppm_outs + + +class UPerNetHead(nn.Layer): + """Unified Perceptual Parsing for Scene Understanding. + + This head is the implementation of `UPerNet + `_. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module applied on the last feature. Default: (1, 2, 3, 6). + """ + + def __init__(self, + num_classes, + in_channels, + channels, + pool_scales=[1, 2, 3, 6], + dropout_ratio=0.1, + aux_loss=False, + aux_channels=256, + align_corners=False): + super().__init__() + self.align_corners = align_corners + + # PSP Module + self.psp_modules = PPM(pool_scales, + in_channels[-1], + channels, + align_corners=align_corners) + self.bottleneck = ConvBNReLU( + in_channels[-1] + len(pool_scales) * channels, + channels, + 3, + padding=1) + # FPN Module + self.lateral_convs = nn.LayerList() + self.fpn_convs = nn.LayerList() + for ch in in_channels[:-1]: # skip the top layer + l_conv = ConvBNReLU(ch, channels, 1) + fpn_conv = ConvBNReLU(channels, channels, 3, padding=1) + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + self.fpn_bottleneck = ConvBNReLU( + len(in_channels) * channels, channels, 3, padding=1) + + if dropout_ratio > 0: + self.dropout = nn.Dropout2D(dropout_ratio) + else: + self.dropout = None + self.conv_seg = nn.Conv2D(channels, num_classes, kernel_size=1) + + self.aux_loss = aux_loss + if self.aux_loss: + self.aux_conv = ConvBNReLU( + in_channels[2], aux_channels, 3, padding=1) + self.aux_conv_seg = nn.Conv2D( + aux_channels, num_classes, kernel_size=1) + + def psp_forward(self, inputs): + """Forward function of PSP module.""" + x = inputs[-1] + psp_outs = [x] + psp_outs.extend(self.psp_modules(x)) + psp_outs = paddle.concat(psp_outs, axis=1) + output = self.bottleneck(psp_outs) + return output + + def forward(self, inputs): + """Forward function.""" + debug = True + + if debug: + print('-------head 1----') + for x in inputs: + print(x.shape, x.numpy().mean()) + + # build laterals + laterals = [ + lateral_conv(inputs[i]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + laterals.append(self.psp_forward(inputs)) + + if debug: + print('-------head 2----') + for x in laterals: + print(x.shape, x.numpy().mean()) + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + upsampled = F.interpolate( + laterals[i], + paddle.shape(laterals[i - 1])[2:], + mode='bilinear', + align_corners=self.align_corners) + laterals[i - 1] = laterals[i - 1] + upsampled + + # build outputs + fpn_outs = [ + self.fpn_convs[i](laterals[i]) + for i in range(used_backbone_levels - 1) + ] + fpn_outs.append(laterals[-1]) # append psp feature + + if debug: + print('-------head 3----') + for x in fpn_outs: + print(x.shape, x.numpy().mean()) + + for i in 
range(used_backbone_levels - 1, 0, -1): + fpn_outs[i] = F.interpolate( + fpn_outs[i], + size=paddle.shape(fpn_outs[0])[2:], + mode='bilinear', + align_corners=self.align_corners) + fpn_outs = paddle.concat(fpn_outs, axis=1) + output = self.fpn_bottleneck(fpn_outs) + + if debug: + print('-------head 4----') + print(output.shape, output.numpy().mean()) + + if self.dropout is not None: + output = self.dropout(output) + output = self.conv_seg(output) + logits_list = [output] + + if self.aux_loss: + aux_output = self.aux_conv(inputs[2]) + aux_output = self.aux_conv_seg(aux_output) + logits_list.append(aux_output) + + if debug: + print('-------head 5----') + for x in logits_list: + print(x.shape, x.numpy().mean()) + exit() + return output From 79e597ac4c3dbe91f7a5d2f6e56632a44e2926f6 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Tue, 15 Nov 2022 15:11:01 +0800 Subject: [PATCH 03/16] aling model infer forward with ms_deform_attn --- paddleseg/models/backbones/vit_adapter.py | 9 +- paddleseg/models/layers/vit_adapter_layers.py | 181 ++++++++++++++++-- paddleseg/models/upernet_vit_adapter.py | 2 + 3 files changed, 167 insertions(+), 25 deletions(-) diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index fcdc0cf757..c002467b61 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -1,4 +1,4 @@ -# The ViT-Adapter code was heavily based on https://github.com/czczup/ViT-Adapter +# This is heavily based on https://github.com/czczup/ViT-Adapter import math from functools import partial @@ -18,7 +18,8 @@ from paddleseg.utils import utils, logger from paddleseg.models.backbones.transformer_utils import to_2tuple, DropPath -from paddleseg.models.layers.vit_adapter_layers import SpatialPriorModule, InteractionBlock, deform_inputs +from paddleseg.models.layers.vit_adapter_layers import ( + SpatialPriorModule, InteractionBlock, deform_inputs, MSDeformAttn) __all__ = ['ViTAdapter'] @@ -360,11 +361,8 @@ def _get_pos_embed(self, pos_embed, H, W): return pos_embed def _init_deform_weights(self, m): - ''' if isinstance(m, MSDeformAttn): m._reset_parameters() - ''' - pass def _add_level_embed(self, c2, c3, c4): c2 = c2 + self.level_embed[0] @@ -450,6 +448,7 @@ def forward(self, x): print(f2.cpu().numpy().mean()) print(f3.cpu().numpy().mean()) print(f4.cpu().numpy().mean()) + exit() # f1 = f1.cpu().numpy().mean() # with msdeformatt #assert np.allclose(f1, -0.03254774, rtol=0.0, atol=1e-6) diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py index a73581890c..3f596bf7e9 100644 --- a/paddleseg/models/layers/vit_adapter_layers.py +++ b/paddleseg/models/layers/vit_adapter_layers.py @@ -1,10 +1,16 @@ -# The ViT-Adapter code was heavily based on https://github.com/czczup/ViT-Adapter +# This is heavily based on https://github.com/czczup/ViT-Adapter +import math +import warnings from functools import partial import paddle import paddle.nn as nn +import paddle.nn.functional as F from paddleseg.models.backbones.transformer_utils import DropPath +from paddleseg.cvlibs.param_init import constant_init, xavier_uniform + +import ms_deform_attn as msda # first install ms_deform_attn def get_reference_points(spatial_shapes): @@ -44,6 +50,13 @@ def deform_inputs(x): return deform_inputs1, deform_inputs2 +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError('invalid input for _is_power_of_2: {} (type: {})'. 
+                         format(n, type(n)))
+    return (n & (n - 1) == 0) and n != 0
+
+
 class ConvFFN(nn.Layer):
     def __init__(self,
                  in_features,
@@ -90,6 +103,135 @@ def forward(self, x, H, W):
         return x
 
 
+class MSDeformAttn(nn.Layer):
+    def __init__(self,
+                 d_model=256,
+                 n_levels=4,
+                 n_heads=8,
+                 n_points=4,
+                 ratio=1.0):
+        """Multi-Scale Deformable Attention Module.
+
+        :param d_model hidden dimension
+        :param n_levels number of feature levels
+        :param n_heads number of attention heads
+        :param n_points number of sampling points per attention head per feature level
+        """
+        super().__init__()
+        if d_model % n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads, '
+                             'but got {} and {}'.format(d_model, n_heads))
+        _d_per_head = d_model // n_heads
+        # you'd better set _d_per_head to a power of 2
+        # which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_head):
+            warnings.warn("You'd better set d_model in MSDeformAttn to make "
+                          'the dimension of each attention head a power of 2 '
+                          'which is more efficient in our CUDA implementation.')
+
+        self.im2col_step = 64
+
+        self.d_model = d_model
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+        self.ratio = ratio
+        self.sampling_offsets = nn.Linear(d_model,
+                                          n_heads * n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(d_model,
+                                           n_heads * n_levels * n_points)
+        self.value_proj = nn.Linear(d_model, int(d_model * ratio))
+        self.output_proj = nn.Linear(int(d_model * ratio), d_model)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        constant_init(self.sampling_offsets.weight, value=0.)
+        thetas = paddle.arange(
+            self.n_heads, dtype='float32') * (2.0 * math.pi / self.n_heads)
+        grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init / grid_init.abs().max(
+            -1, keepdim=True)).reshape([self.n_heads, 1, 1, 2]).tile(
+                [1, self.n_levels, self.n_points, 1])
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1
+
+        with paddle.no_grad():
+            grid_init = grid_init.reshape([-1])
+            self.sampling_offsets.bias = self.create_parameter(
+                shape=grid_init.shape,
+                default_initializer=paddle.nn.initializer.Assign(grid_init))
+
+        constant_init(self.attention_weights.weight, value=0.)
+        constant_init(self.attention_weights.bias, value=0.)
+        xavier_uniform(self.value_proj.weight)
+        constant_init(self.value_proj.bias, value=0.)
+        xavier_uniform(self.output_proj.weight)
+        constant_init(self.output_proj.bias, value=0.)
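+        # Net effect of the init above: attention logits start at zero (so the
+        # softmax weights are uniform), sampling offsets have zero weight, and
+        # each head's offset bias points along its own angle (2*pi*h/n_heads)
+        # with the radius growing with the point index, i.e. the first forward
+        # pass samples a small ring of points around each reference point.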
+ + def forward(self, + query, + reference_points, + input_flatten, + input_spatial_shapes, + input_level_start_index, + input_padding_mask=None): + """ + :param query (N, Length_{query}, C) + :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area + or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes + :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) + :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] + :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements + + :return output (N, Length_{query}, C) + """ + + def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + N, Len_q, _ = query.shape + N, Len_in, _ = input_flatten.shape + assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1] + ).sum() == Len_in + + value = self.value_proj(input_flatten) + if input_padding_mask is not None: + value = masked_fill(value, input_padding_mask[..., None], float(0)) + + value = value.reshape([ + N, Len_in, self.n_heads, + int(self.ratio * self.d_model) // self.n_heads + ]) + sampling_offsets = self.sampling_offsets(query).reshape( + [N, Len_q, self.n_heads, self.n_levels, self.n_points, 2]) + attention_weights = self.attention_weights(query).reshape( + [N, Len_q, self.n_heads, self.n_levels * self.n_points]) + attention_weights = F.softmax(attention_weights, -1).\ + reshape([N, Len_q, self.n_heads, self.n_levels, self.n_points]) + + if reference_points.shape[-1] == 2: + offset_normalizer = paddle.stack( + [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], + -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + else: + raise ValueError( + 'Last dim of reference_points must be 2 or 4, but get {} instead.' 
+ .format(reference_points.shape[-1])) + output = msda.ms_deform_attn( + value, input_spatial_shapes, input_level_start_index, + sampling_locations, attention_weights, self.im2col_step) + output = self.output_proj(output) + return output + + class Extractor(nn.Layer): def __init__(self, dim, @@ -106,10 +248,12 @@ def __init__(self, super().__init__() self.query_norm = norm_layer(dim) self.feat_norm = norm_layer(dim) - ''' - self.attn = MSDeformAttn(d_model=dim, n_levels=n_levels, n_heads=num_heads, - n_points=n_points, ratio=deform_ratio) - ''' + self.attn = MSDeformAttn( + d_model=dim, + n_levels=n_levels, + n_heads=num_heads, + n_points=n_points, + ratio=deform_ratio) self.with_cffn = with_cffn if with_cffn: self.ffn = ConvFFN( @@ -123,12 +267,10 @@ def __init__(self, def forward(self, query, reference_points, feat, spatial_shapes, level_start_index, H, W): def _inner_forward(query, feat): - ''' - attn = self.attn(self.query_norm(query), reference_points, - self.feat_norm(feat), spatial_shapes, - level_start_index, None) + attn = self.attn( + self.query_norm(query), reference_points, + self.feat_norm(feat), spatial_shapes, level_start_index, None) query = query + attn - ''' if self.with_cffn: query = query + self.drop_path( @@ -153,10 +295,12 @@ def __init__(self, super().__init__() self.query_norm = norm_layer(dim) self.feat_norm = norm_layer(dim) - ''' - self.attn = MSDeformAttn(d_model=dim, n_levels=n_levels, n_heads=num_heads, - n_points=n_points, ratio=deform_ratio) - ''' + self.attn = MSDeformAttn( + d_model=dim, + n_levels=n_levels, + n_heads=num_heads, + n_points=n_points, + ratio=deform_ratio) self.gamma = self.create_parameter( shape=(dim, ), default_initializer=paddle.nn.initializer.Constant( @@ -165,13 +309,10 @@ def __init__(self, def forward(self, query, reference_points, feat, spatial_shapes, level_start_index): def _inner_forward(query, feat): - ''' - attn = self.attn(self.query_norm(query), reference_points, - self.feat_norm(feat), spatial_shapes, - level_start_index, None) + attn = self.attn( + self.query_norm(query), reference_points, + self.feat_norm(feat), spatial_shapes, level_start_index, None) return query + self.gamma * attn - ''' - return query query = _inner_forward(query, feat) diff --git a/paddleseg/models/upernet_vit_adapter.py b/paddleseg/models/upernet_vit_adapter.py index 117dbdf426..ca6336dd1a 100644 --- a/paddleseg/models/upernet_vit_adapter.py +++ b/paddleseg/models/upernet_vit_adapter.py @@ -162,6 +162,8 @@ class UPerNetHead(nn.Layer): This head is the implementation of `UPerNet `_. + This is heavily based on https://github.com/czczup/ViT-Adapter + Args: pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid Module applied on the last feature. Default: (1, 2, 3, 6). 
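A note for readers of PATCH 03: `msda.ms_deform_attn` is a separately compiled CUDA custom op. If you want to trace the numbers without building it, the sampling it performs can be reproduced in pure Paddle with `F.grid_sample`. The sketch below is a naive reference only, not the optimized kernel; the function name `ms_deform_attn_naive` and the list-typed `spatial_shapes` are illustrative assumptions, and the shapes follow the `MSDeformAttn.forward` docstring:

import paddle
import paddle.nn.functional as F


def ms_deform_attn_naive(value, spatial_shapes, sampling_locations,
                         attention_weights):
    # value:              (N, Len_in, n_heads, d_head)
    # spatial_shapes:     Python list of (H_l, W_l), sum(H_l * W_l) == Len_in
    # sampling_locations: (N, Len_q, n_heads, n_levels, n_points, 2), in [0, 1]
    # attention_weights:  (N, Len_q, n_heads, n_levels, n_points), softmaxed
    N, _, n_heads, d_head = value.shape
    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape
    value_list = paddle.split(value, [h * w for h, w in spatial_shapes], axis=1)
    grids = 2 * sampling_locations - 1  # grid_sample expects coords in [-1, 1]
    sampled = []
    for lvl, (h, w) in enumerate(spatial_shapes):
        # (N, H*W, n_heads, d_head) -> (N * n_heads, d_head, H, W)
        v = value_list[lvl].transpose([0, 2, 3, 1]).reshape(
            [N * n_heads, d_head, h, w])
        # (N, Len_q, n_heads, n_points, 2) -> (N * n_heads, Len_q, n_points, 2)
        g = grids[:, :, :, lvl].transpose([0, 2, 1, 3, 4]).reshape(
            [N * n_heads, Len_q, n_points, 2])
        # bilinear read of n_points locations per query:
        # output is (N * n_heads, d_head, Len_q, n_points)
        sampled.append(
            F.grid_sample(
                v, g, mode='bilinear', padding_mode='zeros',
                align_corners=False))
    sampled = paddle.concat(sampled, axis=-1)  # (..., Len_q, n_levels*n_points)
    attn = attention_weights.transpose([0, 2, 1, 3, 4]).reshape(
        [N * n_heads, 1, Len_q, n_levels * n_points])
    out = (sampled * attn).sum(-1)  # weighted sum over every sampled point
    return out.reshape([N, n_heads * d_head, Len_q]).transpose([0, 2, 1])

Comparing this against the custom op on small random tensors is a quick way to validate a local build before running the alignment checks above.
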
From fe4608673c797969a2213eaf1e8443819ae40208 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Tue, 15 Nov 2022 17:27:10 +0800 Subject: [PATCH 04/16] align ade20k inference 1115 --- ...upernet_deit_adapter_tiny_512_160k_ade20k.yml | 16 +++++++++++++--- paddleseg/core/val.py | 1 + paddleseg/models/backbones/vit_adapter.py | 7 ++----- paddleseg/models/upernet_vit_adapter.py | 11 +++++++---- 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml index b1d4ac74b3..fcdcd0f32c 100644 --- a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml +++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml @@ -1,6 +1,6 @@ _base_: '../_base_/ade20k.yml' -batch_size: 4 # total batch size is 16 +batch_size: 2 # total batch size is 16 iters: 160000 train_dataset: @@ -30,6 +30,11 @@ val_dataset: mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] +test_config: + is_slide: True + crop_size: [512, 512] + stride: [341, 341] + export: transforms: - type: Resize @@ -47,7 +52,7 @@ optimizer: lr_scheduler: type: PolynomialDecay - learning_rate: 0.0012 + learning_rate: 1.2e-4 end_lr: 0 power: 1.0 warmup_iters: 1500 @@ -56,7 +61,8 @@ lr_scheduler: loss: types: - type: CrossEntropyLoss - coef: [1] + - type: CrossEntropyLoss + coef: [1, 0.4] model: type: UPerNetViTAdapter @@ -75,5 +81,9 @@ model: deform_ratio: 1.0 interaction_indexes: [[0, 2], [3, 5], [6, 8], [9, 11]] backbone_indices: [0, 1, 2, 3] + channels: 512 + pool_scales: [1, 2, 3, 6] + dropout_ratio: 0.1 aux_loss: True + aux_channels: 256 pretrained: pretrained_model/upernet_deit_adapter_tiny_512_160_ade20k_from_torch.pdparams \ No newline at end of file diff --git a/paddleseg/core/val.py b/paddleseg/core/val.py index 80a820b6bc..958946ab04 100644 --- a/paddleseg/core/val.py +++ b/paddleseg/core/val.py @@ -209,6 +209,7 @@ def evaluate(model, if local_rank == 0 and print_detail: progbar_val.update(iter + 1, [('batch_cost', batch_cost), ('reader cost', reader_cost)]) + print(total_iters, iter + 1) reader_cost_averager.reset() batch_cost_averager.reset() batch_start = time.time() diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index c002467b61..c30c5e35bd 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -371,7 +371,7 @@ def _add_level_embed(self, c2, c3, c4): return c2, c3, c4 def forward(self, x): - debug = True + debug = False if debug: import random import numpy as np @@ -448,10 +448,7 @@ def forward(self, x): print(f2.cpu().numpy().mean()) print(f3.cpu().numpy().mean()) print(f4.cpu().numpy().mean()) - exit() # f1 = f1.cpu().numpy().mean() # with msdeformatt - #assert np.allclose(f1, -0.03254774, rtol=0.0, atol=1e-6) - # without msdeformatt - #assert np.allclose(f1, -0.024487903, rtol=0.0, atol=1e-6) + #assert np.allclose(f1, -0.03252137, rtol=0.0, atol=1e-6) return [f1, f2, f3, f4] diff --git a/paddleseg/models/upernet_vit_adapter.py b/paddleseg/models/upernet_vit_adapter.py index ca6336dd1a..76f649a3c1 100644 --- a/paddleseg/models/upernet_vit_adapter.py +++ b/paddleseg/models/upernet_vit_adapter.py @@ -50,6 +50,7 @@ def __init__(self, pool_scales=[1, 2, 3, 6], dropout_ratio=0.1, aux_loss=True, + aux_channels=256, align_corners=False, pretrained=None): super().__init__() @@ -65,6 +66,7 @@ def __init__(self, pool_scales=pool_scales, dropout_ratio=dropout_ratio, 
aux_loss=aux_loss, + aux_channels=aux_channels, align_corners=align_corners) self.pretrained = pretrained @@ -227,8 +229,7 @@ def psp_forward(self, inputs): def forward(self, inputs): """Forward function.""" - debug = True - + debug = False if debug: print('-------head 1----') for x in inputs: @@ -286,7 +287,7 @@ def forward(self, inputs): output = self.conv_seg(output) logits_list = [output] - if self.aux_loss: + if self.aux_loss and self.training: aux_output = self.aux_conv(inputs[2]) aux_output = self.aux_conv_seg(aux_output) logits_list.append(aux_output) @@ -295,5 +296,7 @@ def forward(self, inputs): print('-------head 5----') for x in logits_list: print(x.shape, x.numpy().mean()) + # -20.250404 -15.875856 exit() - return output + + return logits_list From 7193096d387d9cafc9a04e9e4d30270201f89be5 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Wed, 16 Nov 2022 20:27:00 +0800 Subject: [PATCH 05/16] 1116 0 --- .../upernet_deit_adapter_tiny_512_160k_ade20k.yml | 3 ++- paddleseg/models/backbones/vit_adapter.py | 14 -------------- paddleseg/models/layers/vit_adapter_layers.py | 10 +++++----- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml index fcdcd0f32c..299efd0899 100644 --- a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml +++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml @@ -80,10 +80,11 @@ model: cffn_ratio: 0.25 deform_ratio: 1.0 interaction_indexes: [[0, 2], [3, 5], [6, 8], [9, 11]] + pretrained: pretrained_model/deit_tiny_patch16_224-a1311bcf_from_torch.pdparams backbone_indices: [0, 1, 2, 3] channels: 512 pool_scales: [1, 2, 3, 6] dropout_ratio: 0.1 aux_loss: True aux_channels: 256 - pretrained: pretrained_model/upernet_deit_adapter_tiny_512_160_ade20k_from_torch.pdparams \ No newline at end of file + #pretrained: pretrained_model/upernet_deit_adapter_tiny_512_160_ade20k_from_torch.pdparams \ No newline at end of file diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index c30c5e35bd..9178b25363 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -256,20 +256,6 @@ def __init__(self, def init_weight(self): utils.load_pretrained_model(self, self.pretrained) - def forward_features(self, x): - x, H, W = self.patch_embed(x) - cls_token = self.cls_token.expand( - x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks - x = paddle.concat([cls_tokens, x], axis=1) - x = self.pos_drop(x + self.pos_embed) - for blk in self.blocks: - x = blk(x, H, W) - return x - - def forward(self, x): - x = self.forward_features(x) - return x - @manager.BACKBONES.add_component class ViTAdapter(VisionTransformer): diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py index 3f596bf7e9..4848343027 100644 --- a/paddleseg/models/layers/vit_adapter_layers.py +++ b/paddleseg/models/layers/vit_adapter_layers.py @@ -156,11 +156,11 @@ def _reset_parameters(self): for i in range(self.n_points): grid_init[:, :, i, :] *= i + 1 - with paddle.no_grad(): - grid_init = grid_init.reshape([-1]) - self.sampling_offsets.bias = self.create_parameter( - shape=grid_init.shape, - default_initializer=paddle.nn.initializer.Assign(grid_init)) + grid_init = grid_init.reshape([-1]) + self.sampling_offsets.bias = self.create_parameter( + shape=grid_init.shape, + 
                default_initializer=paddle.nn.initializer.Assign(grid_init))
+        self.sampling_offsets.bias.stop_gradient = True

         constant_init(self.attention_weights.weight, value=0.)
         constant_init(self.attention_weights.bias, value=0.)

From 8263d01871b20d2da5709d8fd2869dd4cd36baf8 Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Tue, 22 Nov 2022 13:22:09 +0800
Subject: [PATCH 06/16] align ce loss by adding avg_non_ignore

---
 .../upernet_deit_adapter_tiny_512_160k_ade20k.yml |  2 +-
 paddleseg/models/backbones/vit_adapter.py         |  4 +++-
 paddleseg/models/losses/cross_entropy_loss.py     | 11 ++++++++---
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
index 299efd0899..332c2a39f0 100644
--- a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
+++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
@@ -61,7 +61,7 @@ lr_scheduler:
 loss:
   types:
     - type: CrossEntropyLoss
-    - type: CrossEntropyLoss
+      avg_non_ignore: False
   coef: [1, 0.4]

 model:
diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py
index 9178b25363..a993e742ac 100644
--- a/paddleseg/models/backbones/vit_adapter.py
+++ b/paddleseg/models/backbones/vit_adapter.py
@@ -358,7 +358,7 @@ def _add_level_embed(self, c2, c3, c4):

     def forward(self, x):
         debug = False
-        if debug:
+        if False:
             import random
             import numpy as np
             random.seed(0)
@@ -380,6 +380,8 @@ def forward(self, x):
                 print(i.numpy().mean())
             for i in deform_inputs2:
                 print(i.numpy().mean())
+            print(x.numpy().mean())
+            print(c.numpy().mean())

         # Patch Embedding forward
         x, H, W = self.patch_embed(x)
diff --git a/paddleseg/models/losses/cross_entropy_loss.py b/paddleseg/models/losses/cross_entropy_loss.py
index c934a0a5b4..b1cfb3a624 100644
--- a/paddleseg/models/losses/cross_entropy_loss.py
+++ b/paddleseg/models/losses/cross_entropy_loss.py
@@ -33,6 +33,7 @@ class CrossEntropyLoss(nn.Layer):
         top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0].
             When its value < 1.0, only compute the loss for the top k percent pixels
             (e.g., the top 20% pixels). This is useful for hard pixel mining. Default ``1.0``.
+        avg_non_ignore (bool, optional): Whether the loss is averaged only over the non-ignored pixels. Default: True.
         data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'. Default ``'NCHW'``.
     """

@@ -40,10 +41,12 @@ def __init__(self,
                  weight=None,
                  ignore_index=255,
                  top_k_percent_pixels=1.0,
+                 avg_non_ignore=True,
                  data_format='NCHW'):
         super(CrossEntropyLoss, self).__init__()
         self.ignore_index = ignore_index
         self.top_k_percent_pixels = top_k_percent_pixels
+        self.avg_non_ignore = avg_non_ignore
         self.EPS = 1e-8
         self.data_format = data_format
         if weight is not None:
@@ -107,10 +110,12 @@ def _post_process_loss(self, logit, label, semantic_weights, loss):
         Returns:
             (Tensor): The average loss.
         """
-        mask = label != self.ignore_index
-        mask = paddle.cast(mask, 'float32')
-        label.stop_gradient = True
+        if self.avg_non_ignore:
+            mask = paddle.cast(label != self.ignore_index, dtype='float32')
+        else:
+            mask = paddle.ones(label.shape, dtype='float32')
         mask.stop_gradient = True
+        label.stop_gradient = True
         if loss.ndim > mask.ndim:
             loss = paddle.squeeze(loss, axis=-1)

From d011c2bb942c718c28119429948151c499baef3b Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Tue, 22 Nov 2022 16:07:46 +0800
Subject: [PATCH 07/16] change yml for real train

---
 ...rnet_deit_adapter_tiny_512_160k_ade20k.yml |  2 +-
 paddleseg/core/train.py                       | 26 +++++++++++++++++++
 paddleseg/models/backbones/vit_adapter.py     | 11 --------
 paddleseg/models/upernet_vit_adapter.py       |  1 -
 4 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
index 332c2a39f0..b2a15c2004 100644
--- a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
+++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml
@@ -52,7 +52,7 @@ optimizer:

 lr_scheduler:
   type: PolynomialDecay
-  learning_rate: 1.2e-4
+  learning_rate: 6.0e-5 # this config previously used 1.2e-4, but the lr actually used in vit_adapter is 6e-5
   end_lr: 0
   power: 1.0
   warmup_iters: 1500
diff --git a/paddleseg/core/train.py b/paddleseg/core/train.py
index 0da04b01d4..01cd1cc054 100644
--- a/paddleseg/core/train.py
+++ b/paddleseg/core/train.py
@@ -173,6 +173,18 @@ def train(model,
             reader_cost_averager.record(time.time() - batch_start)
             images = data['img']
             labels = data['label'].astype('int64')
+
+            debug = False  # to debug, set this to True, change the yml to load pretrained weights, and set dropout to 0
+            if debug:
+                import numpy as np
+                images = paddle.to_tensor(np.load('img.npy'))
+                labels = paddle.to_tensor(np.load(
+                    'gt_semantic_seg.npy')).squeeze()
+                '''
+                print('img', images.detach().cpu().numpy().mean())
+                print('gt_semantic_seg', labels.detach().cpu().numpy().mean())
+                '''
+
             edges = None
             if 'edge' in data.keys():
                 edges = data['edge'].astype('int64')
@@ -211,6 +223,20 @@ def train(model,
                         losses=losses)
                 loss = sum(loss_list)
                 loss.backward()
+
+                if debug:
+                    print(loss_list)
+                    '''
+                    loss = sum(loss_list) * 1e3
+                    loss.backward()
+
+                    print(loss)
+                    for name, tensor in model.named_parameters():
+                        if tensor.grad is not None:
+                            print(name, tensor.grad.numpy().mean())
+                    exit()
+                    '''
+
                 # if the optimizer is ReduceOnPlateau, the loss is the one which has been passed into step.
if isinstance(optimizer, paddle.optimizer.lr.ReduceOnPlateau): optimizer.step(loss) diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index a993e742ac..a9824884d5 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -358,14 +358,6 @@ def _add_level_embed(self, c2, c3, c4): def forward(self, x): debug = False - if False: - import random - import numpy as np - random.seed(0) - np.random.seed(0) - x = np.random.rand(1, 3, 512, 512).astype("float32") - x = paddle.to_tensor(x, dtype='float32') - print('x0:', x.numpy().mean()) deform_inputs1, deform_inputs2 = deform_inputs(x) @@ -436,7 +428,4 @@ def forward(self, x): print(f2.cpu().numpy().mean()) print(f3.cpu().numpy().mean()) print(f4.cpu().numpy().mean()) - # f1 = f1.cpu().numpy().mean() - # with msdeformatt - #assert np.allclose(f1, -0.03252137, rtol=0.0, atol=1e-6) return [f1, f2, f3, f4] diff --git a/paddleseg/models/upernet_vit_adapter.py b/paddleseg/models/upernet_vit_adapter.py index 76f649a3c1..7e56eb71f4 100644 --- a/paddleseg/models/upernet_vit_adapter.py +++ b/paddleseg/models/upernet_vit_adapter.py @@ -296,7 +296,6 @@ def forward(self, inputs): print('-------head 5----') for x in logits_list: print(x.shape, x.numpy().mean()) - # -20.250404 -15.875856 exit() return logits_list From ca688f2cebd04a151f9f1363cda292068909d54d Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Fri, 25 Nov 2022 11:14:45 +0800 Subject: [PATCH 08/16] refine for merge --- .../upernet_deit_adapter_tiny_512_160k_ade20k.yml | 8 +++----- paddleseg/models/backbones/vit_adapter.py | 2 +- paddleseg/models/layers/vit_adapter_layers.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml index b2a15c2004..ab0f257346 100644 --- a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml +++ b/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml @@ -1,6 +1,6 @@ _base_: '../_base_/ade20k.yml' -batch_size: 2 # total batch size is 16 +batch_size: 4 # total batch size is 16 iters: 160000 train_dataset: @@ -8,7 +8,6 @@ train_dataset: - type: ResizeStepScaling min_scale_factor: 0.5 max_scale_factor: 2.0 - scale_step_size: 0.25 - type: RandomPaddingCrop crop_size: [512, 512] - type: RandomHorizontalFlip @@ -52,7 +51,7 @@ optimizer: lr_scheduler: type: PolynomialDecay - learning_rate: 6.0e-5 # the origin lr is 1.2e-6, but the real used lr in vit_adapter is 6e-5 + learning_rate: 6.0e-5 end_lr: 0 power: 1.0 warmup_iters: 1500 @@ -86,5 +85,4 @@ model: pool_scales: [1, 2, 3, 6] dropout_ratio: 0.1 aux_loss: True - aux_channels: 256 - #pretrained: pretrained_model/upernet_deit_adapter_tiny_512_160_ade20k_from_torch.pdparams \ No newline at end of file + aux_channels: 256 \ No newline at end of file diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index a9824884d5..48e4e374a6 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -1,4 +1,4 @@ -# This is heavily based on https://github.com/czczup/ViT-Adapter +# This file is heavily based on https://github.com/czczup/ViT-Adapter import math from functools import partial diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py index 4848343027..eaafc5c1d2 100644 --- 
a/paddleseg/models/layers/vit_adapter_layers.py
+++ b/paddleseg/models/layers/vit_adapter_layers.py
@@ -1,4 +1,4 @@
-# This is heavily based on https://github.com/czczup/ViT-Adapter
+# This file is heavily based on https://github.com/czczup/ViT-Adapter

 import math
 import warnings

From 466493bbe6e3ccc5188b8214f56f1594850f4b60 Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Fri, 25 Nov 2022 11:30:40 +0800
Subject: [PATCH 09/16] refine for merge 1

---
 paddleseg/core/train.py                       | 24 -------------------
 paddleseg/core/val.py                         |  1 -
 paddleseg/models/backbones/vit_adapter.py     |  7 +-----
 paddleseg/models/layers/vit_adapter_layers.py |  8 ++++++-
 4 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/paddleseg/core/train.py b/paddleseg/core/train.py
index 01cd1cc054..635edf9ffa 100644
--- a/paddleseg/core/train.py
+++ b/paddleseg/core/train.py
@@ -174,17 +174,6 @@ def train(model,
             images = data['img']
             labels = data['label'].astype('int64')

-            debug = False  # to debug, set this to True, change the yml to load pretrained weights, and set dropout to 0
-            if debug:
-                import numpy as np
-                images = paddle.to_tensor(np.load('img.npy'))
-                labels = paddle.to_tensor(np.load(
-                    'gt_semantic_seg.npy')).squeeze()
-                '''
-                print('img', images.detach().cpu().numpy().mean())
-                print('gt_semantic_seg', labels.detach().cpu().numpy().mean())
-                '''
-
             edges = None
             if 'edge' in data.keys():
                 edges = data['edge'].astype('int64')
@@ -224,19 +213,6 @@ def train(model,
                 loss = sum(loss_list)
                 loss.backward()

-                if debug:
-                    print(loss_list)
-                    '''
-                    loss = sum(loss_list) * 1e3
-                    loss.backward()
-
-                    print(loss)
-                    for name, tensor in model.named_parameters():
-                        if tensor.grad is not None:
-                            print(name, tensor.grad.numpy().mean())
-                    exit()
-                    '''
-
                 # if the optimizer is ReduceOnPlateau, the loss is the one which has been passed into step.
                 if isinstance(optimizer, paddle.optimizer.lr.ReduceOnPlateau):
                     optimizer.step(loss)
diff --git a/paddleseg/core/val.py b/paddleseg/core/val.py
index 958946ab04..80a820b6bc 100644
--- a/paddleseg/core/val.py
+++ b/paddleseg/core/val.py
@@ -209,7 +209,6 @@ def evaluate(model,
             if local_rank == 0 and print_detail:
                 progbar_val.update(iter + 1, [('batch_cost', batch_cost),
                                               ('reader cost', reader_cost)])
-            print(total_iters, iter + 1)
             reader_cost_averager.reset()
             batch_cost_averager.reset()
             batch_start = time.time()
diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py
index 48e4e374a6..333b4afe35 100644
--- a/paddleseg/models/backbones/vit_adapter.py
+++ b/paddleseg/models/backbones/vit_adapter.py
@@ -8,16 +8,11 @@
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
-from paddle import ParamAttr
-from paddle.regularizer import L2Decay
-from paddle.nn.initializer import Uniform, KaimingNormal
-from paddle.nn import Conv2D, BatchNorm, AdaptiveAvgPool2D, Linear

 from paddleseg.cvlibs import manager
-from paddleseg.cvlibs.param_init import normal_init, trunc_normal_init, constant_init
 from paddleseg.utils import utils, logger
+from paddleseg.cvlibs.param_init import normal_init, trunc_normal_init, constant_init
 from paddleseg.models.backbones.transformer_utils import to_2tuple, DropPath
-
 from paddleseg.models.layers.vit_adapter_layers import (
     SpatialPriorModule, InteractionBlock, deform_inputs, MSDeformAttn)

diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py
index eaafc5c1d2..37dcb2d3e3 100644
--- a/paddleseg/models/layers/vit_adapter_layers.py
+++ b/paddleseg/models/layers/vit_adapter_layers.py
@@ -10,7 +10,13 @@
 from paddleseg.models.backbones.transformer_utils import DropPath
 from paddleseg.cvlibs.param_init import constant_init, xavier_uniform

-import ms_deform_attn as msda  # first install ms_deform_attn
+try:
+    import ms_deform_attn as msda
+except:
+    print(
+        "Import ms_deform_attn failed. Please refer to the following doc to install the ms_deform_attn lib: "
+        "https://github.com/PaddlePaddle/PaddleSeg/tree/develop/configs/upernet_vit_adapter"
+    )


 def get_reference_points(spatial_shapes):

From cc7aa0aec3eb91815dfc4d6b858dc130973dc461 Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Fri, 25 Nov 2022 12:56:18 +0800
Subject: [PATCH 10/16] refine for merge 2

---
 configs/vit_adapter/README.md                 | 10 +--
 ..._vit_adapter_tiny_ade20k_512x512_160k.yml} |  0
 paddleseg/core/train.py                       |  2 -
 paddleseg/models/upernet_vit_adapter.py       | 82 +++++++------------
 4 files changed, 32 insertions(+), 62 deletions(-)
 rename configs/vit_adapter/{upernet_deit_adapter_tiny_512_160k_ade20k.yml => upernet_vit_adapter_tiny_ade20k_512x512_160k.yml} (100%)

diff --git a/configs/vit_adapter/README.md b/configs/vit_adapter/README.md
index d7a15b0ca3..d364c3c8dd 100644
--- a/configs/vit_adapter/README.md
+++ b/configs/vit_adapter/README.md
@@ -1,15 +1,13 @@
-# Semantic Flow for Fast and Accurate Scene Parsing
+# Vision Transformer Adapter for Dense Predictions

 ## Reference

-> Xiangtai Li, Ansheng You, Zhen Zhu, Houlong Zhao, Maoke Yang, Kuiyuan Yang, Shaohua Tan, Yunhai Tong:
-Semantic Flow for Fast and Accurate Scene Parsing. ECCV (1) 2020: 775-793 .
+> Chen, Zhe, Yuchen Duan, Wenhai Wang, Junjun He, Tong Lu, Jifeng Dai, and Yu Qiao. "Vision Transformer Adapter for Dense Predictions." arXiv preprint arXiv:2205.08534 (2022).
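The ADE20K result reported under Performance below is evaluated with the sliding-window test_config added in patch 04 (crop_size [512, 512], stride [341, 341]). As a rough illustration of how such a window grid tiles a resized validation image — `slide_windows` is a hypothetical helper written for this note, not PaddleSeg API:

```python
import math

def slide_windows(h, w, crop=512, stride=341):
    """Yield (y0, x0, y1, x1) crops covering an h x w image; the last
    row/column of windows snaps to the image border, as slide inference does."""
    rows = max(math.ceil((h - crop) / stride) + 1, 1)
    cols = max(math.ceil((w - crop) / stride) + 1, 1)
    for r in range(rows):
        for c in range(cols):
            y0 = min(r * stride, max(h - crop, 0))
            x0 = min(c * stride, max(w - crop, 0))
            yield y0, x0, min(y0 + crop, h), min(x0 + crop, w)

# A 512x683 image (short side resized to 512) is covered by two windows:
print(list(slide_windows(512, 683)))  # [(0, 0, 512, 512), (0, 171, 512, 683)]
```

With a 512-pixel crop and a 341-pixel stride, adjacent windows overlap by 171 pixels, so logits can be averaged in the overlap instead of leaving seams at window borders.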
## Performance -### Cityscapes +### ADE20K | Model | Backbone | Resolution | Training Iters | mIoU | mIoU (flip) | mIoU (ms+flip) | Links | |-|-|-|-|-|-|-|-| -|SFNet|ResNet18_OS8|1024x1024|80000|78.72%|79.11%|79.28%|[model](https://bj.bcebos.com/paddleseg/dygraph/cityscapes/sfnet_resnet18_os8_cityscapes_1024x1024_80k/model.pdparams) \| [log](https://bj.bcebos.com/paddleseg/dygraph/cityscapes/sfnet_resnet18_os8_cityscapes_1024x1024_80k/train.log) \| [vdl](https://www.paddlepaddle.org.cn/paddle/visualdl/service/app/scalar?id=0d790ad96282048b136342fcebb08d14)| -|SFNet|ResNet50_OS8|1024x1024|80000|81.49%|81.63%|81.85%|[model](https://bj.bcebos.com/paddleseg/dygraph/cityscapes/sfnet_resnet50_os8_cityscapes_1024x1024_80k/model.pdparams) \| [log](https://bj.bcebos.com/paddleseg/dygraph/cityscapes/sfnet_resnet50_os8_cityscapes_1024x1024_80k/train.log) \| [vdl](https://paddlepaddle.org.cn/paddle/visualdl/service/app?id=d458349ec63ea8ccd6fae84afa8ea981)| +|UPerNetViTAdapter|ViT-Adapter-Tiny|512x512|160000|%|%|%|[model]() \| [log]() \| [vdl]()| diff --git a/configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml b/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml similarity index 100% rename from configs/vit_adapter/upernet_deit_adapter_tiny_512_160k_ade20k.yml rename to configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml diff --git a/paddleseg/core/train.py b/paddleseg/core/train.py index 635edf9ffa..0da04b01d4 100644 --- a/paddleseg/core/train.py +++ b/paddleseg/core/train.py @@ -173,7 +173,6 @@ def train(model, reader_cost_averager.record(time.time() - batch_start) images = data['img'] labels = data['label'].astype('int64') - edges = None if 'edge' in data.keys(): edges = data['edge'].astype('int64') @@ -212,7 +211,6 @@ def train(model, losses=losses) loss = sum(loss_list) loss.backward() - # if the optimizer is ReduceOnPlateau, the loss is the one which has been pass into step. if isinstance(optimizer, paddle.optimizer.lr.ReduceOnPlateau): optimizer.step(loss) diff --git a/paddleseg/models/upernet_vit_adapter.py b/paddleseg/models/upernet_vit_adapter.py index 7e56eb71f4..b158beef9e 100644 --- a/paddleseg/models/upernet_vit_adapter.py +++ b/paddleseg/models/upernet_vit_adapter.py @@ -24,21 +24,24 @@ @manager.MODELS.add_component class UPerNetViTAdapter(nn.Layer): """ - The UPerNet implementation based on PaddlePaddle. + The UPerNetViTAdapter implementation based on PaddlePaddle. The original article refers to - Tete Xiao, et, al. "Unified Perceptual Parsing for Scene Understanding" - (https://arxiv.org/abs/1807.10221). + Chen, Zhe, Yuchen Duan, Wenhai Wang, Junjun He, Tong Lu, Jifeng Dai, and Yu Qiao. + "Vision Transformer Adapter for Dense Predictions." + (https://arxiv.org/abs/2205.08534). Args: num_classes (int): The unique number of target classes. - backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101. - backbone_indices (tuple): Four values in the tuple indicate the indices of output of backbone. - channels (int): The channels of inter layers. Default: 512. - aux_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: False. + backbone (nn.Layer): The backbone network. + backbone_indices (tuple | list): The values indicate the indices of output of backbone. + channels (int, optional): The channels of inter layers in upernet head. Default: 512. + pool_scales (list, optional): The scales in PPM. Default: [1, 2, 3, 6]. + dropout_ratio (float, optional): The dropout ratio for upernet head. 
Default: 0.1. + aux_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + aux_channels (int, optional): The channels of inter layers in auxiliary head. Default: 256. align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. - dropout_ratio (float): Dropout ratio for upernet head. Default: 0.1. pretrained (str, optional): The path or url of pretrained model. Default: None. """ @@ -72,6 +75,10 @@ def __init__(self, self.pretrained = pretrained self.init_weight() + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + def forward(self, x): feats = self.backbone(x) feats = [feats[i] for i in self.backbone_indices] @@ -85,10 +92,6 @@ def forward(self, x): ] return logit_list - def init_weight(self): - if self.pretrained is not None: - utils.load_entire_model(self, self.pretrained) - class ConvBNReLU(nn.Layer): def __init__(self, @@ -118,12 +121,9 @@ class PPM(nn.Layer): """Pooling Pyramid Module used in PSPNet. Args: - pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid - Module. + pool_scales (tuple | list): Pooling scales used in PPM. in_channels (int): Input channels. - channels (int): Channels after modules, before conv_seg. - conv_cfg (dict|None): Config of conv layers. - norm_cfg (dict|None): Config of norm layers. + channels (int): Output Channels after modules, before conv_seg. act_cfg (dict): Config of activation layers. align_corners (bool): align_corners argument of F.interpolate. """ @@ -145,7 +145,6 @@ def __init__(self, pool_scales, in_channels, channels, align_corners): kernel_size=1))) def forward(self, x): - """Forward function.""" ppm_outs = [] for ppm in self.stages: ppm_out = ppm(x) @@ -159,16 +158,20 @@ def forward(self, x): class UPerNetHead(nn.Layer): - """Unified Perceptual Parsing for Scene Understanding. - - This head is the implementation of `UPerNet - `_. - + """ + This head is the implementation of "Unified Perceptual Parsing for Scene Understanding". This is heavily based on https://github.com/czczup/ViT-Adapter Args: - pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid - Module applied on the last feature. Default: (1, 2, 3, 6). + num_classes (int): The unique number of target classes. + in_channels (list[int]): The channels of input features. + channels (int, optional): The channels of inter layers in upernet head. Default: 512. + pool_scales (list, optional): The scales in PPM. Default: [1, 2, 3, 6]. + dropout_ratio (float, optional): The dropout ratio for upernet head. Default: 0.1. + aux_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + aux_channels (int, optional): The channels of inter layers in auxiliary head. Default: 256. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. 
""" def __init__(self, @@ -204,7 +207,6 @@ def __init__(self, self.fpn_bottleneck = ConvBNReLU( len(in_channels) * channels, channels, 3, padding=1) - if dropout_ratio > 0: self.dropout = nn.Dropout2D(dropout_ratio) else: @@ -219,7 +221,6 @@ def __init__(self, aux_channels, num_classes, kernel_size=1) def psp_forward(self, inputs): - """Forward function of PSP module.""" x = inputs[-1] psp_outs = [x] psp_outs.extend(self.psp_modules(x)) @@ -228,13 +229,6 @@ def psp_forward(self, inputs): return output def forward(self, inputs): - """Forward function.""" - debug = False - if debug: - print('-------head 1----') - for x in inputs: - print(x.shape, x.numpy().mean()) - # build laterals laterals = [ lateral_conv(inputs[i]) @@ -242,11 +236,6 @@ def forward(self, inputs): ] laterals.append(self.psp_forward(inputs)) - if debug: - print('-------head 2----') - for x in laterals: - print(x.shape, x.numpy().mean()) - # build top-down path used_backbone_levels = len(laterals) for i in range(used_backbone_levels - 1, 0, -1): @@ -264,11 +253,6 @@ def forward(self, inputs): ] fpn_outs.append(laterals[-1]) # append psp feature - if debug: - print('-------head 3----') - for x in fpn_outs: - print(x.shape, x.numpy().mean()) - for i in range(used_backbone_levels - 1, 0, -1): fpn_outs[i] = F.interpolate( fpn_outs[i], @@ -278,10 +262,6 @@ def forward(self, inputs): fpn_outs = paddle.concat(fpn_outs, axis=1) output = self.fpn_bottleneck(fpn_outs) - if debug: - print('-------head 4----') - print(output.shape, output.numpy().mean()) - if self.dropout is not None: output = self.dropout(output) output = self.conv_seg(output) @@ -292,10 +272,4 @@ def forward(self, inputs): aux_output = self.aux_conv_seg(aux_output) logits_list.append(aux_output) - if debug: - print('-------head 5----') - for x in logits_list: - print(x.shape, x.numpy().mean()) - exit() - return logits_list From 50ea8d125d944102d72c1318331e58407e0e6f10 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Fri, 25 Nov 2022 14:58:43 +0800 Subject: [PATCH 11/16] refine for merge 3 --- ...t_vit_adapter_tiny_ade20k_512x512_160k.yml | 16 +- paddleseg/models/backbones/vit_adapter.py | 45 ++- .../models/layers/ms_deformable_attention.py | 159 ++++++++++ paddleseg/models/layers/vit_adapter_layers.py | 278 ++++-------------- paddleseg/models/upernet_vit_adapter.py | 2 + 5 files changed, 247 insertions(+), 253 deletions(-) create mode 100644 paddleseg/models/layers/ms_deformable_attention.py diff --git a/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml b/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml index ab0f257346..1ebcab40b3 100644 --- a/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml +++ b/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml @@ -66,20 +66,8 @@ loss: model: type: UPerNetViTAdapter backbone: - type: ViTAdapter - num_heads: 3 - patch_size: 16 - embed_dim: 192 - depth: 12 - mlp_ratio: 4 - drop_path_rate: 0.1 - conv_inplane: 64 - n_points: 4 - deform_num_heads: 6 - cffn_ratio: 0.25 - deform_ratio: 1.0 - interaction_indexes: [[0, 2], [3, 5], [6, 8], [9, 11]] - pretrained: pretrained_model/deit_tiny_patch16_224-a1311bcf_from_torch.pdparams + type: ViTAdapter_Tiny + pretrained: https://paddleseg.bj.bcebos.com/dygraph/backbone/deit_tiny_patch16_224.zip backbone_indices: [0, 1, 2, 3] channels: 512 pool_scales: [1, 2, 3, 6] diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index 333b4afe35..649e89c9d5 100644 
--- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -16,7 +16,7 @@ from paddleseg.models.layers.vit_adapter_layers import ( SpatialPriorModule, InteractionBlock, deform_inputs, MSDeformAttn) -__all__ = ['ViTAdapter'] +__all__ = ['ViTAdapter', 'ViTAdapter_Tiny'] class PatchEmbed(nn.Layer): @@ -352,8 +352,6 @@ def _add_level_embed(self, c2, c3, c4): return c2, c3, c4 def forward(self, x): - debug = False - deform_inputs1, deform_inputs2 = deform_inputs(x) # SPM forward @@ -361,25 +359,12 @@ def forward(self, x): c2, c3, c4 = self._add_level_embed(c2, c3, c4) c = paddle.concat([c2, c3, c4], axis=1) - if debug: - print('----2----') - for i in deform_inputs1: - print(i.numpy().mean()) - for i in deform_inputs2: - print(i.numpy().mean()) - print(x.numpy().mean()) - print(c.numpy().mean()) - # Patch Embedding forward x, H, W = self.patch_embed(x) bs, n, dim = x.shape pos_embed = self._get_pos_embed(self.pos_embed[:, 1:], H, W) x = self.pos_drop(x + pos_embed) - if debug: - print('-------3----') - print(x.numpy().mean()) - # Interaction outs = list() for i, layer in enumerate(self.interactions): @@ -387,10 +372,6 @@ def forward(self, x): x, c = layer(x, c, self.blocks[indexes[0]:indexes[-1] + 1], deform_inputs1, deform_inputs2, H, W) outs.append(x.transpose([0, 2, 1]).reshape([bs, dim, H, W])) - if debug: - print('-----4-{}------'.format(i)) - print(x.numpy().mean()) - print(c.numpy().mean()) # Split & Reshape c2 = c[:, 0:c2.shape[1], :] @@ -417,10 +398,22 @@ def forward(self, x): f2 = self.norm2(c2) f3 = self.norm3(c3) f4 = self.norm4(c4) - if debug: - print('-----5------') - print(f1.cpu().numpy().mean()) - print(f2.cpu().numpy().mean()) - print(f3.cpu().numpy().mean()) - print(f4.cpu().numpy().mean()) return [f1, f2, f3, f4] + + +@manager.BACKBONES.add_component +def ViTAdapter_Tiny(**kwargs): + return ViTAdapter( + num_heads=3, + patch_size=16, + embed_dim=192, + depth=12, + mlp_ratio=4, + drop_path_rate=0.1, + conv_inplane=64, + n_points=4, + deform_num_heads=6, + cffn_ratio=0.25, + deform_ratio=1.0, + interaction_indexes=[[0, 2], [3, 5], [6, 8], [9, 11]], + **kwargs) \ No newline at end of file diff --git a/paddleseg/models/layers/ms_deformable_attention.py b/paddleseg/models/layers/ms_deformable_attention.py new file mode 100644 index 0000000000..37e18c22ae --- /dev/null +++ b/paddleseg/models/layers/ms_deformable_attention.py @@ -0,0 +1,159 @@ +# This file is heavily based on https://github.com/czczup/ViT-Adapter +import math +import warnings + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import param_init +from paddleseg.cvlibs.param_init import constant_init, xavier_uniform + +try: + import ms_deform_attn as msda +except: + print( + "Import ms_deform_attn failed. Please first refer to the following document to install " + "ms_deform_attn lib, and then use multi-scale deformable attention module: " + "https://github.com/PaddlePaddle/PaddleSeg/tree/develop/configs/upernet_vit_adapter" + ) + + +class MSDeformAttn(nn.Layer): + def __init__(self, + d_model=256, + n_levels=4, + n_heads=8, + n_points=4, + ratio=1.0): + """Multi-Scale Deformable Attention Module. + + Args: + d_model(int, optional): The hidden dimension. Default: 256 + n_levels(int, optional): The number of feature levels. Default: 4 + n_heads(int, optional): The number of attention heads. Default: 8 + n_points(int, optional): The number of sampling points per attention head per feature level. 
Default: 4 + ratio (float, optional): The ratio of channels for Linear. Default: 1.0 + """ + super().__init__() + if d_model % n_heads != 0: + raise ValueError('d_model must be divisible by n_heads, ' + 'but got {} and {}'.format(d_model, n_heads)) + _d_per_head = d_model // n_heads + # you'd better set _d_per_head to a power of 2 + # which is more efficient in our CUDA implementation + if not self._is_power_of_2(_d_per_head): + warnings.warn("You'd better set d_model in MSDeformAttn to make " + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = 64 + self.d_model = d_model + self.n_levels = n_levels + self.n_heads = n_heads + self.n_points = n_points + self.ratio = ratio + + self.sampling_offsets = nn.Linear(d_model, + n_heads * n_levels * n_points * 2) + self.attention_weights = nn.Linear(d_model, + n_heads * n_levels * n_points) + self.value_proj = nn.Linear(d_model, int(d_model * ratio)) + self.output_proj = nn.Linear(int(d_model * ratio), d_model) + + self._reset_parameters() + + @staticmethod + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError('invalid input for _is_power_of_2: {} (type: {})'. + format(n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + def _reset_parameters(self): + constant_init(self.sampling_offsets.weight, value=0.) + thetas = paddle.arange( + self.n_heads, dtype='float32') * (2.0 * math.pi / self.n_heads) + grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / grid_init.abs().max( + -1, keepdim=True)[0]).reshape([self.n_heads, 1, 1, 2]).tile( + [1, self.n_levels, self.n_points, 1]) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + + grid_init = grid_init.reshape([-1]) + self.sampling_offsets.bias = self.create_parameter( + shape=grid_init.shape, + default_initializer=paddle.nn.initializer.Assign(grid_init)) + self.sampling_offsets.bias.stop_gradient = True + + constant_init(self.attention_weights.weight, value=0.) + constant_init(self.attention_weights.bias, value=0.) + xavier_uniform(self.value_proj.weight) + constant_init(self.value_proj.bias, value=0.) + xavier_uniform(self.output_proj.weight) + constant_init(self.output_proj.bias, value=0.) 
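The direction grid built in `_reset_parameters` above is easier to see numerically. A standalone numpy sketch of the same sampling-offsets bias, using the defaults from this file (illustrative only, not part of the patch):

```python
import numpy as np

n_heads, n_levels, n_points = 8, 4, 4
thetas = np.arange(n_heads, dtype=np.float32) * (2.0 * np.pi / n_heads)
grid = np.stack([np.cos(thetas), np.sin(thetas)], -1)  # one unit direction per head
grid /= np.abs(grid).max(-1, keepdims=True)            # scale so the max component is +-1
grid = np.tile(grid.reshape(n_heads, 1, 1, 2), (1, n_levels, n_points, 1))
for i in range(n_points):
    grid[:, :, i, :] *= i + 1                          # points step outward along the ray

print(grid.shape)  # (8, 4, 4, 2): a 2D offset per head / level / point
print(grid[0, 0])  # head 0 starts at [1, 0], [2, 0], [3, 0], [4, 0]
```

Each head therefore begins by sampling along its own compass direction, with the points stepping 1x to 4x outward; training then learns offsets on top of this prior.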
+ + def forward(self, + query, + reference_points, + input_flatten, + input_spatial_shapes, + input_level_start_index, + input_padding_mask=None): + """ + Args: + query: (N, Length_{query}, C) + reference_points: (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area + or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes + input_flatten: (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) + input_spatial_shapes: (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + input_level_start_index: (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] + input_padding_mask: (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements + + Returns: + output (N, Length_{query}, C) + """ + + def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + N, Len_q, _ = query.shape + N, Len_in, _ = input_flatten.shape + assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1] + ).sum() == Len_in + + value = self.value_proj(input_flatten) + if input_padding_mask is not None: + value = masked_fill(value, input_padding_mask[..., None], float(0)) + + value = value.reshape([ + N, Len_in, self.n_heads, + int(self.ratio * self.d_model) // self.n_heads + ]) + sampling_offsets = self.sampling_offsets(query).reshape( + [N, Len_q, self.n_heads, self.n_levels, self.n_points, 2]) + attention_weights = self.attention_weights(query).reshape( + [N, Len_q, self.n_heads, self.n_levels * self.n_points]) + attention_weights = F.softmax(attention_weights, -1).\ + reshape([N, Len_q, self.n_heads, self.n_levels, self.n_points]) + + if reference_points.shape[-1] == 2: + offset_normalizer = paddle.stack( + [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], + -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + else: + raise ValueError( + 'Last dim of reference_points must be 2 or 4, but get {} instead.' 
+                .format(reference_points.shape[-1]))
+        output = msda.ms_deform_attn(
+            value, input_spatial_shapes, input_level_start_index,
+            sampling_locations, attention_weights, self.im2col_step)
+        output = self.output_proj(output)
+        return output
diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py
index 37dcb2d3e3..fef897bfa6 100644
--- a/paddleseg/models/layers/vit_adapter_layers.py
+++ b/paddleseg/models/layers/vit_adapter_layers.py
@@ -8,15 +8,7 @@
 import paddle.nn as nn
 import paddle.nn.functional as F
 from paddleseg.models.backbones.transformer_utils import DropPath
-from paddleseg.cvlibs.param_init import constant_init, xavier_uniform
-
-try:
-    import ms_deform_attn as msda
-except:
-    print(
-        "Import ms_deform_attn failed. Please refer to the following doc to install the ms_deform_attn lib: "
-        "https://github.com/PaddlePaddle/PaddleSeg/tree/develop/configs/upernet_vit_adapter"
-    )
+from paddleseg.models.layers.ms_deformable_attention import MSDeformAttn


 def get_reference_points(spatial_shapes):
@@ -37,7 +29,7 @@ def get_reference_points(spatial_shapes):


 def deform_inputs(x):
-    bs, c, h, w = x.shape
+    _, _, h, w = x.shape
     spatial_shapes = paddle.to_tensor(
         [(h // 8, w // 8), (h // 16, w // 16), (h // 32, w // 32)],
         dtype='int64')
@@ -56,14 +48,35 @@ def deform_inputs(x):
     return deform_inputs1, deform_inputs2


-def _is_power_of_2(n):
-    if (not isinstance(n, int)) or (n < 0):
-        raise ValueError('invalid input for _is_power_of_2: {} (type: {})'.
-                         format(n, type(n)))
-    return (n & (n - 1) == 0) and n != 0
+class DWConv(nn.Layer):
+    """
+    The depthwise convolution used in ConvFFN.
+    """
+
+    def __init__(self, dim=768):
+        super().__init__()
+        self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim)
+
+    def forward(self, x, H, W):
+        B, N, C = x.shape
+        n = N // 21
+        x1 = x[:, 0:16 * n, :].transpose([0, 2, 1]).reshape(
+            [B, C, H * 2, W * 2])
+        x2 = x[:, 16 * n:20 * n, :].transpose([0, 2, 1]).reshape([B, C, H, W])
+        x3 = x[:, 20 * n:, :].transpose([0, 2, 1]).reshape(
+            [B, C, H // 2, W // 2])
+        x1 = self.dwconv(x1).flatten(2).transpose([0, 2, 1])
+        x2 = self.dwconv(x2).flatten(2).transpose([0, 2, 1])
+        x3 = self.dwconv(x3).flatten(2).transpose([0, 2, 1])
+        x = paddle.concat([x1, x2, x3], axis=1)
+        return x


 class ConvFFN(nn.Layer):
+    """
+    The implementation of ConvFFN used in Extractor.
+    """
+
     def __init__(self,
                  in_features,
                  hidden_features=None,
@@ -89,156 +102,11 @@ def forward(self, x, H, W):
         return x


-class DWConv(nn.Layer):
-    def __init__(self, dim=768):
-        super().__init__()
-        self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim)
-
-    def forward(self, x, H, W):
-        B, N, C = x.shape
-        n = N // 21
-        x1 = x[:, 0:16 * n, :].transpose([0, 2, 1]).reshape(
-            [B, C, H * 2, W * 2])
-        x2 = x[:, 16 * n:20 * n, :].transpose([0, 2, 1]).reshape([B, C, H, W])
-        x3 = x[:, 20 * n:, :].transpose([0, 2, 1]).reshape(
-            [B, C, H // 2, W // 2])
-        x1 = self.dwconv(x1).flatten(2).transpose([0, 2, 1])
-        x2 = self.dwconv(x2).flatten(2).transpose([0, 2, 1])
-        x3 = self.dwconv(x3).flatten(2).transpose([0, 2, 1])
-        x = paddle.concat([x1, x2, x3], axis=1)
-        return x
-
-
-class MSDeformAttn(nn.Layer):
-    def __init__(self,
-                 d_model=256,
-                 n_levels=4,
-                 n_heads=8,
-                 n_points=4,
-                 ratio=1.0):
-        """Multi-Scale Deformable Attention Module.
- - :param d_model hidden dimension - :param n_levels number of feature levels - :param n_heads number of attention heads - :param n_points number of sampling points per attention head per feature level - """ - super().__init__() - if d_model % n_heads != 0: - raise ValueError('d_model must be divisible by n_heads, ' - 'but got {} and {}'.format(d_model, n_heads)) - _d_per_head = d_model // n_heads - # you'd better set _d_per_head to a power of 2 - # which is more efficient in our CUDA implementation - if not _is_power_of_2(_d_per_head): - warnings.warn("You'd better set d_model in MSDeformAttn to make " - 'the dimension of each attention head a power of 2 ' - 'which is more efficient in our CUDA implementation.') - - self.im2col_step = 64 - - self.d_model = d_model - self.n_levels = n_levels - self.n_heads = n_heads - self.n_points = n_points - self.ratio = ratio - self.sampling_offsets = nn.Linear(d_model, - n_heads * n_levels * n_points * 2) - self.attention_weights = nn.Linear(d_model, - n_heads * n_levels * n_points) - self.value_proj = nn.Linear(d_model, int(d_model * ratio)) - self.output_proj = nn.Linear(int(d_model * ratio), d_model) - - self._reset_parameters() - - def _reset_parameters(self): - constant_init(self.sampling_offsets.weight, value=0.) - thetas = paddle.arange( - self.n_heads, dtype='float32') * (2.0 * math.pi / self.n_heads) - grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) - grid_init = (grid_init / grid_init.abs().max( - -1, keepdim=True)[0]).reshape([self.n_heads, 1, 1, 2]).tile( - [1, self.n_levels, self.n_points, 1]) - for i in range(self.n_points): - grid_init[:, :, i, :] *= i + 1 - - grid_init = grid_init.reshape([-1]) - self.sampling_offsets.bias = self.create_parameter( - shape=grid_init.shape, - default_initializer=paddle.nn.initializer.Assign(grid_init)) - self.sampling_offsets.bias.stop_gradient = True - - constant_init(self.attention_weights.weight, value=0.) - constant_init(self.attention_weights.bias, value=0.) - xavier_uniform(self.value_proj.weight) - constant_init(self.value_proj.bias, value=0.) - xavier_uniform(self.output_proj.weight) - constant_init(self.output_proj.bias, value=0.) 
- - def forward(self, - query, - reference_points, - input_flatten, - input_spatial_shapes, - input_level_start_index, - input_padding_mask=None): - """ - :param query (N, Length_{query}, C) - :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area - or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes - :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) - :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] - :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] - :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements - - :return output (N, Length_{query}, C) - """ - - def masked_fill(x, mask, value): - y = paddle.full(x.shape, value, x.dtype) - return paddle.where(mask, y, x) - - N, Len_q, _ = query.shape - N, Len_in, _ = input_flatten.shape - assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1] - ).sum() == Len_in - - value = self.value_proj(input_flatten) - if input_padding_mask is not None: - value = masked_fill(value, input_padding_mask[..., None], float(0)) - - value = value.reshape([ - N, Len_in, self.n_heads, - int(self.ratio * self.d_model) // self.n_heads - ]) - sampling_offsets = self.sampling_offsets(query).reshape( - [N, Len_q, self.n_heads, self.n_levels, self.n_points, 2]) - attention_weights = self.attention_weights(query).reshape( - [N, Len_q, self.n_heads, self.n_levels * self.n_points]) - attention_weights = F.softmax(attention_weights, -1).\ - reshape([N, Len_q, self.n_heads, self.n_levels, self.n_points]) - - if reference_points.shape[-1] == 2: - offset_normalizer = paddle.stack( - [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], - -1) - sampling_locations = reference_points[:, :, None, :, None, :] \ - + sampling_offsets / offset_normalizer[None, None, None, :, None, :] - elif reference_points.shape[-1] == 4: - sampling_locations = reference_points[:, :, None, :, None, :2] \ - + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 - else: - raise ValueError( - 'Last dim of reference_points must be 2 or 4, but get {} instead.' - .format(reference_points.shape[-1])) - output = msda.ms_deform_attn( - value, input_spatial_shapes, input_level_start_index, - sampling_locations, attention_weights, self.im2col_step) - output = self.output_proj(output) - return output - - class Extractor(nn.Layer): + """ + The Extractor module in ViT-Adapter. + """ + def __init__(self, dim, num_heads=6, @@ -272,23 +140,21 @@ def __init__(self, def forward(self, query, reference_points, feat, spatial_shapes, level_start_index, H, W): - def _inner_forward(query, feat): - attn = self.attn( - self.query_norm(query), reference_points, - self.feat_norm(feat), spatial_shapes, level_start_index, None) - query = query + attn - - if self.with_cffn: - query = query + self.drop_path( - self.ffn(self.ffn_norm(query), H, W)) - return query - - query = _inner_forward(query, feat) + attn = self.attn( + self.query_norm(query), reference_points, + self.feat_norm(feat), spatial_shapes, level_start_index, None) + query = query + attn + if self.with_cffn: + query = query + self.drop_path(self.ffn(self.ffn_norm(query), H, W)) return query class Injector(nn.Layer): + """ + The Injector module in ViT-Adapter. 
+    """
+
     def __init__(self,
                  dim,
                  num_heads=6,
@@ -314,18 +180,17 @@ def __init__(self,

     def forward(self, query, reference_points, feat, spatial_shapes,
                 level_start_index):
-        def _inner_forward(query, feat):
-            attn = self.attn(
-                self.query_norm(query), reference_points,
-                self.feat_norm(feat), spatial_shapes, level_start_index, None)
-            return query + self.gamma * attn
-
-        query = _inner_forward(query, feat)
-
-        return query
+        attn = self.attn(
+            self.query_norm(query), reference_points,
+            self.feat_norm(feat), spatial_shapes, level_start_index, None)
+        return query + self.gamma * attn


 class InteractionBlock(nn.Layer):
+    """
+    Combine the Injector, the Extractor and the ViT blocks.
+    """
+
     def __init__(self,
                  dim,
                  num_heads=6,
@@ -377,20 +242,15 @@ def __init__(self,
             self.extra_extractors = None

     def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W):
-        debug = False
         x = self.injector(
             query=x,
             reference_points=deform_inputs1[0],
             feat=c,
             spatial_shapes=deform_inputs1[1],
             level_start_index=deform_inputs1[2])
-        if debug:
-            print('x', x.cpu().numpy().mean())

         for idx, blk in enumerate(blocks):
             x = blk(x, H, W)
-            if debug:
-                print('x block_{}'.format(idx), x.cpu().numpy().mean())

         c = self.extractor(
             query=c,
             reference_points=deform_inputs2[0],
             feat=x,
             spatial_shapes=deform_inputs2[1],
             level_start_index=deform_inputs2[2],
             H=H,
             W=W)
-        if debug:
-            print('c', c.cpu().numpy().mean())

         if self.extra_extractors is not None:
             for extractor in self.extra_extractors:
                 c = extractor(
                     query=c,
                     reference_points=deform_inputs2[0],
                     feat=x,
                     spatial_shapes=deform_inputs2[1],
                     level_start_index=deform_inputs2[2],
                     H=H,
                     W=W)
-                if debug:
-                    print('c', c.cpu().numpy().mean())

         return x, c
@@ -588,22 +444,18 @@ def __init__(self, inplanes=64, embed_dim=384):
                 bias_attr=True)

     def forward(self, x):
-        def _inner_forward(x):
-            c1 = self.stem(x)
-            c2 = self.conv2(c1)
-            c3 = self.conv3(c2)
-            c4 = self.conv4(c3)
-            c1 = self.fc1(c1)
-            c2 = self.fc2(c2)
-            c3 = self.fc3(c3)
-            c4 = self.fc4(c4)
-
-            bs, dim, _, _ = c1.shape
-            c2 = c2.reshape([bs, dim, -1]).transpose([0, 2, 1])  # 8s
-            c3 = c3.reshape([bs, dim, -1]).transpose([0, 2, 1])  # 16s
-            c4 = c4.reshape([bs, dim, -1]).transpose([0, 2, 1])  # 32s
-
-            return c1, c2, c3, c4
-
-        outs = _inner_forward(x)
-        return outs
+        c1 = self.stem(x)
+        c2 = self.conv2(c1)
+        c3 = self.conv3(c2)
+        c4 = self.conv4(c3)
+        c1 = self.fc1(c1)
+        c2 = self.fc2(c2)
+        c3 = self.fc3(c3)
+        c4 = self.fc4(c4)
+
+        bs, dim, _, _ = c1.shape
+        c2 = c2.reshape([bs, dim, -1]).transpose([0, 2, 1])  # 8s
+        c3 = c3.reshape([bs, dim, -1]).transpose([0, 2, 1])  # 16s
+        c4 = c4.reshape([bs, dim, -1]).transpose([0, 2, 1])  # 32s
+
+        return c1, c2, c3, c4
diff --git a/paddleseg/models/upernet_vit_adapter.py b/paddleseg/models/upernet_vit_adapter.py
index b158beef9e..cb9dcfd28f 100644
--- a/paddleseg/models/upernet_vit_adapter.py
+++ b/paddleseg/models/upernet_vit_adapter.py
@@ -31,6 +31,8 @@ class UPerNetViTAdapter(nn.Layer):
     "Vision Transformer Adapter for Dense Predictions."
     (https://arxiv.org/abs/2205.08534).

+    The implementation is based on https://github.com/czczup/ViT-Adapter
+
     Args:
         num_classes (int): The unique number of target classes.
         backbone (nn.Layer): The backbone network.
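One thing the diffs above make easy to miss is the fixed token budget shared by SpatialPriorModule (the 8s/16s/32s reshapes) and the `N // 21` split inside DWConv. A worked example for a 512x512 input with patch_size 16 — plain arithmetic, nothing here is Paddle API:

```python
H = W = 512 // 16    # 32x32 ViT token grid at 1/16 scale
n = (H * W) // 4     # 256 tokens at 1/32 scale
c2 = 16 * n          # 4096 tokens at 1/8 scale  (64x64)
c3 = 4 * n           # 1024 tokens at 1/16 scale (32x32)
c4 = 1 * n           #  256 tokens at 1/32 scale (16x16)
N = c2 + c3 + c4     # length of the concatenated prior c
assert N == 21 * n   # hence x[:, :16 * n], x[:, 16 * n:20 * n], x[:, 20 * n:] in DWConv
print(N, n)          # 5376 256
```

The 16:4:1 ratio holds for any input whose sides are multiples of 32, which is why the validation transforms resize with size_divisor: 32.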
From 539a1f03cbce17b1726c15f62214c547ffddf90b Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 28 Nov 2022 15:42:16 +0800 Subject: [PATCH 12/16] refine for merge 4 --- configs/vit_adapter/README.md | 2 +- paddleseg/models/layers/ms_deformable_attention.py | 10 +++++----- paddleseg/models/layers/vit_adapter_layers.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/vit_adapter/README.md b/configs/vit_adapter/README.md index d364c3c8dd..aab2e7e458 100644 --- a/configs/vit_adapter/README.md +++ b/configs/vit_adapter/README.md @@ -10,4 +10,4 @@ | Model | Backbone | Resolution | Training Iters | mIoU | mIoU (flip) | mIoU (ms+flip) | Links | |-|-|-|-|-|-|-|-| -|UPerNetViTAdapter|ViT-Adapter-Tiny|512x512|160000|%|%|%|[model]() \| [log]() \| [vdl]()| +|UPerNetViTAdapter|ViT-Adapter-Tiny|512x512|160000|41.90%|-|-|[model](https://paddleseg.bj.bcebos.com/dygraph/ade20k/upernet_vit_adapter_tiny_ade20k_512x512_160k/model.pdparams) \| [log](https://paddleseg.bj.bcebos.com/dygraph/ade20k/upernet_vit_adapter_tiny_ade20k_512x512_160k/train_log.txt) \| [vdl](https://paddlepaddle.org.cn/paddle/visualdl/service/app?id=88173046bd09f61da5f48db66baddd7d)| diff --git a/paddleseg/models/layers/ms_deformable_attention.py b/paddleseg/models/layers/ms_deformable_attention.py index 37e18c22ae..880d12dca2 100644 --- a/paddleseg/models/layers/ms_deformable_attention.py +++ b/paddleseg/models/layers/ms_deformable_attention.py @@ -10,12 +10,12 @@ from paddleseg.cvlibs.param_init import constant_init, xavier_uniform try: - import ms_deform_attn as msda + import ms_deform_attn except: print( - "Import ms_deform_attn failed. Please first refer to the following document to install " - "ms_deform_attn lib, and then use multi-scale deformable attention module: " - "https://github.com/PaddlePaddle/PaddleSeg/tree/develop/configs/upernet_vit_adapter" + "Import ms_deform_attn failed. Please download the following file and refer to " + "the readme to install ms_deform_attn lib: " + "https://paddleseg.bj.bcebos.com/dygraph/customized_ops/ms_deform_attn.zip" ) @@ -152,7 +152,7 @@ def masked_fill(x, mask, value): raise ValueError( 'Last dim of reference_points must be 2 or 4, but get {} instead.' 
.format(reference_points.shape[-1])) - output = msda.ms_deform_attn( + output = ms_deform_attn.ms_deform_attn( value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) output = self.output_proj(output) diff --git a/paddleseg/models/layers/vit_adapter_layers.py b/paddleseg/models/layers/vit_adapter_layers.py index fef897bfa6..6735331db9 100644 --- a/paddleseg/models/layers/vit_adapter_layers.py +++ b/paddleseg/models/layers/vit_adapter_layers.py @@ -249,7 +249,7 @@ def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W): spatial_shapes=deform_inputs1[1], level_start_index=deform_inputs1[2]) - for idx, blk in enumerate(blocks): + for _, blk in enumerate(blocks): x = blk(x, H, W) c = self.extractor( @@ -334,7 +334,7 @@ def forward(self, x, c, cls, blocks, deform_inputs1, deform_inputs2, H, W): spatial_shapes=deform_inputs1[1], level_start_index=deform_inputs1[2]) x = paddle.concat((cls, x), axis=1) - for idx, blk in enumerate(blocks): + for _, blk in enumerate(blocks): x = blk(x, H, W) cls, x = x[:, :1, ], x[:, 1:, ] c = self.extractor( From d0b89c6bb221838d623310244cd2e3e3e5c2b54c Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 28 Nov 2022 16:35:24 +0800 Subject: [PATCH 13/16] refine for merge 5 --- paddleseg/core/val.py | 2 ++ paddleseg/models/backbones/vit_adapter.py | 3 ++- paddleseg/models/layers/ms_deformable_attention.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/paddleseg/core/val.py b/paddleseg/core/val.py index 80a820b6bc..828edd5bdb 100644 --- a/paddleseg/core/val.py +++ b/paddleseg/core/val.py @@ -98,6 +98,8 @@ def evaluate(model, batch_start = time.time() with paddle.no_grad(): for iter, data in enumerate(loader): + if iter % 20 == 0: + print('({} / {}'.format(iter, total_iters)) reader_cost_averager.record(time.time() - batch_start) label = data['label'].astype('int64') diff --git a/paddleseg/models/backbones/vit_adapter.py b/paddleseg/models/backbones/vit_adapter.py index 649e89c9d5..6d5366d0f2 100644 --- a/paddleseg/models/backbones/vit_adapter.py +++ b/paddleseg/models/backbones/vit_adapter.py @@ -14,7 +14,8 @@ from paddleseg.cvlibs.param_init import normal_init, trunc_normal_init, constant_init from paddleseg.models.backbones.transformer_utils import to_2tuple, DropPath from paddleseg.models.layers.vit_adapter_layers import ( - SpatialPriorModule, InteractionBlock, deform_inputs, MSDeformAttn) + SpatialPriorModule, InteractionBlock, deform_inputs) +from paddleseg.models.layers.ms_deformable_attention import MSDeformAttn __all__ = ['ViTAdapter', 'ViTAdapter_Tiny'] diff --git a/paddleseg/models/layers/ms_deformable_attention.py b/paddleseg/models/layers/ms_deformable_attention.py index 880d12dca2..0df00e44d9 100644 --- a/paddleseg/models/layers/ms_deformable_attention.py +++ b/paddleseg/models/layers/ms_deformable_attention.py @@ -1,4 +1,5 @@ # This file is heavily based on https://github.com/czczup/ViT-Adapter + import math import warnings From e5ba028178bc8cf4c5cfa7997e95ad3bd2fb2922 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 28 Nov 2022 17:03:43 +0800 Subject: [PATCH 14/16] refine for merge 6 --- paddleseg/models/layers/ms_deformable_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddleseg/models/layers/ms_deformable_attention.py b/paddleseg/models/layers/ms_deformable_attention.py index 0df00e44d9..4b7bc143c5 100644 --- a/paddleseg/models/layers/ms_deformable_attention.py +++ 
b/paddleseg/models/layers/ms_deformable_attention.py
@@ -1,5 +1,4 @@
 # This file is heavily based on https://github.com/czczup/ViT-Adapter
-
 import math
 import warnings

@@ -18,6 +17,7 @@
         "the readme to install ms_deform_attn lib: "
         "https://paddleseg.bj.bcebos.com/dygraph/customized_ops/ms_deform_attn.zip"
     )
+    exit()


 class MSDeformAttn(nn.Layer):

From e7cf06533b7d878c3815d54c987c2b9b51f0afce Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Tue, 14 Mar 2023 19:59:43 +0800
Subject: [PATCH 15/16] up

---
 .../upernet_vit_adapter_tiny_ade20k_512x512_160k.yml | 10 ----------
 paddleseg/core/val.py                                |  2 --
 2 files changed, 12 deletions(-)

diff --git a/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml b/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml
index 1ebcab40b3..fbc2110a29 100644
--- a/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml
+++ b/configs/vit_adapter/upernet_vit_adapter_tiny_ade20k_512x512_160k.yml
@@ -33,16 +33,6 @@ test_config:
   is_slide: True
   crop_size: [512, 512]
   stride: [341, 341]
-
-export:
-  transforms:
-    - type: Resize
-      target_size: [2048, 512]
-      keep_ratio: True
-      size_divisor: 32
-    - type: Normalize
-      mean: [0.485, 0.456, 0.406]
-      std: [0.229, 0.224, 0.225]

 optimizer:
   _inherited_: False
diff --git a/paddleseg/core/val.py b/paddleseg/core/val.py
index 828edd5bdb..80a820b6bc 100644
--- a/paddleseg/core/val.py
+++ b/paddleseg/core/val.py
@@ -98,8 +98,6 @@ def evaluate(model,
     batch_start = time.time()
     with paddle.no_grad():
         for iter, data in enumerate(loader):
-            if iter % 20 == 0:
-                print('({} / {}'.format(iter, total_iters))
             reader_cost_averager.record(time.time() - batch_start)
             label = data['label'].astype('int64')

From 78bc4b25d40968edc4f17043f025063f0ee754c9 Mon Sep 17 00:00:00 2001
From: juncaipeng <13006307475@163.com>
Date: Fri, 17 Mar 2023 10:22:54 +0800
Subject: [PATCH 16/16] fix import ms_deform_attn

---
 configs/vit_adapter/README.md                  |  3 +++
 .../models/layers/ms_deformable_attention.py   | 19 +++++++++----------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/configs/vit_adapter/README.md b/configs/vit_adapter/README.md
index aab2e7e458..ff904971fc 100644
--- a/configs/vit_adapter/README.md
+++ b/configs/vit_adapter/README.md
@@ -4,6 +4,9 @@

 > Chen, Zhe, Yuchen Duan, Wenhai Wang, Junjun He, Tong Lu, Jifeng Dai, and Yu Qiao. "Vision Transformer Adapter for Dense Predictions." arXiv preprint arXiv:2205.08534 (2022).

+## Prerequisites
+
+Download ms_deform_attn.zip (https://paddleseg.bj.bcebos.com/dygraph/customized_ops/ms_deform_attn.zip), and then refer to its readme to install the ms_deform_attn lib.

 ## Performance
 ### ADE20K
diff --git a/paddleseg/models/layers/ms_deformable_attention.py b/paddleseg/models/layers/ms_deformable_attention.py
index 4b7bc143c5..8af9f36679 100644
--- a/paddleseg/models/layers/ms_deformable_attention.py
+++ b/paddleseg/models/layers/ms_deformable_attention.py
@@ -9,16 +9,6 @@
 from paddleseg.cvlibs import param_init
 from paddleseg.cvlibs.param_init import constant_init, xavier_uniform

-try:
-    import ms_deform_attn
-except:
-    print(
-        "Import ms_deform_attn failed. Please download the following file and refer to "
-        "the readme to install ms_deform_attn lib: "
-        "https://paddleseg.bj.bcebos.com/dygraph/customized_ops/ms_deform_attn.zip"
-    )
-    exit()
-

 class MSDeformAttn(nn.Layer):
     def __init__(self,
@@ -153,6 +143,15 @@ def masked_fill(x, mask, value):
             raise ValueError(
                 'Last dim of reference_points must be 2 or 4, but get {} instead.'
                .format(reference_points.shape[-1]))
+        try:
+            import ms_deform_attn
+        except ImportError:
+            print(
+                "Import ms_deform_attn failed. Please download the following file and refer to "
+                "the readme to install the ms_deform_attn lib: "
+                "https://paddleseg.bj.bcebos.com/dygraph/customized_ops/ms_deform_attn.zip"
+            )
+            raise
         output = ms_deform_attn.ms_deform_attn(
             value, input_spatial_shapes, input_level_start_index,
             sampling_locations, attention_weights, self.im2col_step)
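With patch 16 applied, the custom op is only required once a forward pass actually runs. A minimal smoke test of the final wiring — the constructor arguments are taken from the yml and docstrings in this series, and it assumes the ms_deform_attn lib from the Prerequisites section is installed:

```python
import paddle
from paddleseg.models.backbones.vit_adapter import ViTAdapter_Tiny
from paddleseg.models.upernet_vit_adapter import UPerNetViTAdapter

backbone = ViTAdapter_Tiny()  # optionally pass pretrained=<path or url>
model = UPerNetViTAdapter(
    num_classes=150,              # ADE20K
    backbone=backbone,
    backbone_indices=[0, 1, 2, 3],
    channels=512,
    pool_scales=[1, 2, 3, 6],
    dropout_ratio=0.1,
    aux_loss=True,
    aux_channels=256)

x = paddle.randn([1, 3, 512, 512])
logits = model(x)                 # list: main logits (+ aux logits in train mode)
print([tuple(l.shape) for l in logits])
```

Because the final head only appends the auxiliary output when `self.training` is true, evaluation and export see a single output tensor while training gets both terms for the [1, 0.4] loss coefficients.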