diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 10fe1f03f0f..745043e4789 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -1,168 +1,177 @@ - -* [AbsVal](#absval) -* [ArgMax](#argmax) -* [BatchNorm](#batchnorm) -* [Bias](#bias) -* [BinaryOp](#binaryop) -* [BNLL](#bnll) -* [Cast](#cast) -* [CELU](#celu) -* [Clip](#clip) -* [Concat](#concat) -* [Convolution](#convolution) -* [Convolution1D](#convolution1d) -* [Convolution3D](#convolution3d) -* [ConvolutionDepthWise](#convolutiondepthwise) -* [ConvolutionDepthWise1D](#convolutiondepthwise1d) -* [ConvolutionDepthWise3D](#convolutiondepthwise3d) -* [CopyTo](#copyto) -* [Crop](#crop) -* [CumulativeSum](#cumulativesum) -* [Deconvolution](#deconvolution) -* [Deconvolution1D](#deconvolution1d) -* [Deconvolution3D](#deconvolution3d) -* [DeconvolutionDepthWise](#deconvolutiondepthwise) -* [DeconvolutionDepthWise1D](#deconvolutiondepthwise1d) -* [DeconvolutionDepthWise3D](#deconvolutiondepthwise3d) -* [DeformableConv2D](#deformableconv2d) -* [Dequantize](#dequantize) -* [Diag](#diag) -* [Dropout](#dropout) -* [Eltwise](#eltwise) -* [ELU](#elu) -* [Embed](#embed) -* [Exp](#exp) -* [Flatten](#flatten) -* [Fold](#fold) -* [GELU](#gelu) -* [GLU](#glu) -* [Gemm](#gemm) -* [GridSample](#gridsample) -* [GroupNorm](#groupnorm) -* [GRU](#gru) -* [HardSigmoid](#hardsigmoid) -* [HardSwish](#hardswish) -* [InnerProduct](#innerproduct) -* [Input](#input) -* [InstanceNorm](#instancenorm) -* [Interp](#interp) -* [InverseSpectrogram](#inversespectrogram) -* [LayerNorm](#layernorm) -* [Log](#log) -* [LRN](#lrn) -* [LSTM](#lstm) -* [MemoryData](#memorydata) -* [Mish](#mish) -* [MultiHeadAttention](#multiheadattention) -* [MVN](#mvn) -* [Noop](#noop) -* [Normalize](#normalize) -* [Packing](#packing) -* [Padding](#padding) -* [Permute](#permute) -* [PixelShuffle](#pixelshuffle) -* [Pooling](#pooling) -* [Pooling1D](#pooling1d) -* [Pooling3D](#pooling3d) -* [Power](#power) -* [PReLU](#prelu) -* [Quantize](#quantize) -* [Reduction](#reduction) -* [ReLU](#relu) -* [Reorg](#reorg) -* [Requantize](#requantize) -* [Reshape](#reshape) -* [RMSNorm](#rmsnorm) -* [RNN](#rnn) -* [Scale](#scale) -* [SELU](#selu) -* [Shrink](#shrink) -* [ShuffleChannel](#shufflechannel) -* [Sigmoid](#sigmoid) -* [Slice](#slice) -* [Softmax](#softmax) -* [Softplus](#softplus) -* [Spectrogram](#spectrogram) -* [Split](#split) -* [Swish](#swish) -* [TanH](#tanh) -* [Threshold](#threshold) -* [Tile](#tile) -* [UnaryOp](#unaryop) -* [Unfold](#unfold) +- [AbsVal](#absval) +- [ArgMax](#argmax) +- [BatchNorm](#batchnorm) +- [Bias](#bias) +- [BinaryOp](#binaryop) +- [BNLL](#bnll) +- [Cast](#cast) +- [CELU](#celu) +- [Clip](#clip) +- [Concat](#concat) +- [Convolution](#convolution) +- [Convolution1D](#convolution1d) +- [Convolution3D](#convolution3d) +- [ConvolutionDepthWise](#convolutiondepthwise) +- [ConvolutionDepthWise1D](#convolutiondepthwise1d) +- [ConvolutionDepthWise3D](#convolutiondepthwise3d) +- [CopyTo](#copyto) +- [Crop](#crop) +- [CumulativeSum](#cumulativesum) +- [Deconvolution](#deconvolution) +- [Deconvolution1D](#deconvolution1d) +- [Deconvolution3D](#deconvolution3d) +- [DeconvolutionDepthWise](#deconvolutiondepthwise) +- [DeconvolutionDepthWise1D](#deconvolutiondepthwise1d) +- [DeconvolutionDepthWise3D](#deconvolutiondepthwise3d) +- [DeformableConv2D](#deformableconv2d) +- [Dequantize](#dequantize) +- [Diag](#diag) +- [Dropout](#dropout) +- [Eltwise](#eltwise) +- [ELU](#elu) +- [Embed](#embed) +- [Exp](#exp) +- 
[Flatten](#flatten) +- [Flip](#flip) +- [Fold](#fold) +- [GELU](#gelu) +- [GLU](#glu) +- [Gemm](#gemm) +- [GridSample](#gridsample) +- [GroupNorm](#groupnorm) +- [GRU](#gru) +- [HardSigmoid](#hardsigmoid) +- [HardSwish](#hardswish) +- [InnerProduct](#innerproduct) +- [Input](#input) +- [InstanceNorm](#instancenorm) +- [Interp](#interp) +- [InverseSpectrogram](#inversespectrogram) +- [LayerNorm](#layernorm) +- [Log](#log) +- [LRN](#lrn) +- [LSTM](#lstm) +- [MemoryData](#memorydata) +- [Mish](#mish) +- [MultiHeadAttention](#multiheadattention) +- [MVN](#mvn) +- [Noop](#noop) +- [Normalize](#normalize) +- [Packing](#packing) +- [Padding](#padding) +- [Permute](#permute) +- [PixelShuffle](#pixelshuffle) +- [Pooling](#pooling) +- [Pooling1D](#pooling1d) +- [Pooling3D](#pooling3d) +- [Power](#power) +- [PReLU](#prelu) +- [Quantize](#quantize) +- [Reduction](#reduction) +- [ReLU](#relu) +- [Reorg](#reorg) +- [Requantize](#requantize) +- [Reshape](#reshape) +- [RMSNorm](#rmsnorm) +- [RNN](#rnn) +- [Scale](#scale) +- [SELU](#selu) +- [Shrink](#shrink) +- [ShuffleChannel](#shufflechannel) +- [Sigmoid](#sigmoid) +- [Slice](#slice) +- [Softmax](#softmax) +- [Softplus](#softplus) +- [Spectrogram](#spectrogram) +- [Split](#split) +- [Swish](#swish) +- [TanH](#tanh) +- [Threshold](#threshold) +- [Tile](#tile) +- [UnaryOp](#unaryop) +- [Unfold](#unfold) # AbsVal + ``` y = abs(x) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # ArgMax + ``` y = argmax(x, out_max_val, topk) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | out_max_val | int | 0 | | -| 1 | topk | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ---- | ------- | ----------- | +| 0 | out_max_val | int | 0 | | +| 1 | topk | int | 1 | | # BatchNorm + ``` y = (x - mean) / sqrt(var + eps) * slope + bias ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | channels | int | 0 | | -| 1 | eps | float | 0.f | | +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------- | +| 0 | channels | int | 0 | | +| 1 | eps | float | 0.f | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| slope_data | float | [channels] | -| mean_data | float | [channels] | -| var_data | float | [channels] | -| bias_data | float | [channels] | +| weight | type | shape | +| ---------- | ----- | ---------- | +| slope_data | float | [channels] | +| mean_data | float | [channels] | +| var_data | float | [channels] | +| bias_data | float | [channels] | # Bias + ``` y = x + bias ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | bias_data_size| int | 0 | | +| param id | name | type | default | description | +| -------- | -------------- | ---- | ------- | ----------- | +| 0 | bias_data_size | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| bias_data | float | [channels] | +| weight | type | shape | +| --------- | ----- | ---------- | +| bias_data | float | [channels] | # BinaryOp - This operation is used for binary computation, and the calculation rule depends on the 
[broadcasting rule](https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting).
+
+This operation is used for binary computation, and the calculation rule depends on the [broadcasting rule](https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting).
+
```
C = binaryop(A, B)
```
+
if with_scalar = 1:
+
- one_blob_only
- support_inplace

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | op_type | int | 0 | Operation type as follows |
-| 1 | with_scalar | int | 0 | with_scalar=0 B is a matrix, with_scalar=1 B is a scalar |
-| 2 | b | float | 0.f | When B is a scalar, B = b |
+| param id | name | type | default | description |
+| -------- | ----------- | ----- | ------- | -------------------------------------------------------- |
+| 0 | op_type | int | 0 | Operation type as follows |
+| 1 | with_scalar | int | 0 | with_scalar=0 B is a matrix, with_scalar=1 B is a scalar |
+| 2 | b | float | 0.f | When B is a scalar, B = b |

Operation type:
+
- 0 = ADD
- 1 = SUB
- 2 = MUL
@@ -177,28 +186,31 @@ Operation type:
- 11 = RATAN2

# BNLL
+
```
-y = log(1 + e^(-x)) , x > 0
-y = log(1 + e^x), x < 0
+y = x + log(1 + e^(-x)) , x > 0
+y = log(1 + e^x) , x <= 0
```
-* one_blob_only
-* support_inplace
+- one_blob_only
+- support_inplace

# Cast
+
```
y = cast(x)
```
-* one_blob_only
-* support_packing
+- one_blob_only
+- support_packing

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | type_from | int | 0 | |
-| 1 | type_to | int | 0 | |
+| param id | name | type | default | description |
+| -------- | --------- | ---- | ------- | ----------- |
+| 0 | type_from | int | 0 | |
+| 1 | type_to | int | 0 | |

Element type:
+
- 0 = auto
- 1 = float32
- 2 = float16
@@ -206,293 +218,304 @@ Element type:
- 4 = bfloat16

# CELU
+
```
if x < 0    y = (exp(x / alpha) - 1.f) * alpha
else        y = x
```
-* one_blob_only
-* support_inplace
+- one_blob_only
+- support_inplace

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | alpha | float | 1.f | |
+| param id | name | type | default | description |
+| -------- | ----- | ----- | ------- | ----------- |
+| 0 | alpha | float | 1.f | |

# Clip
+
```
y = clamp(x, min, max)
```
-* one_blob_only
-* support_inplace
+- one_blob_only
+- support_inplace

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | min | float | -FLT_MAX | |
-| 1 | max | float | FLT_MAX | |
+| param id | name | type | default | description |
+| -------- | ---- | ----- | -------- | ----------- |
+| 0 | min | float | -FLT_MAX | |
+| 1 | max | float | FLT_MAX | |
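+
+The flags above recur throughout this reference: `one_blob_only` means a layer consumes exactly one input blob, and `support_inplace` means it may write its result over that blob. A minimal C++ sketch of a Clip-style in-place forward over an `ncnn::Mat` makes this concrete; it follows ncnn's layer conventions but is an illustration, not the library's actual implementation.
+
+```
+// illustrative only: clamp every element of one blob (one_blob_only),
+// writing the result back into the same memory (support_inplace)
+#include <algorithm>
+#include "mat.h" // ncnn
+
+static void clip_inplace(ncnn::Mat& bottom_top_blob, float min, float max)
+{
+    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;
+    for (int q = 0; q < bottom_top_blob.c; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+        for (int i = 0; i < size; i++)
+            ptr[i] = std::min(max, std::max(min, ptr[i]));
+    }
+}
+```
+
# Concat
+
```
y = concat(x0, x1, x2, ...)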
by axis ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | axis | int | 0 | | # Convolution + ``` x2 = pad(x, pads, pad_value) x3 = conv(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 8 | int8_scale_term| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | -| weight_data_int8_scales| float | [num_output] | -| bottom_blob_int8_scales| float | [1] | -| top_blob_int8_scales| float | [1] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 8 | int8_scale_term | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------------------- | --------------- | ------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | +| weight_data_int8_scales | float | [num_output] | +| bottom_blob_int8_scales | float | [1] | +| top_blob_int8_scales | float | [1] | # Convolution1D + ``` x2 = pad(x, pads, pad_value) x3 = conv1d(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, 
num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | --------------- | --------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, num_input, num_output] | +| bias_data | float | [num_output] | # Convolution3D + ``` x2 = pad(x, pads, pad_value) x3 = conv3d(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | pad_value | float | 0.f | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | pad_value | float | 0.f | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | + +| weight | type | shape | +| ----------- | --------------- | ----------------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | +| bias_data | float | [num_output] | # ConvolutionDepthWise + ``` x2 = pad(x, pads, pad_value) x3 = conv(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | 
----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 8 | int8_scale_term| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | -| weight_data_int8_scales| float | [group] | -| bottom_blob_int8_scales| float | [1] | -| top_blob_int8_scales| float | [1] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 8 | int8_scale_term | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------------------- | --------------- | ------------------------------------------------------------------ | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | +| weight_data_int8_scales | float | [group] | +| bottom_blob_int8_scales | float | [1] | +| top_blob_int8_scales | float | [1] | # ConvolutionDepthWise1D + ``` x2 = pad(x, pads, pad_value) x3 = conv1d(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | 
+| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | --------------- | -------------------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # ConvolutionDepthWise3D + ``` x2 = pad(x, pads, pad_value) x3 = conv3d(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | pad_value | float | 0.f | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | pad_value | float | 0.f | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | + +| weight | type | shape | +| ----------- | --------------- | ---------------------------------------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # CopyTo + ``` self[offset] = src ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | woffset | int | 0 | | -| 1 | hoffset | int | 0 | | -| 13 | doffset | int | 0 | | -| 2 | coffset | int | 0 | | -| 9 | starts | array | [ ] | | -| 11 | axes | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | 
----------- | +| 0 | woffset | int | 0 | | +| 1 | hoffset | int | 0 | | +| 13 | doffset | int | 0 | | +| 2 | coffset | int | 0 | | +| 9 | starts | array | [ ] | | +| 11 | axes | array | [ ] | | # Crop + ``` y = crop(x) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | woffset | int | 0 | | -| 1 | hoffset | int | 0 | | -| 13 | doffset | int | 0 | | -| 2 | coffset | int | 0 | | -| 3 | outw | int | 0 | | -| 4 | outh | int | 0 | | -| 14 | outd | int | 0 | | -| 5 | outc | int | 0 | | -| 6 | woffset2 | int | 0 | | -| 7 | hoffset2 | int | 0 | | -| 15 | doffset2 | int | 0 | | -| 8 | coffset2 | int | 0 | | -| 9 | starts | array | [ ] | | -| 10 | ends | array | [ ] | | -| 11 | axes | array | [ ] | | +- one_blob_only + +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------- | +| 0 | woffset | int | 0 | | +| 1 | hoffset | int | 0 | | +| 13 | doffset | int | 0 | | +| 2 | coffset | int | 0 | | +| 3 | outw | int | 0 | | +| 4 | outh | int | 0 | | +| 14 | outd | int | 0 | | +| 5 | outc | int | 0 | | +| 6 | woffset2 | int | 0 | | +| 7 | hoffset2 | int | 0 | | +| 15 | doffset2 | int | 0 | | +| 8 | coffset2 | int | 0 | | +| 9 | starts | array | [ ] | | +| 10 | ends | array | [ ] | | +| 11 | axes | array | [ ] | | # CumulativeSum @@ -500,408 +523,433 @@ If axis < 0, we use axis = x.dims + axis It implements https://pytorch.org/docs/stable/generated/torch.cumsum.html -* one_blob_only -* support_inplace - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | +- one_blob_only +- support_inplace +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | axis | int | 0 | | # Deconvolution + ``` x2 = deconv(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_w | int | 0 | | -| 21 | output_h | int | output_w | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | 
kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_w | int | 0 | | +| 21 | output_h | int | output_w | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | ------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | # Deconvolution1D + ``` x2 = deconv1d(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | output_pad_right| int | 0 | | -| 20 | output_w | int | 0 | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | output_pad_right | int | 0 | | +| 20 | output_w | int | 0 | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | --------------------------------- | +| weight_data | float/fp16 | [kernel_w, num_input, num_output] | +| bias_data | float | [num_output] | # Deconvolution3D + ``` x2 = deconv3d(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_pad_behind| int | output_pad_right | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | -| 25 | 
output_w | int | 0 | | -| 26 | output_h | int | output_w | | -| 27 | output_d | int | output_w | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_pad_behind | int | output_pad_right | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | +| 25 | output_w | int | 0 | | +| 26 | output_h | int | output_w | | +| 27 | output_d | int | output_w | | + +| weight | type | shape | +| ----------- | ---------- | ----------------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | +| bias_data | float | [num_output] | # DeconvolutionDepthWise + ``` x2 = deconv(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_w | int | 0 | | -| 21 | output_h | int | output_w | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | 
stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_w | int | 0 | | +| 21 | output_h | int | output_w | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | ------------------------------------------------------------------ | +| weight_data | float/fp16 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # DeconvolutionDepthWise1D + ``` x2 = deconv1d(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | output_pad_right| int | 0 | | -| 20 | output_w | int | 0 | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | output_pad_right | int | 0 | | +| 20 | output_w | int | 0 | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | -------------------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # DeconvolutionDepthWise3D + ``` x2 = deconv3d(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_pad_behind| int | output_pad_right | | -| 21 | kernel_d | int | kernel_w | | -| 
22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | -| 25 | output_w | int | 0 | | -| 26 | output_h | int | output_w | | -| 27 | output_d | int | output_w | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_pad_behind | int | output_pad_right | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | +| 25 | output_w | int | 0 | | +| 26 | output_h | int | output_w | | +| 27 | output_d | int | output_w | | + +| weight | type | shape | +| ----------- | ---------- | ---------------------------------------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # DeformableConv2D + ``` x2 = deformableconv2d(x, offset, mask, weight, kernel, stride, dilation) + bias y = activation(x2, act_type, act_params) ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | 
pad_top | | + +| weight | type | shape | +| ----------- | --------------- | ------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | # Dequantize + ``` y = x * scale + bias ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_data_size| int | 1 | | -| 1 | bias_data_size| int | 0 | | +| param id | name | type | default | description | +| -------- | --------------- | ---- | ------- | ----------- | +| 0 | scale_data_size | int | 1 | | +| 1 | bias_data_size | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | [scale_data_size] | -| bias_data | float | [bias_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | +| bias_data | float | [bias_data_size] | # Diag + ``` y = diag(x, diagonal) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | diagonal | int | 0 | | +| param id | name | type | default | description | +| -------- | -------- | ---- | ------- | ----------- | +| 0 | diagonal | int | 0 | | # Dropout + ``` y = x * scale ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale | float | 1.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | scale | float | 1.f | | # Eltwise + ``` y = elementwise_op(x0, x1, ...) 
```

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | op_type | int | 0 | |
-| 1 | coeffs | array | [ ] | |
+| param id | name | type | default | description |
+| -------- | ------- | ----- | ------- | ----------- |
+| 0 | op_type | int | 0 | |
+| 1 | coeffs | array | [ ] | |

Operation type:
+
- 0 = PROD
- 1 = SUM
- 2 = MAX

# ELU
+
```
if x < 0    y = (exp(x) - 1) * alpha
else        y = x
```
-* one_blob_only
-* support_inplace
+- one_blob_only
+- support_inplace

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | alpha | float | 0.1f | |
+| param id | name | type | default | description |
+| -------- | ----- | ----- | ------- | ----------- |
+| 0 | alpha | float | 0.1f | |

# Embed
+
```
y = embedding(x)
```

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | num_output | int | 0 | |
-| 1 | input_dim | int | 0 | |
-| 2 | bias_term | int | 0 | |
-| 3 | weight_data_size | int | 0 | |
-| 18 | int8_scale_term| int | 0 | |
+| param id | name | type | default | description |
+| -------- | ---------------- | ---- | ------- | ----------- |
+| 0 | num_output | int | 0 | |
+| 1 | input_dim | int | 0 | |
+| 2 | bias_term | int | 0 | |
+| 3 | weight_data_size | int | 0 | |
+| 18 | int8_scale_term | int | 0 | |

-| weight | type | shape |
-| ------------- | ----- | --------------------- |
-| weight_data | float | [weight_data_size] |
-| bias_term | float | [num_output] |
-| weight_data_int8_scales| float | [1] |
+| weight | type | shape |
+| ----------------------- | ----- | ------------------ |
+| weight_data | float | [weight_data_size] |
+| bias_data | float | [num_output] |
+| weight_data_int8_scales | float | [1] |

# Exp
+
```
if base == -1   y = exp(shift + x * scale)
else            y = pow(base, (shift + x * scale))
```
-* one_blob_only
-* support_inplace
+- one_blob_only
+- support_inplace

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | base | float | -1.f | |
-| 1 | scale | float | 1.f | |
-| 2 | shift | float | 0.f | |
+| param id | name | type | default | description |
+| -------- | ----- | ----- | ------- | ----------- |
+| 0 | base | float | -1.f | |
+| 1 | scale | float | 1.f | |
+| 2 | shift | float | 0.f | |

# Flatten
+
Reshape blob to 1 dimension

-* one_blob_only
+- one_blob_only
+
+# Flip
+
+- one_blob_only
+
+| param id | name | type | default | description |
+| -------- | ---- | ----- | ------- | ----------------- |
+| 0 | axis | array | [] | axes to flip along |
+
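+A hedged sketch of what `Flip` computes for one entry in `axis`: the element order is reversed along that dimension. The loop below flips a 2D `ncnn::Mat` along `w`; it illustrates the semantics and is not the library code.
+
+```
+// illustrative only: reverse element order along w for each row
+#include "mat.h" // ncnn
+
+static void flip_w(const ncnn::Mat& in, ncnn::Mat& out)
+{
+    out.create(in.w, in.h, (size_t)4u); // float32 output, same shape
+    for (int y = 0; y < in.h; y++)
+    {
+        const float* src = in.row(y);
+        float* dst = out.row(y);
+        for (int x = 0; x < in.w; x++)
+            dst[x] = src[in.w - 1 - x];
+    }
+}
+```
+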
# Fold
+
```
y = fold(x)
```
-* one_blob_only
-
-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | num_output | int | 0 | |
-| 1 | kernel_w | int | 0 | |
-| 2 | dilation_w | int | 1 | |
-| 3 | stride_w | int | 1 | |
-| 4 | pad_left | int | 0 | |
-| 11 | kernel_h | int | kernel_w | |
-| 12 | dilation_h | int | dilation_w | |
-| 13 | stride_h | int | stride_w | |
-| 14 | pad_top | int | pad_left | |
-| 15 | pad_right | int | pad_left | |
-| 16 | pad_bottom | int | pad_top | |
-| 20 | output_w | int | 0 | |
-| 21 | output_h | int | output_w | |
+- one_blob_only
+
+| param id | name | type | default | description |
+| -------- | ---------- | ---- | ---------- | ----------- |
+| 0 | num_output | int | 0 | |
+| 1 | kernel_w | int | 0 | |
+| 2 | dilation_w | int | 1 | |
+| 3 | stride_w | int | 1 | |
+| 4 | pad_left | int | 0 | |
+| 11 | kernel_h | int | kernel_w | |
+| 12 | dilation_h | int | dilation_w | |
+| 13 | stride_h | int | stride_w | |
+| 14 | pad_top | int | pad_left | |
+| 15 | pad_right | int | pad_left | |
+| 16 | pad_bottom | int | pad_top | |
+| 20 | output_w | int | 0 | |
+| 21 | output_h | int | output_w | |

# GELU
+
```
if fast_gelu == 1   y = 0.5 * x * (1 + tanh(0.79788452 * (x + 0.044715 * x * x * x)));
else                y = 0.5 * x * erfc(-0.70710678 * x)
```
-* one_blob_only
-* support_inplace
+- one_blob_only
+- support_inplace

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | fast_gelu | int | 0 | use approximation |
+| param id | name | type | default | description |
+| -------- | --------- | ---- | ------- | ----------------- |
+| 0 | fast_gelu | int | 0 | use approximation |

# GLU
@@ -913,13 +961,14 @@ where a is the first half of the input matrix and b is the second half.

axis specifies the dimension to split the input

-* one_blob_only
+- one_blob_only

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | axis | int | 0 | |
+| param id | name | type | default | description |
+| -------- | ---- | ---- | ------- | ----------- |
+| 0 | axis | int | 0 | |
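+
+GLU computes a ⊗ σ(b), with `a` the first half and `b` the second half of the input along the split axis. A short illustrative C++ sketch for a 2D `ncnn::Mat` split along `w` follows; it is not the library implementation.
+
+```
+// illustrative only: GLU over the w dimension of a 2D blob,
+// dst[i] = a[i] * sigmoid(b[i]), a = first half of the row, b = second half
+#include <cmath>
+#include "mat.h" // ncnn
+
+static void glu_along_w(const ncnn::Mat& in, ncnn::Mat& out)
+{
+    const int half = in.w / 2;
+    out.create(half, in.h, (size_t)4u);
+    for (int y = 0; y < in.h; y++)
+    {
+        const float* a = in.row(y);
+        const float* b = a + half;
+        float* dst = out.row(y);
+        for (int i = 0; i < half; i++)
+            dst[i] = a[i] * (1.f / (1.f + std::exp(-b[i])));
+    }
+}
+```
+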
# Gemm
+
```
a = transA ? transpose(x0) : x0
b = transb ? transpose(x1) : x1
c = x2
y = (gemm(a, b) + c * beta) * alpha
```

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | alpha | float | 1.f | |
-| 1 | beta | float | 1.f | |
-| 2 | transA | int | 0 | |
-| 3 | transb | int | 0 | |
-| 4 | constantA | int | 0 | |
-| 5 | constantB | int | 0 | |
-| 6 | constantC | int | 0 | |
-| 7 | constantM | int | 0 | |
-| 8 | constantN | int | 0 | |
-| 9 | constantK | int | 0 | |
-| 10 | constant_broadcast_type_C | int | 0 | |
-| 11 | output_N1M | int | 0 | |
-| 12 | output_elempack | int | 0 | |
-| 13 | output_elemtype | int | 0 | |
-| 14 | output_transpose | int| 0 | |
-| 18 | int8_scale_term | int | 0 | |
-| 20 | constant_TILE_M | int | 0 | |
-| 21 | constant_TILE_N | int | 0 | |
-| 22 | constant_TILE_K | int | 0 | |
-
-| weight | type | shape |
-| ------------- | ----- | --------------------- |
-| A_data | float/fp16/int8 | [M, K] or [K, M] |
-| B_data | float/fp16/int8 | [N, K] or [K, N] |
-| C_data | float | [1], [M] or [N] or [1, M] or [N,1] or [N, M] |
-| A_data_int8_scales| float | [M] |
-| B_data_int8_scales| float | [1] |
+| param id | name | type | default | description |
+| -------- | ------------------------- | ----- | ------- | ----------- |
+| 0 | alpha | float | 1.f | |
+| 1 | beta | float | 1.f | |
+| 2 | transA | int | 0 | |
+| 3 | transb | int | 0 | |
+| 4 | constantA | int | 0 | |
+| 5 | constantB | int | 0 | |
+| 6 | constantC | int | 0 | |
+| 7 | constantM | int | 0 | |
+| 8 | constantN | int | 0 | |
+| 9 | constantK | int | 0 | |
+| 10 | constant_broadcast_type_C | int | 0 | |
+| 11 | output_N1M | int | 0 | |
+| 12 | output_elempack | int | 0 | |
+| 13 | output_elemtype | int | 0 | |
+| 14 | output_transpose | int | 0 | |
+| 18 | int8_scale_term | int | 0 | |
+| 20 | constant_TILE_M | int | 0 | |
+| 21 | constant_TILE_N | int | 0 | |
+| 22 | constant_TILE_K | int | 0 | |
+
+| weight | type | shape |
+| ------------------ | --------------- | -------------------------------------------- |
+| A_data | float/fp16/int8 | [M, K] or [K, M] |
+| B_data | float/fp16/int8 | [N, K] or [K, N] |
+| C_data | float | [1], [M] or [N] or [1, M] or [N,1] or [N, M] |
+| A_data_int8_scales | float | [M] |
+| B_data_int8_scales | float | [1] |

# GridSample
+
```
Given an input and a flow-field grid, computes the output using input values and pixel locations from grid.

-For each output location output[:, h2, w2], the size-2 vector grid[h2, w2, 2] specifies input pixel[:, h1, w1] locations x and y,
+For each output location output[:, h2, w2], the size-2 vector grid[h2, w2, 2] specifies input pixel[:, h1, w1] locations x and y,
which are used to interpolate the output value output[:, h2, w2]

-This function is often used in conjunction with affine_grid() to build Spatial Transformer Networks .
+This function is often used in conjunction with affine_grid() to build Spatial Transformer Networks.
```

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | sample_type | int | 1 | |
-| 1 | padding_mode | int | 1 | |
-| 2 | align_corner | int | 0 | |
-| 3 | permute_fusion| int | 0 | fuse with permute |
-
+| param id | name | type | default | description |
+| -------- | -------------- | ---- | ------- | ----------------- |
+| 0 | sample_type | int | 1 | |
+| 1 | padding_mode | int | 1 | |
+| 2 | align_corner | int | 0 | |
+| 3 | permute_fusion | int | 0 | fuse with permute |

Sample type:
+
- 1 = Nearest
- 2 = Bilinear
- 3 = Bicubic

Padding mode:
+
- 1 = zeros
- 2 = border
- 3 = reflection
-
# GroupNorm
+
```
split x along channel axis into group x0, x1 ...
-l2 normalize for each group x0, x1 ...
+normalize each group x0, x1 ... to zero mean and unit variance
y = x * gamma + beta
```
-* one_blob_only
-* support_inplace
+- one_blob_only
+- support_inplace

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | group | int | 1 | |
-| 1 | channels | int | 0 | |
-| 2 | eps | float | 0.001f | x = x / sqrt(var + eps) |
-| 3 | affine | int | 1 | |
+| param id | name | type | default | description |
+| -------- | -------- | ----- | ------- | ----------------------- |
+| 0 | group | int | 1 | |
+| 1 | channels | int | 0 | |
+| 2 | eps | float | 0.001f | x = x / sqrt(var + eps) |
+| 3 | affine | int | 1 | |

-| weight | type | shape |
-| ------------- | ----- | --------------------- |
-| gamma_data | float | [channels] |
-| beta_data | float | [channels] |
+| weight | type | shape |
+| ---------- | ----- | ---------- |
+| gamma_data | float | [channels] |
+| beta_data | float | [channels] |

# GRU
+
Apply a single-layer GRU to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`.
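+
+For shape orientation, here is a hedged sketch of feeding such a sequence blob through ncnn's public API; the blob names "input" and "output" are placeholders for whatever the param file declares. The layer's pseudocode follows after the sketch.
+
+```
+// illustrative only: build the [w=input_size, h=T] sequence blob and run it
+#include "net.h" // ncnn
+
+ncnn::Mat run_sequence(ncnn::Net& net, int input_size, int T)
+{
+    ncnn::Mat in(input_size, T); // one row of input_size features per timestep
+    in.fill(0.f);                // fill with real features in practice
+
+    ncnn::Extractor ex = net.create_extractor();
+    ex.input("input", in);
+
+    ncnn::Mat out; // comes back as [w=num_output, h=T]
+    ex.extract("output", out);
+    return out;
+}
+```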
``` @@ -1016,134 +1068,143 @@ y = gru(x) y0, hidden y1 = gru(x0, hidden x1) ``` -* one_blob_only if bidirectional +- one_blob_only if bidirectional -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | hidden size of output | -| 1 | weight_data_size| int | 0 | total size of weight matrix | -| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ------- | ------------------------------------- | +| 0 | num_output | int | 0 | hidden size of output | +| 1 | weight_data_size | int | 0 | total size of weight matrix | +| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, num_output * 3, num_directions] | -| bias_c_data | float/fp16/int8 | [num_output, 4, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, num_output * 3, num_directions] | +| weight | type | shape | +| -------------- | --------------- | -------------------------------------------- | +| weight_xc_data | float/fp16/int8 | [input_size, num_output * 3, num_directions] | +| bias_c_data | float/fp16/int8 | [num_output, 4, num_directions] | +| weight_hc_data | float/fp16/int8 | [num_output, num_output * 3, num_directions] | Direction flag: + - 0 = forward only - 1 = reverse only - 2 = bidirectional # HardSigmoid + ``` y = clamp(x * alpha + beta, 0, 1) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 0.2f | | -| 1 | beta | float | 0.5f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 0.2f | | +| 1 | beta | float | 0.5f | | # HardSwish + ``` y = x * clamp(x * alpha + beta, 0, 1) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 0.2f | | -| 1 | beta | float | 0.5f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 0.2f | | +| 1 | beta | float | 0.5f | | # InnerProduct + ``` x2 = innerproduct(x, weight) + bias y = activation(x2, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | bias_term | int | 0 | | -| 2 | weight_data_size| int | 0 | | -| 8 | int8_scale_term| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | +- one_blob_only -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [num_input, num_output] | -| bias_data | float | [num_output] | -| weight_data_int8_scales| float | [num_output] | -| bottom_blob_int8_scales| float | [1] | +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | bias_term | int | 0 | | +| 2 | weight_data_size | int | 0 | | +| 8 | int8_scale_term | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | 
activation_params | array | [ ] | | + +| weight | type | shape | +| ----------------------- | --------------- | ----------------------- | +| weight_data | float/fp16/int8 | [num_input, num_output] | +| bias_data | float | [num_output] | +| weight_data_int8_scales | float | [num_output] | +| bottom_blob_int8_scales | float | [1] | # Input + ``` y = input ``` -* support_inplace +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | w | int | 0 | | -| 1 | h | int | 0 | | -| 11 | d | int | 0 | | -| 2 | c | int | 0 | | +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | w | int | 0 | | +| 1 | h | int | 0 | | +| 11 | d | int | 0 | | +| 2 | c | int | 0 | | # InstanceNorm + ``` split x along channel axis into instance x0, x1 ... l2 normalize for each channel instance x0, x1 ... y = x * gamma + beta ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | channels | int | 0 | | -| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 2 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------------------- | +| 0 | channels | int | 0 | | +| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 2 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [channels] | -| beta_data | float | [channels] | +| weight | type | shape | +| ---------- | ----- | ---------- | +| gamma_data | float | [channels] | +| beta_data | float | [channels] | # Interp + ``` if dynamic_target_size == 0 y = resize(x) by fixed size or scale else y = resize(x0, size(x1)) ``` -* one_blob_only if dynamic_target_size == 0 +- one_blob_only if dynamic_target_size == 0 -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | resize_type | int | 0 | | -| 1 | height_scale | float | 1.f | | -| 2 | width_scale | float | 1.f | | -| 3 | output_height | int | 0 | | -| 4 | output_width | int | 0 | | -| 5 | dynamic_target_size| int | 0 | | -| 6 | align_corner | int | 0 | | +| param id | name | type | default | description | +| -------- | ------------------- | ----- | ------- | ----------- | +| 0 | resize_type | int | 0 | | +| 1 | height_scale | float | 1.f | | +| 2 | width_scale | float | 1.f | | +| 3 | output_height | int | 0 | | +| 4 | output_width | int | 0 | | +| 5 | dynamic_target_size | int | 0 | | +| 6 | align_corner | int | 0 | | Resize type: + - 1 = Nearest - 2 = Bilinear - 3 = Bicubic # InverseSpectrogram + ``` x1 = x as complex x1 = x1 * sqrt(norm) if normalized @@ -1155,77 +1216,82 @@ if returns == 1 return y1 real if returns == 2 return y1 imag ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | n_fft | int | 0 | | -| 1 | returns | int | 1 | | -| 2 | hoplen | int | n_fft / 4 | | -| 3 | winlen | int | n_fft | | -| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | -| 5 | center | int | 1 | | -| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | +| param id | name | type | default | description | +| -------- | ----------- | ---- | --------- | ------------------------------- | +| 0 | n_fft | int | 0 
| | +| 1 | returns | int | 1 | | +| 2 | hoplen | int | n_fft / 4 | | +| 3 | winlen | int | n_fft | | +| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | +| 5 | center | int | 1 | | +| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | # LayerNorm + ``` split x along outmost axis into part x0, x1 ... l2 normalize for each part x0, x1 ... y = x * gamma + beta by elementwise ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | affine_size | int | 0 | | -| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 2 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | ----------------------- | +| 0 | affine_size | int | 0 | | +| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 2 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [affine_size] | -| beta_data | float | [affine_size] | +| weight | type | shape | +| ---------- | ----- | ------------- | +| gamma_data | float | [affine_size] | +| beta_data | float | [affine_size] | # Log + ``` if base == -1 y = log(shift + x * scale) else y = log(shift + x * scale) / log(base) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | base | float | -1.f | | -| 1 | scale | float | 1.f | | -| 2 | shift | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | base | float | -1.f | | +| 1 | scale | float | 1.f | | +| 2 | shift | float | 0.f | | # LRN + ``` if region_type == ACROSS_CHANNELS square_sum = sum of channel window of local_size if region_type == WITHIN_CHANNEL square_sum = sum of spatial window of local_size y = x * pow(bias + alpha * square_sum / (local_size * local_size), -beta) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | region_type | int | 0 | | -| 1 | local_size | int | 5 | | -| 2 | alpha | float | 1.f | | -| 3 | beta | float | 0.75f | | -| 4 | bias | float | 1.f | | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | ----------- | +| 0 | region_type | int | 0 | | +| 1 | local_size | int | 5 | | +| 2 | alpha | float | 1.f | | +| 3 | beta | float | 0.75f | | +| 4 | bias | float | 1.f | | Region type: + - 0 = ACROSS_CHANNELS - 1 = WITHIN_CHANNEL # LSTM + Apply a single-layer LSTM to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. 
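
As an illustration, a minimal sketch of driving the stateful form with explicit initial hidden/cell blobs; the blob names, the state shapes and the constants below are assumptions, not part of this reference.

```
#include "net.h"

int run_lstm_example(ncnn::Net& net, const ncnn::Mat& x)
{
    const int num_output = 16;    // assumed, must match param 0
    const int hidden_size = 16;   // assumed, param 3 (defaults to num_output)
    const int num_directions = 1; // 2 only when direction=2

    // zero-initialized states; shapes are an assumption based on the weight layout
    ncnn::Mat hidden(num_output, num_directions);
    ncnn::Mat cell(hidden_size, num_directions);
    hidden.fill(0.f);
    cell.fill(0.f);

    ncnn::Extractor ex = net.create_extractor();
    ex.input("in0", x);      // x is [w=input_size, h=T]; blob names are hypothetical
    ex.input("in1", hidden);
    ex.input("in2", cell);

    ncnn::Mat y;
    ex.extract("out0", y); // y is [w=num_output, h=T]
    return 0;
}
```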
``` @@ -1233,53 +1299,57 @@ y = lstm(x) y0, hidden y1, cell y2 = lstm(x0, hidden x1, cell x2) ``` -* one_blob_only if bidirectional +- one_blob_only if bidirectional -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | output size of output | -| 1 | weight_data_size| int | 0 | total size of IFOG weight matrix | -| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | -| 3 | hidden_size | int | num_output| hidden size | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ---------- | ------------------------------------- | +| 0 | num_output | int | 0 | output size of output | +| 1 | weight_data_size | int | 0 | total size of IFOG weight matrix | +| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| 3 | hidden_size | int | num_output | hidden size | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, hidden_size * 4, num_directions] | -| bias_c_data | float/fp16/int8 | [hidden_size, 4, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, hidden_size * 4, num_directions] | -| weight_hr_data| float/fp16/int8 | [hidden_size, num_output, num_directions] | +| weight | type | shape | +| -------------- | --------------- | --------------------------------------------- | +| weight_xc_data | float/fp16/int8 | [input_size, hidden_size * 4, num_directions] | +| bias_c_data | float/fp16/int8 | [hidden_size, 4, num_directions] | +| weight_hc_data | float/fp16/int8 | [num_output, hidden_size * 4, num_directions] | +| weight_hr_data | float/fp16/int8 | [hidden_size, num_output, num_directions] | Direction flag: + - 0 = forward only - 1 = reverse only - 2 = bidirectional # MemoryData + ``` y = data ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | w | int | 0 | | -| 1 | h | int | 0 | | -| 11 | d | int | 0 | | -| 2 | c | int | 0 | | -| 21 | load_type | int | 1 | 1=fp32 | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------- | +| 0 | w | int | 0 | | +| 1 | h | int | 0 | | +| 11 | d | int | 0 | | +| 2 | c | int | 0 | | +| 21 | load_type | int | 1 | 1=fp32 | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| data | float | [w, h, d, c] | +| weight | type | shape | +| ------ | ----- | ------------ | +| data | float | [w, h, d, c] | # Mish + ``` y = x * tanh(log(exp(x) + 1)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # MultiHeadAttention + ``` split q k v into num_head part q0, k0, v0, q1, k1, v1 ... 
for each num_head part @@ -1294,33 +1364,34 @@ for each num_head part y = affine(out) ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | embed_dim | int | 0 | | -| 1 | num_heads | int | 1 | | -| 2 | weight_data_size| int | 0 | qdim = weight_data_size / embed_dim | -| 3 | kdim | int | embed_dim | | -| 4 | vdim | int | embed_dim | | -| 5 | attn_mask | int | 0 | | -| 6 | scale | float | 1.f / sqrt(embed_dim / num_heads) | | -| 18 | int8_scale_term | int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| q_weight_data | float/fp16/int8 | [embed_dim * qdim] | -| q_bias_data | float | [embed_dim] | -| k_weight_data | float/fp16/int8 | [embed_dim * kdim] | -| k_bias_data | float | [embed_dim] | -| v_weight_data | float/fp16/int8 | [embed_dim * vdim] | -| v_bias_data | float | [embed_dim] | -| out_weight_data| float/fp16/int8 | [qdim * embed_dim] | -| out_bias_data | float | [qdim] | -| q_weight_data_int8_scales| float | [embed_dim] | -| k_weight_data_int8_scales| float | [embed_dim] | -| v_weight_data_int8_scales| float | [embed_dim] | -| out_weight_data_int8_scales| float | [1] | +| param id | name | type | default | description | +| -------- | ---------------- | ----- | --------------------------------- | ----------------------------------- | +| 0 | embed_dim | int | 0 | | +| 1 | num_heads | int | 1 | | +| 2 | weight_data_size | int | 0 | qdim = weight_data_size / embed_dim | +| 3 | kdim | int | embed_dim | | +| 4 | vdim | int | embed_dim | | +| 5 | attn_mask | int | 0 | | +| 6 | scale | float | 1.f / sqrt(embed_dim / num_heads) | | +| 18 | int8_scale_term | int | 0 | | + +| weight | type | shape | +| --------------------------- | --------------- | ------------------ | +| q_weight_data | float/fp16/int8 | [embed_dim * qdim] | +| q_bias_data | float | [embed_dim] | +| k_weight_data | float/fp16/int8 | [embed_dim * kdim] | +| k_bias_data | float | [embed_dim] | +| v_weight_data | float/fp16/int8 | [embed_dim * vdim] | +| v_bias_data | float | [embed_dim] | +| out_weight_data | float/fp16/int8 | [qdim * embed_dim] | +| out_bias_data | float | [qdim] | +| q_weight_data_int8_scales | float | [embed_dim] | +| k_weight_data_int8_scales | float | [embed_dim] | +| v_weight_data_int8_scales | float | [embed_dim] | +| out_weight_data_int8_scales | float | [1] | # MVN + ``` if normalize_variance == 1 && across_channels == 1 y = (x - mean) / (sqrt(var) + eps) of whole blob if normalize_variance == 1 && across_channels == 0 y = (x - mean) / (sqrt(var) + eps) of each channel @@ -1328,20 +1399,22 @@ if normalize_variance == 0 && across_channels == 1 y = x - mean of whole bl if normalize_variance == 0 && across_channels == 0 y = x - mean of each channel ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | normalize_variance| int | 0 | | -| 1 | across_channels| int | 0 | | -| 2 | eps | float | 0.0001f | x = x / (sqrt(var) + eps) | +| param id | name | type | default | description | +| -------- | ------------------ | ----- | ------- | ------------------------- | +| 0 | normalize_variance | int | 0 | | +| 1 | across_channels | int | 0 | | +| 2 | eps | float | 0.0001f | x = x / (sqrt(var) + eps) | # Noop + ``` y = x ``` # Normalize + ``` if across_spatial == 1 && across_channel == 1 x2 = normalize(x) of whole blob if across_spatial == 1 && across_channel == 0 x2 = normalize(x) of 
each channel @@ -1349,79 +1422,85 @@ if across_spatial == 0 && across_channel == 1 x2 = normalize(x) of each pos y = x2 * scale ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | across_spatial| int | 0 | | -| 1 | channel_shared| int | 0 | | -| 2 | eps | float | 0.0001f | see eps mode | -| 3 | scale_data_size| int | 0 | | -| 4 | across_channel| int | 0 | | -| 9 | eps_mode | int | 0 | | +| param id | name | type | default | description | +| -------- | --------------- | ----- | ------- | ------------ | +| 0 | across_spatial | int | 0 | | +| 1 | channel_shared | int | 0 | | +| 2 | eps | float | 0.0001f | see eps mode | +| 3 | scale_data_size | int | 0 | | +| 4 | across_channel | int | 0 | | +| 9 | eps_mode | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | [scale_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | Eps Mode: -- 0 = caffe/mxnet x = x / sqrt(var + eps) -- 1 = pytorch x = x / max(sqrt(var), eps) -- 2 = tensorflow x = x / sqrt(max(var, eps)) + +- 0 = caffe/mxnet x = x / sqrt(var + eps) +- 1 = pytorch x = x / max(sqrt(var), eps) +- 2 = tensorflow x = x / sqrt(max(var, eps)) # Packing + ``` y = wrap_packing(x) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | out_elempack | int | 1 | | -| 1 | use_padding | int | 0 | | -| 2 | cast_type_from| int | 0 | | -| 3 | cast_type_to | int | 0 | | -| 4 | storage_type_from| int | 0 | | -| 5 | storage_type_to| int | 0 | | +| param id | name | type | default | description | +| -------- | ----------------- | ---- | ------- | ----------- | +| 0 | out_elempack | int | 1 | | +| 1 | use_padding | int | 0 | | +| 2 | cast_type_from | int | 0 | | +| 3 | cast_type_to | int | 0 | | +| 4 | storage_type_from | int | 0 | | +| 5 | storage_type_to | int | 0 | | # Padding + ``` y = pad(x, pads) ``` -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | top | int | 0 | | -| 1 | bottom | int | 0 | | -| 2 | left | int | 0 | | -| 3 | right | int | 0 | | -| 4 | type | int | 0 | | -| 5 | value | float | 0 | | -| 6 | per_channel_pad_data_size| int | 0 | | -| 7 | front | int | stride_w | | -| 8 | behind | int | pad_left | | +| param id | name | type | default | description | +| -------- | ------------------------- | ----- | -------- | ----------- | +| 0 | top | int | 0 | | +| 1 | bottom | int | 0 | | +| 2 | left | int | 0 | | +| 3 | right | int | 0 | | +| 4 | type | int | 0 | | +| 5 | value | float | 0 | | +| 6 | per_channel_pad_data_size | int | 0 | | +| 7 | front | int | stride_w | | +| 8 | behind | int | pad_left | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| per_channel_pad_data| float | [per_channel_pad_data_size] | +| weight | type | shape | +| -------------------- | ----- | --------------------------- | +| per_channel_pad_data | float | [per_channel_pad_data_size] | Padding type: + - 0 = CONSTANT - 1 = REPLICATE - 2 = REFLECT # Permute + ``` y = reorder(x) ``` -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | order_type | int | 0 | | +| param id | name | type | default | 
description | +| -------- | ---------- | ---- | ------- | ----------- | +| 0 | order_type | int | 0 | | Order Type: + - 0 = WH WHC WHDC - 1 = HW HWC HWDC - 2 = WCH WDHC @@ -1448,183 +1527,198 @@ Order Type: - 23 = CDHW # PixelShuffle + ``` if mode == 0 y = depth_to_space(x) where x channel order is sw-sh-outc if mode == 1 y = depth_to_space(x) where x channel order is outc-sw-sh ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | upscale_factor| int | 1 | | -| 1 | mode | int | 0 | | +| param id | name | type | default | description | +| -------- | -------------- | ---- | ------- | ----------- | +| 0 | upscale_factor | int | 1 | | +| 1 | mode | int | 0 | | # Pooling + ``` x2 = pad(x, pads) x3 = pooling(x2, kernel, stride) ``` -| param id | name | type | default | description | -| --------- | --------------| ---- | --------- | ----------------- | -| 0 | pooling_type | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | stride_w | int | 1 | | -| 3 | pad_left | int | 0 | | -| 4 | global_pooling| int | 0 | | -| 5 | pad_mode | int | 0 | | -| 6 | avgpool_count_include_pad| int | 0 | | -| 7 | adaptive_pooling| int | 0 | | -| 8 | out_w | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | stride_h | int | stride_w | | -| 13 | pad_top | int | pad_left | | -| 14 | pad_right | int | pad_left | | -| 15 | pad_bottom | int | pad_top | | -| 18 | out_h | int | out_w | | +| param id | name | type | default | description | +| -------- | ------------------------- | ---- | -------- | ----------- | +| 0 | pooling_type | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | stride_w | int | 1 | | +| 3 | pad_left | int | 0 | | +| 4 | global_pooling | int | 0 | | +| 5 | pad_mode | int | 0 | | +| 6 | avgpool_count_include_pad | int | 0 | | +| 7 | adaptive_pooling | int | 0 | | +| 8 | out_w | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | stride_h | int | stride_w | | +| 13 | pad_top | int | pad_left | | +| 14 | pad_right | int | pad_left | | +| 15 | pad_bottom | int | pad_top | | +| 18 | out_h | int | out_w | | Pooling type: + - 0 = MAX - 1 = AVG Pad mode: + - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Pooling1D + ``` x2 = pad(x, pads) x3 = pooling1d(x2, kernel, stride) ``` -| param id | name | type | default | description | -| --------- | --------------| ---- | --------- | ----------------- | -| 0 | pooling_type | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | stride_w | int | 1 | | -| 3 | pad_left | int | 0 | | -| 4 | global_pooling| int | 0 | | -| 5 | pad_mode | int | 0 | | -| 6 | avgpool_count_include_pad| int | 0 | | -| 7 | adaptive_pooling| int | 0 | | -| 8 | out_w | int | 0 | | -| 14 | pad_right | int | pad_left | | +| param id | name | type | default | description | +| -------- | ------------------------- | ---- | -------- | ----------- | +| 0 | pooling_type | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | stride_w | int | 1 | | +| 3 | pad_left | int | 0 | | +| 4 | global_pooling | int | 0 | | +| 5 | pad_mode | int | 0 | | +| 6 | avgpool_count_include_pad | int | 0 | | +| 7 | adaptive_pooling | int | 0 | | +| 8 | out_w | int | 0 | | +| 14 | pad_right | int | pad_left | | Pooling type: + - 0 = MAX - 1 = AVG Pad mode: + - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Pooling3D + ``` x2 = pad(x, pads) x3 = pooling3d(x2, 
kernel, stride) ``` -| param id | name | type | default | description | -| --------- | --------------| ---- | --------- | ----------------- | -| 0 | pooling_type | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | stride_w | int | 1 | | -| 3 | pad_left | int | 0 | | -| 4 | global_pooling| int | 0 | | -| 5 | pad_mode | int | 0 | | -| 6 | avgpool_count_include_pad| int | 0 | | -| 7 | adaptive_pooling| int | 0 | | -| 8 | out_w | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | stride_h | int | stride_w | | -| 13 | pad_top | int | pad_left | | -| 14 | pad_right | int | pad_left | | -| 15 | pad_bottom | int | pad_top | | -| 16 | pad_behind | int | pad_front | | -| 18 | out_h | int | out_w | | -| 21 | kernel_d | int | kernel_w | | -| 22 | stride_d | int | stride_w | | -| 23 | pad_front | int | pad_left | | -| 28 | out_d | int | out_w | | +| param id | name | type | default | description | +| -------- | ------------------------- | ---- | --------- | ----------- | +| 0 | pooling_type | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | stride_w | int | 1 | | +| 3 | pad_left | int | 0 | | +| 4 | global_pooling | int | 0 | | +| 5 | pad_mode | int | 0 | | +| 6 | avgpool_count_include_pad | int | 0 | | +| 7 | adaptive_pooling | int | 0 | | +| 8 | out_w | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | stride_h | int | stride_w | | +| 13 | pad_top | int | pad_left | | +| 14 | pad_right | int | pad_left | | +| 15 | pad_bottom | int | pad_top | | +| 16 | pad_behind | int | pad_front | | +| 18 | out_h | int | out_w | | +| 21 | kernel_d | int | kernel_w | | +| 22 | stride_d | int | stride_w | | +| 23 | pad_front | int | pad_left | | +| 28 | out_d | int | out_w | | Pooling type: + - 0 = MAX - 1 = AVG Pad mode: + - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Power + ``` y = pow((shift + x * scale), power) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | power | float | 1.f | | -| 1 | scale | float | 1.f | | -| 2 | shift | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | power | float | 1.f | | +| 1 | scale | float | 1.f | | +| 2 | shift | float | 0.f | | # PReLU + ``` if x < 0 y = x * slope else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_slope | int | 0 | | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------- | +| 0 | num_slope | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| slope_data | float | [num_slope] | +| weight | type | shape | +| ---------- | ----- | ----------- | +| slope_data | float | [num_slope] | # Quantize + ``` y = float2int8(x * scale) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_data_size| int | 1 | | +| param id | name | type | default | description | +| -------- | --------------- | ---- | ------- | ----------- | +| 0 | scale_data_size | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | 
[scale_data_size] |
+| weight     | type  | shape             |
+| ---------- | ----- | ----------------- |
+| scale_data | float | [scale_data_size] |

# Reduction
+
```
y = reduce_op(x * coeff)
```

-* one_blob_only
+- one_blob_only

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | operation | int | 0 | |
-| 1 | reduce_all | int | 1 | |
-| 2 | coeff | float | 1.f | |
-| 3 | axes | array | [ ] | |
-| 4 | keepdims | int | 0 | |
-| 5 | fixbug0 | int | 0 | hack for bug fix, should be 1 |
+| param id | name       | type  | default | description                   |
+| -------- | ---------- | ----- | ------- | ----------------------------- |
+| 0        | operation  | int   | 0       |                               |
+| 1        | reduce_all | int   | 1       |                               |
+| 2        | coeff      | float | 1.f     |                               |
+| 3        | axes       | array | [ ]     |                               |
+| 4        | keepdims   | int   | 0       |                               |
+| 5        | fixbug0    | int   | 0       | hack for bug fix, should be 1 |

Operation type:
+
- 0 = SUM
- 1 = ASUM
- 2 = SUMSQ
@@ -1638,96 +1732,103 @@ Operation type:
- 10 = LogSumExp

# ReLU
+
```
if x < 0 y = x * slope
else y = x
```

-* one_blob_only
-* support_inplace
+- one_blob_only
+- support_inplace

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | slope | float | 0.f | |
+| param id | name  | type  | default | description |
+| -------- | ----- | ----- | ------- | ----------- |
+| 0        | slope | float | 0.f     |             |

# Reorg
+
```
if mode == 0 y = space_to_depth(x) where x channel order is sw-sh-outc
if mode == 1 y = space_to_depth(x) where x channel order is outc-sw-sh
```

-* one_blob_only
+- one_blob_only

-| param id | name | type | default | description |
-| --------- | ------------- | ---- | --------- | ----------------- |
-| 0 | stride | int | 1 | |
-| 1 | mode | int | 0 | |
+| param id | name   | type | default | description |
+| -------- | ------ | ---- | ------- | ----------- |
+| 0        | stride | int  | 1       |             |
+| 1        | mode   | int  | 0       |             |

# Requantize
+
```
x2 = x * scale_in + bias
x3 = activation(x2)
y = float2int8(x3 * scale_out)
```

-* one_blob_only
+- one_blob_only

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | scale_in_data_size| int | 1 | |
-| 1 | scale_out_data_size| int | 1 | |
-| 2 | bias_data_size| int | 0 | |
-| 3 | activation_type| int | 0 | |
-| 4 | activation_params| int | [ ] | |
+| param id | name                | type  | default | description |
+| -------- | ------------------- | ----- | ------- | ----------- |
+| 0        | scale_in_data_size  | int   | 1       |             |
+| 1        | scale_out_data_size | int   | 1       |             |
+| 2        | bias_data_size      | int   | 0       |             |
+| 3        | activation_type     | int   | 0       |             |
+| 4        | activation_params   | array | [ ]     |             |

-| weight | type | shape |
-| ------------- | ----- | --------------------- |
-| scale_in_data | float | [scale_in_data_size] |
-| scale_out_data| float | [scale_out_data_size] |
-| bias_data | float | [bias_data_size] |
+| weight         | type  | shape                 |
+| -------------- | ----- | --------------------- |
+| scale_in_data  | float | [scale_in_data_size]  |
+| scale_out_data | float | [scale_out_data_size] |
+| bias_data      | float | [bias_data_size]      |

# Reshape
+
```
if permute == 1 y = hwc2chw(reshape(chw2hwc(x)))
else y = reshape(x)
```

-* one_blob_only
+- one_blob_only

-| param id | name | type | default | description |
-| --------- | ------------- | ----- | --------- | ----------------- |
-| 0 | w | int | -233 | |
-| 1 | h | int | -233 | |
-| 11 | d | int | -233 | |
-| 2 | c | int | -233 | |
-| 3 | permute | int | 0 | |
+| param id | name | 
type | default | description | +| -------- | ------- | ---- | ------- | ----------- | +| 0 | w | int | -233 | | +| 1 | h | int | -233 | | +| 11 | d | int | -233 | | +| 2 | c | int | -233 | | +| 3 | permute | int | 0 | | Reshape flag: + - 0 = copy from bottom - -1 = remaining - -233 = drop this dim(default) # RMSNorm + ``` split x along outmost axis into part x0, x1 ... root mean square normalize for each part x0, x1 ... y = x * gamma by elementwise ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | affine_size | int | 0 | | -| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 2 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | ----------------------- | +| 0 | affine_size | int | 0 | | +| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 2 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [affine_size] | +| weight | type | shape | +| ---------- | ----- | ------------- | +| gamma_data | float | [affine_size] | # RNN + Apply a single-layer RNN to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. ``` @@ -1735,127 +1836,137 @@ y = rnn(x) y0, hidden y1 = rnn(x0, hidden x1) ``` -* one_blob_only if bidirectional +- one_blob_only if bidirectional -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | hidden size of output | -| 1 | weight_data_size| int | 0 | total size of weight matrix | -| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ------- | ------------------------------------- | +| 0 | num_output | int | 0 | hidden size of output | +| 1 | weight_data_size | int | 0 | total size of weight matrix | +| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, num_output, num_directions] | -| bias_c_data | float/fp16/int8 | [num_output, 1, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, num_output, num_directions] | +| weight | type | shape | +| -------------- | --------------- | ---------------------------------------- | +| weight_xc_data | float/fp16/int8 | [input_size, num_output, num_directions] | +| bias_c_data | float/fp16/int8 | [num_output, 1, num_directions] | +| weight_hc_data | float/fp16/int8 | [num_output, num_output, num_directions] | Direction flag: + - 0 = forward only - 1 = reverse only - 2 = bidirectional # Scale + ``` if scale_data_size == -233 y = x0 * x1 else y = x * scale + bias ``` -* one_blob_only if scale_data_size != -233 -* support_inplace +- one_blob_only if scale_data_size != -233 +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_data_size| int | 0 | | -| 1 | bias_term | int | 0 | | +| param id | name | type | default | description | +| -------- | --------------- | ---- | ------- | ----------- | +| 0 | scale_data_size | int | 0 | | +| 1 | bias_term | int | 0 | | -| weight | type | shape | -| ------------- | ----- | 
--------------------- | -| scale_data | float | [scale_data_size] | -| bias_data | float | [scale_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | +| bias_data | float | [scale_data_size] | # SELU + ``` if x < 0 y = (exp(x) - 1.f) * alpha * lambda else y = x * lambda ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 1.67326324f| | -| 1 | lambda | float | 1.050700987f| | +| param id | name | type | default | description | +| -------- | ------ | ----- | ------------ | ----------- | +| 0 | alpha | float | 1.67326324f | | +| 1 | lambda | float | 1.050700987f | | # Shrink + ``` if x < -lambd y = x + bias if x > lambd y = x - bias else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | bias | float | 0.0f | | -| 1 | lambd | float | 0.5f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | bias | float | 0.0f | | +| 1 | lambd | float | 0.5f | | # ShuffleChannel + ``` if reverse == 0 y = shufflechannel(x) by group if reverse == 1 y = shufflechannel(x) by channel / group ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | group | int | 1 | | -| 1 | reverse | int | 0 | | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ----------- | +| 0 | group | int | 1 | | +| 1 | reverse | int | 0 | | # Sigmoid + ``` y = 1 / (1 + exp(-x)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Slice + ``` split x along axis into slices, each part slice size is based on slices array ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | slices | array | [ ] | | -| 1 | axis | int | 0 | | -| 2 | indices | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | ----------- | +| 0 | slices | array | [ ] | | +| 1 | axis | int | 0 | | +| 2 | indices | array | [ ] | | # Softmax + ``` softmax(x, axis) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | -| 1 | fixbug0 | int | 0 | hack for bug fix, should be 1 | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ----------------------------- | +| 0 | axis | int | 0 | | +| 1 | fixbug0 | int | 0 | hack for bug fix, should be 1 | # Softplus + ``` y = log(exp(x) + 1) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Spectrogram + ``` x1 = pad(x) if center y = stft(x1) @@ -1866,68 +1977,74 @@ if power == 1 return magnitude if power == 2 return square of magnitude ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | n_fft | int | 0 | | -| 1 | power | int | 0 | | -| 2 | hoplen | int | n_fft / 4 | | -| 3 | winlen | int | n_fft | | -| 4 | window_type | int 
| 0 | 0=ones 1=hann 2=hamming | -| 5 | center | int | 1 | | -| 6 | pad_type | int | 2 | 0=CONSTANT 1=REPLICATE 2=REFLECT | -| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | -| 8 | onesided | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ---- | --------- | -------------------------------- | +| 0 | n_fft | int | 0 | | +| 1 | power | int | 0 | | +| 2 | hoplen | int | n_fft / 4 | | +| 3 | winlen | int | n_fft | | +| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | +| 5 | center | int | 1 | | +| 6 | pad_type | int | 2 | 0=CONSTANT 1=REPLICATE 2=REFLECT | +| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | +| 8 | onesided | int | 1 | | # Split + ``` y0, y1 ... = x ``` # Swish + ``` y = x / (1 + exp(-x)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # TanH + ``` y = tanh(x) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Threshold + ``` if x > threshold y = 1 else y = 0 ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | threshold | float | 0.f | | +| param id | name | type | default | description | +| -------- | --------- | ----- | ------- | ----------- | +| 0 | threshold | float | 0.f | | # Tile + ``` y = repeat tiles along axis for x ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | -| 1 | tiles | int | 1 | | -| 2 | repeats | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | ----------- | +| 0 | axis | int | 0 | | +| 1 | tiles | int | 1 | | +| 2 | repeats | array | [ ] | | # UnaryOp + ``` y = unaryop(x) ``` @@ -1935,11 +2052,12 @@ y = unaryop(x) - one_blob_only - support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | op_type | int | 0 | Operation type as follows | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ------------------------- | +| 0 | op_type | int | 0 | Operation type as follows | Operation type: + - 0 = ABS - 1 = NEG - 2 = FLOOR @@ -1962,22 +2080,23 @@ Operation type: - 19 = TRUNC # Unfold + ``` y = unfold(x) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ---------- | ---- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | diff --git 
a/src/CMakeLists.txt b/src/CMakeLists.txt
index c97235d97a0..60f24361d8b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -169,6 +169,7 @@ ncnn_add_layer(Shrink)
ncnn_add_layer(RMSNorm)
ncnn_add_layer(Spectrogram)
ncnn_add_layer(InverseSpectrogram)
+ncnn_add_layer(Flip)

if(NCNN_VULKAN)
    ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp)
diff --git a/src/layer/flip.cpp b/src/layer/flip.cpp
new file mode 100644
index 00000000000..ae191c4ed58
--- /dev/null
+++ b/src/layer/flip.cpp
@@ -0,0 +1,588 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "flip.h"
+
+namespace ncnn {
+
+Flip::Flip()
+{
+    one_blob_only = true;
+}
+
+int Flip::load_param(const ParamDict& pd)
+{
+    axis = pd.get(0, Mat());
+    // debug
+    // const int *axis_ptr = axis;
+    // printf("axis_len = %d\n", axis.w);
+    // printf("axis[0] = %d\n", axis_ptr[0]);
+    return 0;
+}
+
+int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    // input geometry
+    int dims = bottom_blob.dims;
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int d = bottom_blob.d;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+
+    // validate the axis parameter
+    if (axis.w > 4)
+    {
+        return -1;
+    }
+    const int* axis_ptr = axis;
+
+    if (dims == 1)
+    {
+        // 1D has only one case
+        top_blob.create(w, elemsize, opt.blob_allocator);
+        const float* ptr = bottom_blob;
+        float* outptr = top_blob;
+        for (int i = 0; i < w; i++)
+        {
+            outptr[i] = ptr[w - 1 - i];
+        }
+    }
+    else if (dims == 2)
+    {
+        // 2D has three cases: vertical, horizontal, and both [-2/0 flips vertically, -1/1 flips horizontally, combining them flips both ways]
+        top_blob.create(w, h, elemsize, opt.blob_allocator);
+        if (axis.w == 1)
+        {
+            if (axis_ptr[0] == -2 || axis_ptr[0] == 0)
+            {
+                // flip the row order (vertical)
+                for (int i = 0; i < h; i++)
+                {
+                    const float* ptr = bottom_blob.row(h - 1 - i); // start from the last row
+                    float* outptr = top_blob.row(i);               // write to the current row
+
+                    // copy the whole row directly
+                    memcpy(outptr, ptr, w * sizeof(float));
+                }
+            }
+            else
+            {
+                // flip the column order (horizontal)
+                for (int i = 0; i < h; i++)
+                {
+                    const float* ptr = bottom_blob.row(i);
+                    float* outptr = top_blob.row(i);
+
+                    // use a temporary buffer to hold the reversed row
+                    std::vector<float> line_buffer(w);
+                    for (int j = 0; j < w; j++)
+                    {
+                        line_buffer[j] = ptr[w - 1 - j];
+                    }
+
+                    // copy the whole row in one go
+                    memcpy(outptr, line_buffer.data(), w * sizeof(float));
+                }
+            }
+        }
+        else
+        {
+            // when axis.w == 2, flip both vertically and horizontally
+            for (int i = 0; i < h; i++)
+            {
+                const float* ptr = bottom_blob.row(h - 1 - i); // read from the last row
+                float* outptr = top_blob.row(i);               // write to the current row
+
+                // flip each row horizontally as well
+                for (int j = 0; j < w; j++)
+                {
+                    outptr[j] = ptr[w - 1 - j]; // read the row's pixels in reverse
+                }
+            }
+        }
+    }
+    else if (dims == 3)
+    {
+        top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
+        if (axis.w == 1)
+        {
+            // single axis: c, h or w
+            // normalize negative axes to non-negative to simplify the checks below
+            int axis0 = axis_ptr[0] < 0 ? 3 + axis_ptr[0] : axis_ptr[0];
+            if (axis0 == 0)
+            {
+                // -3/0 flip the channel order
+                for (int i = 0; i < channels; i++)
+                {
+                    for (int j = 0; j < h; j++)
+                    {
+                        const float* ptr = bottom_blob.channel(channels - 1 - i).row(j); // start from the last channel
+                        float* outptr = top_blob.channel(i).row(j);
+                        memcpy(outptr, ptr, w * sizeof(float));
+                    }
+                }
+            }
+            else if (axis0 == 1)
+            {
+                // -2/1 flip vertically within each channel
+                for (int i = 0; i < channels; i++)
+                {
+                    for (int j = 0; j < h; j++)
+                    {
+                        const float* ptr = bottom_blob.channel(i).row(h - 1 - j);
+                        float* outptr = top_blob.channel(i).row(j);
+                        memcpy(outptr, ptr, w * sizeof(float));
+                    }
+                }
+            }
+            else
+            {
+                // -1/2 flip horizontally within each channel
+                for (int i = 0; i < channels; i++)
+                {
+                    for (int j = 0; j < h; j++)
+                    {
+                        const float* ptr = bottom_blob.channel(i).row(j);
+                        float* outptr = top_blob.channel(i).row(j);
+                        for (int k = 0; k < w; k++)
+                        {
+                            outptr[k] = ptr[w - 1 - k];
+                        }
+                    }
+                }
+            }
+        }
+        else if (axis.w == 2)
+        {
+            // axis pair, identified by the sum of the normalized axes: ch=1, cw=2, hw=3
+            int axis0 = axis_ptr[0] < 0 ? 3 + axis_ptr[0] : axis_ptr[0];
+            int axis1 = axis_ptr[1] < 0 ? 3 + axis_ptr[1] : axis_ptr[1];
+            int axis_sum = axis0 + axis1;
+            if (axis_sum == 1)
+            {
+                // ch: flip channel order and rows
+                for (int i = 0; i < channels; i++)
+                {
+                    for (int j = 0; j < h; j++)
+                    {
+                        // combine the two flips: channel axis and row axis at once
+                        const float* ptr = bottom_blob.channel(channels - 1 - i).row(h - 1 - j);
+                        float* outptr = top_blob.channel(i).row(j);
+                        memcpy(outptr, ptr, w * sizeof(float));
+                    }
+                }
+            }
+            else if (axis_sum == 2)
+            {
+                // cw: flip channel order and columns
+                for (int i = 0; i < channels; i++)
+                {
+                    for (int j = 0; j < h; j++)
+                    {
+                        const float* ptr = bottom_blob.channel(channels - 1 - i).row(j);
+                        float* outptr = top_blob.channel(i).row(j);
+                        for (int k = 0; k < w; k++)
+                        {
+                            outptr[k] = ptr[w - 1 - k];
+                        }
+                    }
+                }
+            }
+            else if (axis_sum == 3)
+            {
+                // hw: flip rows and columns
+                for (int i = 0; i < channels; i++)
+                {
+                    for (int j = 0; j < h; j++)
+                    {
+                        const float* ptr = bottom_blob.channel(i).row(h - 1 - j);
+                        float* outptr = top_blob.channel(i).row(j);
+
+                        // additionally flip horizontally
+                        for (int k = 0; k < w; k++)
+                        {
+                            outptr[k] = ptr[w - 1 - k];
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            // chw: flip all three axes
+            for (int i = 0; i < channels; i++)
+            {
+                for (int j = 0; j < h; j++)
+                {
+                    const float* ptr = bottom_blob.channel(channels - 1 - i).row(h - 1 - j);
+                    float* outptr = top_blob.channel(i).row(j);
+
+                    // the horizontal flip completes the full reversal
+                    for (int k = 0; k < w; k++)
+                    {
+                        outptr[k] = ptr[w - 1 - k];
+                    }
+                }
+            }
+        }
+    }
+    else if (dims == 4)
+    {
+        top_blob.create(w, h, d, channels, elemsize, opt.blob_allocator);
+        if (axis.w == 1)
+        {
+            // single axis, normalized: 0=c, 1=d, 2=h, 3=w
+            int axis0 = axis_ptr[0] < 0 ? 4 + axis_ptr[0] : axis_ptr[0];
+            if (axis0 == 0)
+            {
+                // -4/0 flip along the c axis (torch dim 0)
+                for (int c = 0; c < channels; c++) // iterate over channels
+                {
+                    int flipped_c = channels - 1 - c; // flipped channel index
+                    for (int z = 0; z < d; z++)       // iterate over the d axis
+                    {
+                        for (int j = 0; j < h; j++) // iterate over rows
+                        {
+                            const float* ptr = bottom_blob.channel(c).row(z * h + j);
+                            float* outptr = top_blob.channel(flipped_c).row(z * h + j);
+                            memcpy(outptr, ptr, w * sizeof(float));
+                        }
+                    }
+                }
+            }
+            else if (axis0 == 1)
+            {
+                // -3/1 flip along the d axis (torch dim 1)
+                for (int i = 0; i < channels; i++) // iterate over channels
+                {
+                    for (int z = 0; z < d; z++) // iterate over the d axis
+                    {
+                        for (int j = 0; j < h; j++) // iterate over the h axis
+                        {
+                            // read from the flipped d position
+                            const float* ptr = bottom_blob.channel(i).row((d - 1 - z) * h + j);
+                            float* outptr = top_blob.channel(i).row(z * h + j);
+                            // copy w elements row by row
+                            memcpy(outptr, ptr, w * sizeof(float));
+                        }
+                    }
+                }
+            }
+            else if (axis0 == 2)
+            {
+                // -2/2 flip along the h axis (torch dim 2), vertical
+                for (int i = 0; i < channels; i++)
+                {
+                    for (int z = 0; z < d; z++)
+                    {
+                        for (int j = 0; j < h; j++)
+                        {
+                            const float* ptr = bottom_blob.channel(i).row(z * h + (h - 1 - j));
+                            float* outptr = top_blob.channel(i).row(z * h + j);
+                            memcpy(outptr, ptr, w * sizeof(float));
+                        }
+                    }
+                }
+            }
+            else
+            {
+                // -1/3 flip along the w axis (torch dim 3), horizontal
+                for (int i = 0; i < channels; i++)
+                {
+                    for (int z = 0; z < d; z++)
+                    {
+                        for (int j = 0; j < h; j++)
+                        {
+                            const float* ptr = bottom_blob.channel(i).row(z * h + j);
+                            float* outptr = top_blob.channel(i).row(z * h + j);
+                            for (int k = 0; k < w; k++)
+                            {
+                                outptr[k] = ptr[w - 1 - k];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else if (axis.w == 2)
+        {
+            // axis pair, identified by the sum of the normalized axes: cd=1, ch=2, cw/dh=3, dw=4, hw=5
+            int axis0 = axis_ptr[0] < 0 ? 4 + axis_ptr[0] : axis_ptr[0];
+            int axis1 = axis_ptr[1] < 0 ? 4 + axis_ptr[1] : axis_ptr[1];
+            int axis_sum = axis0 + axis1;
+            if (axis_sum == 1)
+            {
+                // cd: flip c and d
+                for (int c = 0; c < channels; c++) // iterate over channels
+                {
+                    int flipped_c = channels - 1 - c; // flipped channel index
+
+                    for (int z = 0; z < d; z++) // iterate over the d axis
+                    {
+                        int flipped_d = d - 1 - z; // flipped d index
+
+                        for (int j = 0; j < h; j++) // iterate over rows
+                        {
+                            const float* ptr = bottom_blob.channel(c).row(z * h + j);
+                            float* outptr = top_blob.channel(flipped_c).row(flipped_d * h + j);
+                            memcpy(outptr, ptr, w * sizeof(float));
+                        }
+                    }
+                }
+            }
+            else if (axis_sum == 2)
+            {
+                // ch: flip c and h, keep d and w
+                for (int c = 0; c < channels; c++) // iterate over channels
+                {
+                    int flipped_c = channels - 1 - c; // flipped channel index
+
+                    for (int z = 0; z < d; z++) // iterate over the d axis
+                    {
+                        // copy rows into their flipped h positions
+                        for (int i = 0; i < h; i++)
+                        {
+                            const float* ptr = bottom_blob.channel(c).row(z * h + i);
+                            float* outptr = top_blob.channel(flipped_c).row(z * h + (h - 1 - i)); // keep d order, flip h
+                            memcpy(outptr, ptr, w * sizeof(float));                               // copy by row, keeping w order
+                        }
+                    }
+                }
+            }
+            else if (axis_sum == 3)
+            {
+                // cw or dh: only the cw pair contains axis 0 or 3
+                if (axis0 == 0 || axis0 == 3)
+                {
+                    // cw: flip c and w
+                    for (int c = 0; c < channels; c++)
+                    {
+                        int flipped_c = channels - 1 - c; // flip c
+
+                        for (int z = 0; z < d; z++) // d unchanged
+                        {
+                            for (int j = 0; j < h; j++) // h unchanged
+                            {
+                                const float* ptr = bottom_blob.channel(c).row(z * h + j);
+                                float* outptr = top_blob.channel(flipped_c).row(z * h + j);
+
+                                // flip w
+                                for (int k = 0; k < w; k++)
+                                {
+                                    outptr[k] = ptr[w - 1 - k];
+                                }
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    // dh: flip d and h
+                    for (int c = 0; c < channels; c++)
+                    {
+                        for (int z = 0; z < d; z++)
+                        {
+                            int flipped_d = d - 1 - z;
+
+                            for (int j = 0; j < h; j++)
+                            {
+                                int flipped_h = h - 1 - j;
+                                // read the source row
+                                const float* ptr = bottom_blob.channel(c).row(z * h + j);
+                                float* outptr = top_blob.channel(c).row(flipped_d * h + flipped_h);
+                                memcpy(outptr, ptr, w * sizeof(float));
+                            }
+                        }
+                    }
+                }
+            }
+            else if (axis_sum == 4)
+            {
+                // dw: flip d and w
+                for (int c = 0; c < channels; c++)
+                {
+                    for (int z = 0; z < d; z++)
+                    {
+                        int flipped_d = d - 1 - z; // flip d
+
+                        for (int j = 0; j < h; j++)
+                        {
+                            const float* ptr = bottom_blob.channel(c).row(z * h + j);
+                            float* outptr = top_blob.channel(c).row(flipped_d * h + j); // c unchanged
+
+                            // flip w
+                            for (int k = 0; k < w; k++)
+                            {
+                                outptr[k] = ptr[w - 1 - k];
+                            }
+                        }
+                    }
+                }
+            }
+            else
+            {
+                // hw: flip h and w
+                for (int c = 0; c < channels; c++)
+                {
+                    for (int z = 0; z < d; z++)
+                    {
+                        for (int j = 0; j < h; j++)
+                        {
+                            const float* ptr = bottom_blob.channel(c).row(z * h + j);
+                            float* outptr = top_blob.channel(c).row(z * h + (h - 1 - j)); // flip h
+
+                            // flip w
+                            for (int k = 0; k < w; k++)
+                            {
+                                outptr[k] = ptr[w - 1 - k];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else if (axis.w == 3)
+        {
+            // axis triple, identified by the sum of the normalized axes: cdh=3, cdw=4, chw=5, dhw=6
+            int axis0 = axis_ptr[0] < 0 ? 4 + axis_ptr[0] : axis_ptr[0];
+            int axis1 = axis_ptr[1] < 0 ? 4 + axis_ptr[1] : axis_ptr[1];
+            int axis2 = axis_ptr[2] < 0 ? 4 + axis_ptr[2] : axis_ptr[2];
+            int axis_sum = axis0 + axis1 + axis2;
+            if (axis_sum == 3)
+            {
+                // cdh: flip everything except w
+                for (int c = 0; c < channels; c++)
+                {
+                    int flipped_c = channels - 1 - c;
+
+                    for (int z = 0; z < d; z++)
+                    {
+                        int flipped_d = d - 1 - z;
+
+                        for (int i = 0; i < h; i++)
+                        {
+                            const float* ptr = bottom_blob.channel(c).depth(z).row(i);
+                            float* outptr = top_blob.channel(flipped_c).depth(flipped_d).row(h - 1 - i);
+                            memcpy(outptr, ptr, w * sizeof(float));
+                        }
+                    }
+                }
+            }
+            else if (axis_sum == 4)
+            {
+                // cdw: flip everything except h
+                for (int c = 0; c < channels; c++)
+                {
+                    int flipped_c = channels - 1 - c; // flip c
+
+                    for (int z = 0; z < d; z++)
+                    {
+                        int flipped_d = d - 1 - z; // flip d
+
+                        for (int i = 0; i < h; i++)
+                        {
+                            const float* ptr = bottom_blob.channel(c).row(z * h + i);
+                            float* outptr = top_blob.channel(flipped_c).row(flipped_d * h + i); // h unchanged
+
+                            // flip w
+                            for (int k = 0; k < w; k++)
+                            {
+                                outptr[k] = ptr[w - 1 - k];
+                            }
+                        }
+                    }
+                }
+            }
+            else if (axis_sum == 5)
+            {
+                // chw: flip everything except d
+                for (int c = 0; c < channels; c++)
+                {
+                    int flipped_c = channels - 1 - c; // flip c
+
+                    for (int z = 0; z < d; z++) // d unchanged
+                    {
+                        for (int i = 0; i < h; i++)
+                        {
+                            const float* ptr = bottom_blob.channel(c).depth(z).row(i);
+                            float* outptr = top_blob.channel(flipped_c).depth(z).row(h - 1 - i); // flip h
+
+                            // flip w
+                            for (int k = 0; k < w; k++)
+                            {
+                                outptr[k] = ptr[w - 1 - k];
+                            }
+                        }
+                    }
+                }
+            }
+            else if (axis_sum == 6)
+            {
+                // dhw: flip everything except c
+                for (int c = 0; c < channels; c++) // c unchanged
+                {
+                    for (int z = 0; z < d; z++)
+                    {
+                        int flipped_d = d - 1 - z; // flip d
+
+                        for (int i = 0; i < h; i++)
+                        {
+                            const float* ptr = bottom_blob.channel(c).depth(z).row(i);
+                            float* outptr = top_blob.channel(c).depth(flipped_d).row(h - 1 - i); // flip h
+                            // flip w
+                            for (int k = 0; k < w; k++)
+                            {
+                                outptr[k] = ptr[w - 1 - k];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            // cdhw: flip all four axes
+            for (int c = 0; c < channels; c++)
+            {
+                int flipped_c = channels - 1 - c; // flip c
+
+                for (int z = 0; z < d; z++)
+                {
+                    int flipped_d = d - 1 - z; // flip d
+
+                    for (int i = 0; i < h; i++)
+                    {
+                        const float* ptr = bottom_blob.channel(c).row(z * h + i);
+                        float* outptr = top_blob.channel(flipped_c).row(flipped_d * h + (h - 1 - i)); // flip h
+
+                        // flip w
+                        for (int k = 0; k < w; k++)
+                        {
+                            outptr[k] = ptr[w - 1 - k];
+                        }
+                    }
+                }
+            }
+        }
+    }
+    else
+    {
+        return -1;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/flip.h b/src/layer/flip.h
new file mode 100644
index 00000000000..61a05d4538a
--- /dev/null
+++ b/src/layer/flip.h
@@ -0,0 +1,37 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_FLIP_H
+#define LAYER_FLIP_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class Flip : public Layer
+{
+public:
+    Flip();
+
+    virtual int load_param(const ParamDict& pd);
+
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+public:
+    Mat axis; // axes to flip
+};
+
+} // namespace ncnn
+
+#endif // LAYER_FLIP_H
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index f55859e736e..48853470d3f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -105,6 +105,7 @@ ncnn_add_layer_test(Embed)
ncnn_add_layer_test(Erf)
ncnn_add_layer_test(ExpandDims)
ncnn_add_layer_test(Flatten)
+ncnn_add_layer_test(Flip)
ncnn_add_layer_test(Fold)
ncnn_add_layer_test(GELU)
ncnn_add_layer_test(GLU)
diff --git a/tests/test_flip.cpp b/tests/test_flip.cpp
new file mode 100644
index 00000000000..7ebf787a462
--- /dev/null
+++ b/tests/test_flip.cpp
@@ -0,0 +1,132 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer.h"
+#include "testutil.h"
+
+// helpers written to stay compatible with pre-C++11 compilers, instead of e.g.
+// ncnn::Mat axis_mat(axis.size());
+// for (size_t i = 0; i < axis.size(); i++)
+// {
+//     axis_mat[i] = axis[i];
+// }
+static ncnn::Mat IntArrayMat(int a0)
+{
+    ncnn::Mat m(1);
+    int* p = m;
+    p[0] = a0;
+    return m;
+}
+
+static ncnn::Mat IntArrayMat(int a0, int a1)
+{
+    ncnn::Mat m(2);
+    int* p = m;
+    p[0] = a0;
+    p[1] = a1;
+    return m;
+}
+
+static ncnn::Mat IntArrayMat(int a0, int a1, int a2)
+{
+    ncnn::Mat m(3);
+    int* p = m;
+    p[0] = a0;
+    p[1] = a1;
+    p[2] = a2;
+    return m;
+}
+
+static ncnn::Mat IntArrayMat(int a0, int a1, int a2, int a3)
+{
+    ncnn::Mat m(4);
+    int* p = m;
+    p[0] = a0;
+    p[1] = a1;
+    p[2] = a2;
+    p[3] = a3;
+    return m;
+}
+
+static int test_flip(const ncnn::Mat& a, const ncnn::Mat& axis)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, axis);
+
+    std::vector<ncnn::Mat> weights(0);
+
+    int ret = test_layer("Flip", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_flip failed a.dims=%d a=(%d %d %d %d) axis_w=%d\n", a.dims, a.w, a.h, a.d, a.c, axis.w);
+    }
+
+    return ret;
+}
+
+static int test_flip_0()
+{
+    return 0
+           || test_flip(RandomMat(2, 3, 4, 5), IntArrayMat(0))
+           || test_flip(RandomMat(3, 2, 4, 5), IntArrayMat(1))
+           || test_flip(RandomMat(4, 3, 2, 5), IntArrayMat(2))
+           || test_flip(RandomMat(2, 3, 1, 5), IntArrayMat(3))
+           || test_flip(RandomMat(6, 3, 4, 5), IntArrayMat(0, 1))
+           || test_flip(RandomMat(2, 3, 1, 6), IntArrayMat(0, 2))
+           || test_flip(RandomMat(5, 1, 2, 5), IntArrayMat(0, 3))
+           || test_flip(RandomMat(5, 2, 1, 5), IntArrayMat(1, 2))
+           || test_flip(RandomMat(4, 5, 2, 3), IntArrayMat(1, 3))
+           || test_flip(RandomMat(2, 6, 4, 5), IntArrayMat(2, 3))
+           || test_flip(RandomMat(6, 1, 4, 5), IntArrayMat(0, 1, 2))
+           || test_flip(RandomMat(5, 2, 1, 5), IntArrayMat(0, 1, 3))
+           || test_flip(RandomMat(4, 3, 3, 5), IntArrayMat(0, 2, 3))
+           || test_flip(RandomMat(4, 3, 4, 5), IntArrayMat(1, 2, 3))
+           || test_flip(RandomMat(6, 3, 3, 2), IntArrayMat(0, 1, 2, 3));
+}
+
+static int test_flip_1()
+{
+    return 0
+           || test_flip(RandomMat(2, 3, 5), IntArrayMat(0))
+           || test_flip(RandomMat(3, 3, 5), IntArrayMat(1))
+           || test_flip(RandomMat(4, 3, 5), IntArrayMat(2))
+           || test_flip(RandomMat(3, 1, 5), IntArrayMat(0, 1))
+           || test_flip(RandomMat(3, 2, 5), IntArrayMat(0, 2))
+           || test_flip(RandomMat(3, 3, 4), IntArrayMat(1, 2))
+           || test_flip(RandomMat(4, 3, 2), IntArrayMat(0, 1, 2));
+}
+
+static int test_flip_2()
+{
+    return 0
+           || test_flip(RandomMat(8, 2), IntArrayMat(-2))
+           || test_flip(RandomMat(16, 3), IntArrayMat(-1))
+           || test_flip(RandomMat(7, 2), IntArrayMat(-2, -1));
+}
+
+static int test_flip_3()
+{
+    return 0
+           || test_flip(RandomMat(18), IntArrayMat(-1));
+}
+
+int main()
+{
+    SRAND(7767517);
+    return 0
+           || test_flip_0()
+           || test_flip_1()
+           || test_flip_2()
+           || test_flip_3();
+}
\ No newline at end of file
diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt
index b1ac6f5c024..5d681ab9c4b 100644
--- a/tools/pnnx/src/CMakeLists.txt
+++ b/tools/pnnx/src/CMakeLists.txt
@@ -575,6 +575,7 @@ set(pnnx_pass_ncnn_SRCS
    pass_ncnn/torch_cumsum.cpp
    pass_ncnn/torch_diag.cpp
    pass_ncnn/torch_flatten.cpp
+    pass_ncnn/torch_flip.cpp
    pass_ncnn/torch_istft.cpp
    pass_ncnn/torch_logsumexp.cpp
    pass_ncnn/torch_matmul.cpp
diff --git a/tools/pnnx/src/pass_ncnn/torch_flip.cpp b/tools/pnnx/src/pass_ncnn/torch_flip.cpp
new file mode 100644
index 00000000000..bc0e3348548
--- /dev/null
+++ b/tools/pnnx/src/pass_ncnn/torch_flip.cpp
@@ -0,0 +1,56 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+#include "pass_ncnn.h"
+
+namespace pnnx {
+
+namespace ncnn {
+
+class torch_flip : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+3 2
+pnnx.Input input 0 1 input
+torch.flip op_0 1 1 input out dims=%dims
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "Flip";
+    }
+
+    const char* name_str() const
+    {
+        return "flip";
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        const std::vector<int>& dims = captured_params.at("dims").ai;
+
+        // set the flip dims as ncnn param 0
+        op->params["0"] = dims;
+    }
+};
+
+REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_flip, 20)
+
+} // namespace ncnn
+
+} // namespace pnnx
\ No newline at end of file
diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt
index 42c3bed32e0..54c8896ef77 100644
--- a/tools/pnnx/tests/ncnn/CMakeLists.txt
+++ b/tools/pnnx/tests/ncnn/CMakeLists.txt
@@ -188,6 +188,7 @@ pnnx_ncnn_add_test(torch_clamp)
pnnx_ncnn_add_test(torch_cos)
pnnx_ncnn_add_test(torch_exp)
pnnx_ncnn_add_test(torch_floor)
+pnnx_ncnn_add_test(torch_flip)
pnnx_ncnn_add_test(torch_log)
pnnx_ncnn_add_test(torch_log10)
pnnx_ncnn_add_test(torch_maximum)
diff --git a/tools/pnnx/tests/ncnn/test_torch_flip.py b/tools/pnnx/tests/ncnn/test_torch_flip.py
new file mode 100644
index 00000000000..b07a8d297a7
--- /dev/null
+++ b/tools/pnnx/tests/ncnn/test_torch_flip.py
@@ -0,0 +1,124 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, d):
+        # 1D
+        x0 = torch.flip(x, [0])
+        # 2D
+        y0 = torch.flip(y, [0])
+        y1 = torch.flip(y, [1])
+        y2 = torch.flip(y, [-2, -1])
+        # 3D
+        z0 = torch.flip(z, [0])
+        z1 = torch.flip(z, [1])
+        z2 = torch.flip(z, [2])
+        z3 = torch.flip(z, [0, 1])
+        z4 = torch.flip(z, [0, 2])
+        z5 = torch.flip(z, [1, 2])
+        z6 = torch.flip(z, [0, 1, 2])
+        # 4D
+        d0 = torch.flip(d, [-1])
+        d1 = torch.flip(d, [-2])
+        d2 = torch.flip(d, [-3])
+        d3 = torch.flip(d, [-4])
+        d4 = torch.flip(d, [0, 1])
+        d5 = torch.flip(d, [0, 2])
+        d6 = torch.flip(d, [0, 3])
+        d7 = torch.flip(d, [1, 2])
+        d8 = torch.flip(d, [1, 3])
+        d9 = torch.flip(d, [2, 3])
+        d10 = torch.flip(d, [0, 1, 2])
+        d11 = torch.flip(d, [0, 1, 3])
+        d12 = torch.flip(d, [0, 2, 3])
+        d13 = torch.flip(d, [1, 2, 3])
+        d14 = torch.flip(d, [0, 1, 2, 3])
+
+        return (
+            x0,
+            y0,
+            y1,
+            y2,
+            z0,
+            z1,
+            z2,
+            z3,
+            z4,
+            z5,
+            z6,
+            d0,
+            d1,
+            d2,
+            d3,
+            d4,
+            d5,
+            d6,
+            d7,
+            d8,
+            d9,
+            d10,
+            d11,
+            d12,
+            d13,
+            d14,
+        )
+
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(36)  # 1D
+    y = torch.rand(4, 7)  # 2D
+    z = torch.rand(3, 4, 5)  # 3D
+    d = torch.rand(4, 2, 6, 7)  # 4D
+
+    a = net(x, y, z, d)
+
+    # export torchscript
+    mod = torch.jit.trace(net, (x, y, z, d))
+    mod.save("test_torch_flip.pt")
+
+    # torchscript to pnnx
+    import os
+
+    os.system(
+        "../../src/pnnx test_torch_flip.pt inputshape=[36],[4,7],[3,4,5],[4,2,6,7]"
+    )
+
+    # pnnx inference
+    import test_torch_flip_ncnn
+
+    b = test_torch_flip_ncnn.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-3, 1e-3):
+            return False
+    return True
+
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)