From 1ea2a8d4cff6e700f75dc60cdca0c90a408ab978 Mon Sep 17 00:00:00 2001
From: Mohammad Zeineldeen
Date: Thu, 4 Nov 2021 01:49:49 +0100
Subject: [PATCH 01/33] conformer encoder first draft

---
 nn/conformer.py | 204 ++++++++++++++++++++++++++++++++++++++++++++++++
 nn/math_.py     |   6 ++
 2 files changed, 210 insertions(+)
 create mode 100644 nn/conformer.py

diff --git a/nn/conformer.py b/nn/conformer.py
new file mode 100644
index 00000000..5f50096f
--- /dev/null
+++ b/nn/conformer.py
@@ -0,0 +1,204 @@
+"""
+Conformer code.
+Ref: https://arxiv.org/abs/2005.08100
+"""
+
+from typing import Tuple, List, Union
+from . import Module, ModuleList, LayerRef, Linear, dropout, layer_norm, batch_norm, Conv, swish, glu, split_dims, \
+    merge_dims, pool
+
+
+class _PositionwiseFeedForward(Module):
+    """
+    Conformer position-wise feedforward neural network layer
+    FF -> Activation -> Dropout -> FF
+    """
+
+    def __init__(self, d_model: int, d_ff: int, dropout: float, activation, l2: float = 0.0):
+        """
+        :param d_model:
+        :param d_ff:
+        :param dropout:
+        :param activation:
+        :param l2:
+        """
+        super().__init__()
+
+        self.dropout = dropout
+        self.activation = activation
+
+        self.linear1 = Linear(n_out=d_ff, l2=l2)
+        self.linear2 = Linear(n_out=d_model, l2=l2)
+
+    def forward(self, inp: LayerRef) -> LayerRef:
+        return self.linear2(dropout(self.activation(self.linear1(inp)), dropout=self.dropout))
+
+
+class _ConformerConvBlock(Module):
+    """
+    Conformer convolution block
+    FF -> GLU -> depthwise conv -> BN -> Swish -> FF
+    """
+
+    def __init__(self, d_model: int, kernel_size: Tuple[int], l2: float = 0.0):
+        """
+        :param d_model:
+        :param kernel_size:
+        :param l2:
+        """
+        super().__init__()
+
+        self.positionwise_conv1 = Linear(n_out=d_model * 2, l2=l2)
+        self.depthwise_conv = Conv(n_out=d_model, filter_size=kernel_size, groups=d_model, l2=l2, padding='same')
+        self.positionwise_conv2 = Linear(n_out=d_model, l2=l2)
+
+    def forward(self, inp: LayerRef) -> LayerRef:
+        x_conv1 = self.positionwise_conv1(inp)
+        x_act = glu(x_conv1)
+        x_depthwise_conv = self.depthwise_conv(x_act)
+        x_bn = batch_norm(x_depthwise_conv)
+        x_swish = swish(x_bn)
+        x_conv2 = self.positionwise_conv2(x_swish)
+        return x_conv2
+
+
+class _ConformerConvSubsampleLayer(Module):
+    """
+    Conv 2D block with optional max-pooling
+    """
+
+    def __init__(self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[Tuple[int, ...]], None],
+                 channel_sizes: List[int], l2: float = 0.0, dropout: float = 0.3, act: str = 'relu',
+                 padding: str = 'same'):
+        """
+        :param filter_sizes:
+        :param pool_sizes:
+        :param channel_sizes:
+        :param l2:
+        :param dropout:
+        :param act:
+        :param padding:
+        """
+        super().__init__()
+
+        self.dropout = dropout
+        self.pool_sizes = pool_sizes
+
+        self.conv_layers = ModuleList()
+        for filter_size, channel_size in zip(filter_sizes, channel_sizes):
+            self.conv_layers.append(
+                Conv(l2=l2, activation=act, filter_size=filter_size, n_out=channel_size, padding=padding))
+
+    def forward(self, inp: LayerRef) -> LayerRef:
+        x = split_dims(inp, axis='F', dims=(-1, 1))
+        for i, conv_layer in enumerate(self.conv_layers):
+            x = conv_layer(x)
+            if self.pool_sizes and i < len(self.pool_sizes):
+                x = pool(x, pool_size=self.pool_sizes[i], padding='same', mode='max')
+            if self.dropout:
+                x = dropout(x, dropout=self.dropout)
+        out = merge_dims(x, axes='static')
+        return out
+
+
+class ConformerEncoderLayer(Module):
+    """
+    Represents a conformer block
+    """
+
+    def __init__(self, conv_kernel_size: Tuple[int], ff_act, ff_dim: int, dropout: float, att_dropout: float,
+                 enc_key_dim: int, att_n_heads: int, l2: float):
+        """
+        :param conv_kernel_size:
+        :param ff_act:
+        :param ff_dim:
+        :param dropout:
+        :param att_dropout:
+        :param enc_key_dim:
+        :param att_n_heads:
+        :param l2:
+        """
+        super().__init__()
+
+        self.dropout = dropout
+
+        self.ffn1 = _PositionwiseFeedForward(
+            d_model=enc_key_dim, d_ff=ff_dim, dropout=dropout, activation=ff_act, l2=l2)
+
+        self.ffn2 = _PositionwiseFeedForward(
+            d_model=enc_key_dim, d_ff=ff_dim, dropout=dropout, activation=ff_act, l2=l2)
+
+        self.conv_module = _ConformerConvBlock(d_model=enc_key_dim, kernel_size=conv_kernel_size)
+
+        self.mhsa_module = MultiheadAttention(d_model, att_n_heads, dropout=att_dropout)  # TODO: to be implemented
+
+    def forward(self, inp: LayerRef) -> LayerRef:
+        # FFN
+        x_ffn1_ln = layer_norm(inp)
+        x_ffn1 = self.ffn1(x_ffn1_ln)
+        x_ffn1_out = 0.5 * dropout(x_ffn1, dropout=self.dropout) + inp
+
+        # MHSA
+        x_mhsa_ln = layer_norm(x_ffn1_out)
+        x_mhsa = self.mhsa_module(x_mhsa_ln)
+        x_mhsa_out = x_mhsa + x_ffn1_out
+
+        # Conv
+        x_conv_ln = layer_norm(x_mhsa_out)
+        x_conv = self.conv_module(x_conv_ln)
+        x_conv_out = dropout(x_conv, dropout=self.dropout) + x_mhsa_out
+
+        # FFN
+        x_ffn2_ln = layer_norm(x_conv_out)
+        x_ffn2 = self.ffn2(x_ffn2_ln)
+        x_ffn2_out = 0.5 * dropout(x_ffn2, dropout=self.dropout) + x_conv_out
+
+        # last LN layer
+        return layer_norm(x_ffn2_out)
+
+
+class ConformerEncoder(Module):
+    """
+    Represents Conformer encoder architecture
+    """
+
+    def __init__(self, encoder_layer: Module, num_blocks: int, conv_kernel_size: Tuple[int, ...] = (32,), ff_act=swish,
+                 ff_dim: int = 512, dropout: float = 0.1, att_dropout: float = 0.1, enc_key_dim: int = 256,
+                 att_n_heads: int = 4, l2: float = 0.0):
+        """
+        :param encoder_layer:
+        :param num_blocks:
+        :param conv_kernel_size:
+        :param ff_act:
+        :param ff_dim:
+        :param dropout:
+        :param att_dropout:
+        :param enc_key_dim:
+        :param att_n_heads:
+        :param l2:
+        """
+        super().__init__()
+
+        self.dropout = dropout
+
+        self.conv_subsample_layer = _ConformerConvSubsampleLayer(
+            filter_sizes=[(3, 3), (3, 3)], pool_sizes=[(2, 2), (2, 2)], channel_sizes=[enc_key_dim, enc_key_dim],
+            l2=l2, dropout=dropout)
+
+        self.linear = Linear(n_out=enc_key_dim, l2=l2, with_bias=False)
+
+        self.conformer_blocks = ModuleList([
+            encoder_layer(
+                conv_kernel_size=conv_kernel_size, ff_act=ff_act, ff_dim=ff_dim, dropout=dropout,
+                att_dropout=att_dropout, enc_key_dim=enc_key_dim, att_n_heads=att_n_heads, l2=l2
+            )
+            for _ in range(num_blocks)
+        ])
+
+    def forward(self, inp: LayerRef) -> LayerRef:
+        x_subsample = self.conv_subsample_layer(inp)
+        x_linear = self.linear(x_subsample)
+        x = dropout(x_linear, dropout=self.dropout)
+        for conformer_block in self.conformer_blocks:
+            x = conformer_block(x)
+        return x
diff --git a/nn/math_.py b/nn/math_.py
index 7817801e..64dbb0fd 100644
--- a/nn/math_.py
+++ b/nn/math_.py
@@ -35,6 +35,11 @@ def gelu(x: nn.LayerRef) -> nn.Layer:
   return _activation(x, activation="gelu")
 
 
+def glu(x: LayerRef) -> Layer:
+  """GLU"""
+  return activation(x, activation='glu')
+
+
 def exp(x: nn.LayerRef) -> nn.Layer:
   """exp"""
   return _activation(x, activation="exp")
@@ -102,3 +107,4 @@ def cumsum(
     name=name)
   del state
   return layer
+

From ea327e43d124ac62e03babc91404ebe43deb0314 Mon Sep 17 00:00:00 2001
From: Mohammad Zeineldeen
Date: Thu, 4 Nov 2021 11:58:17 +0100
Subject: [PATCH 02/33] fix indent and formatting

---
 nn/conformer.py | 313 ++++++++++++++++++++++++------------------------
 1 file changed, 157 insertions(+), 156
deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 5f50096f..f13a56e1 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -4,201 +4,202 @@ """ from typing import Tuple, List, Union + from . import Module, ModuleList, LayerRef, Linear, dropout, layer_norm, batch_norm, Conv, swish, glu, split_dims, \ - merge_dims, pool + merge_dims, pool class _PositionwiseFeedForward(Module): + """ + Conformer position-wise feedforward neural network layer + FF -> Activation -> Dropout -> FF + """ + + def __init__(self, d_model: int, d_ff: int, dropout: float, activation, l2: float = 0.0): """ - Conformer position-wise feedforward neural network layer - FF -> Activation -> Dropout -> FF + :param d_model: + :param d_ff: + :param dropout: + :param activation: + :param l2: """ + super().__init__() - def __init__(self, d_model: int, d_ff: int, dropout: float, activation, l2: float = 0.0): - """ - :param d_model: - :param d_ff: - :param dropout: - :param activation: - :param l2: - """ - super().__init__() - - self.dropout = dropout - self.activation = activation + self.dropout = dropout + self.activation = activation - self.linear1 = Linear(n_out=d_ff, l2=l2) - self.linear2 = Linear(n_out=d_model, l2=l2) + self.linear1 = Linear(n_out=d_ff, l2=l2) + self.linear2 = Linear(n_out=d_model, l2=l2) - def forward(self, inp: LayerRef) -> LayerRef: - return self.linear2(dropout(self.activation(self.linear1(inp)), dropout=self.dropout)) + def forward(self, inp: LayerRef) -> LayerRef: + return self.linear2(dropout(self.activation(self.linear1(inp)), dropout=self.dropout)) class _ConformerConvBlock(Module): + """ + Conformer convolution block + FF -> GLU -> depthwise conv -> BN -> Swish -> FF + """ + + def __init__(self, d_model: int, kernel_size: Tuple[int], l2: float = 0.0): """ - Conformer convolution block - FF -> GLU -> depthwise conv -> BN -> Swish -> FF + :param d_model: + :param kernel_size: + :param l2: """ + super().__init__() - def __init__(self, d_model: int, kernel_size: Tuple[int], l2: float = 0.0): - """ - :param d_model: - :param kernel_size: - :param l2: - """ - super().__init__() - - self.positionwise_conv1 = Linear(n_out=d_model * 2, l2=l2) - self.depthwise_conv = Conv(n_out=d_model, filter_size=kernel_size, groups=d_model, l2=l2, padding='same') - self.positionwise_conv2 = Linear(n_out=d_model, l2=l2) + self.positionwise_conv1 = Linear(n_out=d_model * 2, l2=l2) + self.depthwise_conv = Conv(n_out=d_model, filter_size=kernel_size, groups=d_model, l2=l2, padding='same') + self.positionwise_conv2 = Linear(n_out=d_model, l2=l2) - def forward(self, inp: LayerRef) -> LayerRef: - x_conv1 = self.positionwise_conv1(inp) - x_act = glu(x_conv1) - x_depthwise_conv = self.depthwise_conv(x_act) - x_bn = batch_norm(x_depthwise_conv) - x_swish = swish(x_bn) - x_conv2 = self.positionwise_conv2(x_swish) - return x_conv2 + def forward(self, inp: LayerRef) -> LayerRef: + x_conv1 = self.positionwise_conv1(inp) + x_act = glu(x_conv1) + x_depthwise_conv = self.depthwise_conv(x_act) + x_bn = batch_norm(x_depthwise_conv) + x_swish = swish(x_bn) + x_conv2 = self.positionwise_conv2(x_swish) + return x_conv2 class _ConformerConvSubsampleLayer(Module): + """ + Conv 2D block with optional max-pooling + """ + + def __init__(self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[Tuple[int, ...]], None], + channel_sizes: List[int], l2: float = 0.0, dropout: float = 0.3, act: str = 'relu', + padding: str = 'same'): """ - Conv 2D block with optional max-pooling + :param filter_sizes: + :param pool_sizes: + :param 
channel_sizes: + :param l2: + :param dropout: + :param act: + :param padding: """ + super().__init__() - def __init__(self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[Tuple[int, ...]], None], - channel_sizes: List[int], l2: float = 0.0, dropout: float = 0.3, act: str = 'relu', - padding: str = 'same'): - """ - :param filter_sizes: - :param pool_sizes: - :param channel_sizes: - :param l2: - :param dropout: - :param act: - :param padding: - """ - super().__init__() - - self.dropout = dropout - self.pool_sizes = pool_sizes - - self.conv_layers = ModuleList() - for filter_size, channel_size in zip(filter_sizes, channel_sizes): - self.conv_layers.append( - Conv(l2=l2, activation=act, filter_size=filter_size, n_out=channel_size, padding=padding)) - - def forward(self, inp: LayerRef) -> LayerRef: - x = split_dims(inp, axis='F', dims=(-1, 1)) - for i, conv_layer in enumerate(self.conv_layers): - x = conv_layer(x) - if self.pool_sizes and i < len(self.pool_sizes): - x = pool(x, pool_size=self.pool_sizes[i], padding='same', mode='max') - if self.dropout: - x = dropout(x, dropout=self.dropout) - out = merge_dims(x, axes='static') - return out + self.dropout = dropout + self.pool_sizes = pool_sizes + + self.conv_layers = ModuleList() + for filter_size, channel_size in zip(filter_sizes, channel_sizes): + self.conv_layers.append( + Conv(l2=l2, activation=act, filter_size=filter_size, n_out=channel_size, padding=padding)) + + def forward(self, inp: LayerRef) -> LayerRef: + x = split_dims(inp, axis='F', dims=(-1, 1)) + for i, conv_layer in enumerate(self.conv_layers): + x = conv_layer(x) + if self.pool_sizes and i < len(self.pool_sizes): + x = pool(x, pool_size=self.pool_sizes[i], padding='same', mode='max') + if self.dropout: + x = dropout(x, dropout=self.dropout) + out = merge_dims(x, axes='static') + return out class ConformerEncoderLayer(Module): + """ + Represents a conformer block + """ + + def __init__(self, conv_kernel_size: Tuple[int], ff_act, ff_dim: int, dropout: float, att_dropout: float, + enc_key_dim: int, att_n_heads: int, l2: float): """ - Represents a conformer block + :param conv_kernel_size: + :param ff_act: + :param ff_dim: + :param dropout: + :param att_dropout: + :param enc_key_dim: + :param att_n_heads: + :param l2: """ + super().__init__() - def __init__(self, conv_kernel_size: Tuple[int], ff_act, ff_dim: int, dropout: float, att_dropout: float, - enc_key_dim: int, att_n_heads: int, l2: float): - """ - :param conv_kernel_size: - :param ff_act: - :param ff_dim: - :param dropout: - :param att_dropout: - :param enc_key_dim: - :param att_n_heads: - :param l2: - """ - super().__init__() + self.dropout = dropout - self.dropout = dropout + self.ffn1 = _PositionwiseFeedForward( + d_model=enc_key_dim, d_ff=ff_dim, dropout=dropout, activation=ff_act, l2=l2) - self.ffn1 = _PositionwiseFeedForward( - d_model=enc_key_dim, d_ff=ff_dim, dropout=dropout, activation=ff_act, l2=l2) + self.ffn2 = _PositionwiseFeedForward( + d_model=enc_key_dim, d_ff=ff_dim, dropout=dropout, activation=ff_act, l2=l2) - self.ffn2 = _PositionwiseFeedForward( - d_model=enc_key_dim, d_ff=ff_dim, dropout=dropout, activation=ff_act, l2=l2) + self.conv_module = _ConformerConvBlock(d_model=enc_key_dim, kernel_size=conv_kernel_size) - self.conv_module = _ConformerConvBlock(d_model=enc_key_dim, kernel_size=conv_kernel_size) + self.mhsa_module = MultiheadAttention(d_model, att_n_heads, dropout=att_dropout) # TODO: to be implemented - self.mhsa_module = MultiheadAttention(d_model, att_n_heads, 
dropout=att_dropout) # TODO: to be implemented + def forward(self, inp: LayerRef) -> LayerRef: + # FFN + x_ffn1_ln = layer_norm(inp) + x_ffn1 = self.ffn1(x_ffn1_ln) + x_ffn1_out = 0.5 * dropout(x_ffn1, dropout=self.dropout) + inp - def forward(self, inp: LayerRef) -> LayerRef: - # FFN - x_ffn1_ln = layer_norm(inp) - x_ffn1 = self.ffn1(x_ffn1_ln) - x_ffn1_out = 0.5 * dropout(x_ffn1, dropout=self.dropout) + inp + # MHSA + x_mhsa_ln = layer_norm(x_ffn1_out) + x_mhsa = self.mhsa_module(x_mhsa_ln) + x_mhsa_out = x_mhsa + x_ffn1_out - # MHSA - x_mhsa_ln = layer_norm(x_ffn1_out) - x_mhsa = self.mhsa_module(x_mhsa_ln) - x_mhsa_out = x_mhsa + x_ffn1_out + # Conv + x_conv_ln = layer_norm(x_mhsa_out) + x_conv = self.conv_module(x_conv_ln) + x_conv_out = dropout(x_conv, dropout=self.dropout) + x_mhsa_out - # Conv - x_conv_ln = layer_norm(x_mhsa_out) - x_conv = self.conv_module(x_conv_ln) - x_conv_out = dropout(x_conv, dropout=self.dropout) + x_mhsa_out + # FFN + x_ffn2_ln = layer_norm(x_conv_out) + x_ffn2 = self.ffn2(x_ffn2_ln) + x_ffn2_out = 0.5 * dropout(x_ffn2, dropout=self.dropout) + x_conv_out - # FFN - x_ffn2_ln = layer_norm(x_conv_out) - x_ffn2 = self.ffn2(x_ffn2_ln) - x_ffn2_out = 0.5 * dropout(x_ffn2, dropout=self.dropout) + x_conv_out - - # last LN layer - return layer_norm(x_ffn2_out) + # last LN layer + return layer_norm(x_ffn2_out) class ConformerEncoder(Module): + """ + Represents Conformer encoder architecture + """ + + def __init__(self, encoder_layer: Module, num_blocks: int, conv_kernel_size: Tuple[int, ...] = (32,), ff_act=swish, + ff_dim: int = 512, dropout: float = 0.1, att_dropout: float = 0.1, enc_key_dim: int = 256, + att_n_heads: int = 4, l2: float = 0.0): """ - Represents Conformer encoder architecture + :param encoder_layer: + :param num_blocks: + :param conv_kernel_size: + :param ff_act: + :param ff_dim: + :param dropout: + :param att_dropout: + :param enc_key_dim: + :param att_n_heads: + :param l2: """ - - def __init__(self, encoder_layer: Module, num_blocks: int, conv_kernel_size: Tuple[int, ...] 
= (32,), ff_act=swish, - ff_dim: int = 512, dropout: float = 0.1, att_dropout: float = 0.1, enc_key_dim: int = 256, - att_n_heads: int = 4, l2: float = 0.0): - """ - :param encoder_layer: - :param num_blocks: - :param conv_kernel_size: - :param ff_act: - :param ff_dim: - :param dropout: - :param att_dropout: - :param enc_key_dim: - :param att_n_heads: - :param l2: - """ - super().__init__() - - self.dropout = dropout - - self.conv_subsample_layer = _ConformerConvSubsampleLayer( - filter_sizes=[(3, 3), (3, 3)], pool_sizes=[(2, 2), (2, 2)], channel_sizes=[enc_key_dim, enc_key_dim], - l2=l2, dropout=dropout) - - self.linear = Linear(n_out=enc_key_dim, l2=l2, with_bias=False) - - self.conformer_blocks = ModuleList([ - encoder_layer( - conv_kernel_size=conv_kernel_size, ff_act=ff_act, ff_dim=ff_dim, dropout=dropout, - att_dropout=att_dropout, enc_key_dim=enc_key_dim, att_n_heads=att_n_heads, l2=l2 - ) - for _ in range(num_blocks) - ]) - - def forward(self, inp: LayerRef) -> LayerRef: - x_subsample = self.conv_subsample_layer(inp) - x_linear = self.linear(x_subsample) - x = dropout(x_linear, dropout=self.dropout) - for conformer_block in self.conformer_blocks: - x = conformer_block(x) - return x + super().__init__() + + self.dropout = dropout + + self.conv_subsample_layer = _ConformerConvSubsampleLayer( + filter_sizes=[(3, 3), (3, 3)], pool_sizes=[(2, 2), (2, 2)], channel_sizes=[enc_key_dim, enc_key_dim], + l2=l2, dropout=dropout) + + self.linear = Linear(n_out=enc_key_dim, l2=l2, with_bias=False) + + self.conformer_blocks = ModuleList([ + encoder_layer( + conv_kernel_size=conv_kernel_size, ff_act=ff_act, ff_dim=ff_dim, dropout=dropout, + att_dropout=att_dropout, enc_key_dim=enc_key_dim, att_n_heads=att_n_heads, l2=l2 + ) + for _ in range(num_blocks) + ]) + + def forward(self, inp: LayerRef) -> LayerRef: + x_subsample = self.conv_subsample_layer(inp) + x_linear = self.linear(x_subsample) + x = dropout(x_linear, dropout=self.dropout) + for conformer_block in self.conformer_blocks: + x = conformer_block(x) + return x From cc2b4a26e57fd5586c46e7e0b1d7808de9d0a63b Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 12:16:17 +0100 Subject: [PATCH 03/33] better import --- nn/conformer.py | 71 ++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 36 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index f13a56e1..2d0a1fbc 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -4,12 +4,11 @@ """ from typing import Tuple, List, Union +from .. import nn +from . import LayerRef -from . 
import Module, ModuleList, LayerRef, Linear, dropout, layer_norm, batch_norm, Conv, swish, glu, split_dims, \ - merge_dims, pool - -class _PositionwiseFeedForward(Module): +class _PositionwiseFeedForward(nn.Module): """ Conformer position-wise feedforward neural network layer FF -> Activation -> Dropout -> FF @@ -28,14 +27,14 @@ def __init__(self, d_model: int, d_ff: int, dropout: float, activation, l2: floa self.dropout = dropout self.activation = activation - self.linear1 = Linear(n_out=d_ff, l2=l2) - self.linear2 = Linear(n_out=d_model, l2=l2) + self.linear1 = nn.Linear(n_out=d_ff, l2=l2) + self.linear2 = nn.Linear(n_out=d_model, l2=l2) def forward(self, inp: LayerRef) -> LayerRef: - return self.linear2(dropout(self.activation(self.linear1(inp)), dropout=self.dropout)) + return self.linear2(nn.dropout(self.activation(self.linear1(inp)), dropout=self.dropout)) -class _ConformerConvBlock(Module): +class _ConformerConvBlock(nn.Module): """ Conformer convolution block FF -> GLU -> depthwise conv -> BN -> Swish -> FF @@ -49,21 +48,21 @@ def __init__(self, d_model: int, kernel_size: Tuple[int], l2: float = 0.0): """ super().__init__() - self.positionwise_conv1 = Linear(n_out=d_model * 2, l2=l2) - self.depthwise_conv = Conv(n_out=d_model, filter_size=kernel_size, groups=d_model, l2=l2, padding='same') - self.positionwise_conv2 = Linear(n_out=d_model, l2=l2) + self.positionwise_conv1 = nn.Linear(n_out=d_model * 2, l2=l2) + self.depthwise_conv = nn.Conv(n_out=d_model, filter_size=kernel_size, groups=d_model, l2=l2, padding='same') + self.positionwise_conv2 = nn.Linear(n_out=d_model, l2=l2) def forward(self, inp: LayerRef) -> LayerRef: x_conv1 = self.positionwise_conv1(inp) - x_act = glu(x_conv1) + x_act = nn.glu(x_conv1) x_depthwise_conv = self.depthwise_conv(x_act) - x_bn = batch_norm(x_depthwise_conv) - x_swish = swish(x_bn) + x_bn = nn.batch_norm(x_depthwise_conv) + x_swish = nn.swish(x_bn) x_conv2 = self.positionwise_conv2(x_swish) return x_conv2 -class _ConformerConvSubsampleLayer(Module): +class _ConformerConvSubsampleLayer(nn.Module): """ Conv 2D block with optional max-pooling """ @@ -85,24 +84,24 @@ def __init__(self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[T self.dropout = dropout self.pool_sizes = pool_sizes - self.conv_layers = ModuleList() + self.conv_layers = nn.ModuleList() for filter_size, channel_size in zip(filter_sizes, channel_sizes): self.conv_layers.append( - Conv(l2=l2, activation=act, filter_size=filter_size, n_out=channel_size, padding=padding)) + nn.Conv(l2=l2, activation=act, filter_size=filter_size, n_out=channel_size, padding=padding)) def forward(self, inp: LayerRef) -> LayerRef: - x = split_dims(inp, axis='F', dims=(-1, 1)) + x = nn.split_dims(inp, axis='F', dims=(-1, 1)) for i, conv_layer in enumerate(self.conv_layers): x = conv_layer(x) if self.pool_sizes and i < len(self.pool_sizes): - x = pool(x, pool_size=self.pool_sizes[i], padding='same', mode='max') + x = nn.pool(x, pool_size=self.pool_sizes[i], padding='same', mode='max') if self.dropout: - x = dropout(x, dropout=self.dropout) - out = merge_dims(x, axes='static') + x = nn.dropout(x, dropout=self.dropout) + out = nn.merge_dims(x, axes='static') return out -class ConformerEncoderLayer(Module): +class ConformerEncoderLayer(nn.Module): """ Represents a conformer block """ @@ -135,36 +134,36 @@ def __init__(self, conv_kernel_size: Tuple[int], ff_act, ff_dim: int, dropout: f def forward(self, inp: LayerRef) -> LayerRef: # FFN - x_ffn1_ln = layer_norm(inp) + x_ffn1_ln = nn.layer_norm(inp) 
x_ffn1 = self.ffn1(x_ffn1_ln) - x_ffn1_out = 0.5 * dropout(x_ffn1, dropout=self.dropout) + inp + x_ffn1_out = 0.5 * nn.dropout(x_ffn1, dropout=self.dropout) + inp # MHSA - x_mhsa_ln = layer_norm(x_ffn1_out) + x_mhsa_ln = nn.layer_norm(x_ffn1_out) x_mhsa = self.mhsa_module(x_mhsa_ln) x_mhsa_out = x_mhsa + x_ffn1_out # Conv - x_conv_ln = layer_norm(x_mhsa_out) + x_conv_ln = nn.layer_norm(x_mhsa_out) x_conv = self.conv_module(x_conv_ln) - x_conv_out = dropout(x_conv, dropout=self.dropout) + x_mhsa_out + x_conv_out = nn.dropout(x_conv, dropout=self.dropout) + x_mhsa_out # FFN - x_ffn2_ln = layer_norm(x_conv_out) + x_ffn2_ln = nn.layer_norm(x_conv_out) x_ffn2 = self.ffn2(x_ffn2_ln) - x_ffn2_out = 0.5 * dropout(x_ffn2, dropout=self.dropout) + x_conv_out + x_ffn2_out = 0.5 * nn.dropout(x_ffn2, dropout=self.dropout) + x_conv_out # last LN layer - return layer_norm(x_ffn2_out) + return nn.layer_norm(x_ffn2_out) -class ConformerEncoder(Module): +class ConformerEncoder(nn.Module): """ Represents Conformer encoder architecture """ - def __init__(self, encoder_layer: Module, num_blocks: int, conv_kernel_size: Tuple[int, ...] = (32,), ff_act=swish, - ff_dim: int = 512, dropout: float = 0.1, att_dropout: float = 0.1, enc_key_dim: int = 256, + def __init__(self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: Tuple[int, ...] = (32,), + ff_act=nn.swish, ff_dim: int = 512, dropout: float = 0.1, att_dropout: float = 0.1, enc_key_dim: int = 256, att_n_heads: int = 4, l2: float = 0.0): """ :param encoder_layer: @@ -186,9 +185,9 @@ def __init__(self, encoder_layer: Module, num_blocks: int, conv_kernel_size: Tup filter_sizes=[(3, 3), (3, 3)], pool_sizes=[(2, 2), (2, 2)], channel_sizes=[enc_key_dim, enc_key_dim], l2=l2, dropout=dropout) - self.linear = Linear(n_out=enc_key_dim, l2=l2, with_bias=False) + self.linear = nn.Linear(n_out=enc_key_dim, l2=l2, with_bias=False) - self.conformer_blocks = ModuleList([ + self.conformer_blocks = nn.ModuleList([ encoder_layer( conv_kernel_size=conv_kernel_size, ff_act=ff_act, ff_dim=ff_dim, dropout=dropout, att_dropout=att_dropout, enc_key_dim=enc_key_dim, att_n_heads=att_n_heads, l2=l2 @@ -199,7 +198,7 @@ def __init__(self, encoder_layer: Module, num_blocks: int, conv_kernel_size: Tup def forward(self, inp: LayerRef) -> LayerRef: x_subsample = self.conv_subsample_layer(inp) x_linear = self.linear(x_subsample) - x = dropout(x_linear, dropout=self.dropout) + x = nn.dropout(x_linear, dropout=self.dropout) for conformer_block in self.conformer_blocks: x = conformer_block(x) return x From 83aeaa07d0721e061f831ae08ab8e9ec5f6a3342 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 12:23:46 +0100 Subject: [PATCH 04/33] better use sequential --- nn/conformer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 2d0a1fbc..33f542b7 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -187,7 +187,7 @@ def __init__(self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: self.linear = nn.Linear(n_out=enc_key_dim, l2=l2, with_bias=False) - self.conformer_blocks = nn.ModuleList([ + self.conformer_blocks = nn.Sequential([ encoder_layer( conv_kernel_size=conv_kernel_size, ff_act=ff_act, ff_dim=ff_dim, dropout=dropout, att_dropout=att_dropout, enc_key_dim=enc_key_dim, att_n_heads=att_n_heads, l2=l2 @@ -199,6 +199,5 @@ def forward(self, inp: LayerRef) -> LayerRef: x_subsample = self.conv_subsample_layer(inp) x_linear = self.linear(x_subsample) x = nn.dropout(x_linear, dropout=self.dropout) 
- for conformer_block in self.conformer_blocks: - x = conformer_block(x) + x = self.conformer_blocks(x) return x From 63700ab3ad460859f79dcc563709304a0ced5310 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 16:16:27 +0100 Subject: [PATCH 05/33] fix params naming --- nn/conformer.py | 52 ++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 33f542b7..eb046c51 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -14,10 +14,10 @@ class _PositionwiseFeedForward(nn.Module): FF -> Activation -> Dropout -> FF """ - def __init__(self, d_model: int, d_ff: int, dropout: float, activation, l2: float = 0.0): + def __init__(self, dim_model: int, dim_ff: int, dropout: float, activation, l2: float = 0.0): """ - :param d_model: - :param d_ff: + :param dim_model: + :param dim_ff: :param dropout: :param activation: :param l2: @@ -27,8 +27,8 @@ def __init__(self, d_model: int, d_ff: int, dropout: float, activation, l2: floa self.dropout = dropout self.activation = activation - self.linear1 = nn.Linear(n_out=d_ff, l2=l2) - self.linear2 = nn.Linear(n_out=d_model, l2=l2) + self.linear1 = nn.Linear(n_out=dim_ff, l2=l2) + self.linear2 = nn.Linear(n_out=dim_model, l2=l2) def forward(self, inp: LayerRef) -> LayerRef: return self.linear2(nn.dropout(self.activation(self.linear1(inp)), dropout=self.dropout)) @@ -40,17 +40,17 @@ class _ConformerConvBlock(nn.Module): FF -> GLU -> depthwise conv -> BN -> Swish -> FF """ - def __init__(self, d_model: int, kernel_size: Tuple[int], l2: float = 0.0): + def __init__(self, dim_model: int, kernel_size: Tuple[int], l2: float = 0.0): """ - :param d_model: + :param dim_model: :param kernel_size: :param l2: """ super().__init__() - self.positionwise_conv1 = nn.Linear(n_out=d_model * 2, l2=l2) - self.depthwise_conv = nn.Conv(n_out=d_model, filter_size=kernel_size, groups=d_model, l2=l2, padding='same') - self.positionwise_conv2 = nn.Linear(n_out=d_model, l2=l2) + self.positionwise_conv1 = nn.Linear(n_out=dim_model * 2, l2=l2) + self.depthwise_conv = nn.Conv(n_out=dim_model, filter_size=kernel_size, groups=dim_model, l2=l2, padding='same') + self.positionwise_conv2 = nn.Linear(n_out=dim_model, l2=l2) def forward(self, inp: LayerRef) -> LayerRef: x_conv1 = self.positionwise_conv1(inp) @@ -68,7 +68,7 @@ class _ConformerConvSubsampleLayer(nn.Module): """ def __init__(self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[Tuple[int, ...]], None], - channel_sizes: List[int], l2: float = 0.0, dropout: float = 0.3, act: str = 'relu', + channel_sizes: List[int], l2: float = 0.0, dropout: float = 0.3, activation: str = 'relu', padding: str = 'same'): """ :param filter_sizes: @@ -76,7 +76,7 @@ def __init__(self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[T :param channel_sizes: :param l2: :param dropout: - :param act: + :param activation: :param padding: """ super().__init__() @@ -87,7 +87,7 @@ def __init__(self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[T self.conv_layers = nn.ModuleList() for filter_size, channel_size in zip(filter_sizes, channel_sizes): self.conv_layers.append( - nn.Conv(l2=l2, activation=act, filter_size=filter_size, n_out=channel_size, padding=padding)) + nn.Conv(l2=l2, activation=activation, filter_size=filter_size, n_out=channel_size, padding=padding)) def forward(self, inp: LayerRef) -> LayerRef: x = nn.split_dims(inp, axis='F', dims=(-1, 1)) @@ -106,16 +106,16 @@ class 
ConformerEncoderLayer(nn.Module): Represents a conformer block """ - def __init__(self, conv_kernel_size: Tuple[int], ff_act, ff_dim: int, dropout: float, att_dropout: float, - enc_key_dim: int, att_n_heads: int, l2: float): + def __init__(self, conv_kernel_size: Tuple[int], activation_ff, dim_ff: int, dropout: float, att_dropout: float, + enc_key_dim: int, num_heads: int, l2: float): """ :param conv_kernel_size: - :param ff_act: + :param activation_ff: :param ff_dim: :param dropout: :param att_dropout: :param enc_key_dim: - :param att_n_heads: + :param num_heads: :param l2: """ super().__init__() @@ -123,14 +123,14 @@ def __init__(self, conv_kernel_size: Tuple[int], ff_act, ff_dim: int, dropout: f self.dropout = dropout self.ffn1 = _PositionwiseFeedForward( - d_model=enc_key_dim, d_ff=ff_dim, dropout=dropout, activation=ff_act, l2=l2) + dim_model=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff, l2=l2) self.ffn2 = _PositionwiseFeedForward( - d_model=enc_key_dim, d_ff=ff_dim, dropout=dropout, activation=ff_act, l2=l2) + dim_model=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff, l2=l2) - self.conv_module = _ConformerConvBlock(d_model=enc_key_dim, kernel_size=conv_kernel_size) + self.conv_module = _ConformerConvBlock(dim_model=enc_key_dim, kernel_size=conv_kernel_size) - self.mhsa_module = MultiheadAttention(d_model, att_n_heads, dropout=att_dropout) # TODO: to be implemented + self.mhsa_module = self.conv_module #MultiheadAttention(enc_key_dim, num_heads, dropout=att_dropout) # TODO: to be implemented def forward(self, inp: LayerRef) -> LayerRef: # FFN @@ -163,8 +163,8 @@ class ConformerEncoder(nn.Module): """ def __init__(self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: Tuple[int, ...] = (32,), - ff_act=nn.swish, ff_dim: int = 512, dropout: float = 0.1, att_dropout: float = 0.1, enc_key_dim: int = 256, - att_n_heads: int = 4, l2: float = 0.0): + activation_ff=nn.swish, dim_ff: int = 512, dropout: float = 0.1, att_dropout: float = 0.1, enc_key_dim: int = 256, + num_heads: int = 4, l2: float = 0.0): """ :param encoder_layer: :param num_blocks: @@ -189,8 +189,8 @@ def __init__(self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: self.conformer_blocks = nn.Sequential([ encoder_layer( - conv_kernel_size=conv_kernel_size, ff_act=ff_act, ff_dim=ff_dim, dropout=dropout, - att_dropout=att_dropout, enc_key_dim=enc_key_dim, att_n_heads=att_n_heads, l2=l2 + conv_kernel_size=conv_kernel_size, activation_ff=activation_ff, dim_ff=dim_ff, dropout=dropout, + att_dropout=att_dropout, enc_key_dim=enc_key_dim, num_heads=num_heads, l2=l2 ) for _ in range(num_blocks) ]) @@ -200,4 +200,4 @@ def forward(self, inp: LayerRef) -> LayerRef: x_linear = self.linear(x_subsample) x = nn.dropout(x_linear, dropout=self.dropout) x = self.conformer_blocks(x) - return x + return x \ No newline at end of file From aaffdc885b1e89ee75399bd9bc13a412fbe5ffc4 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 16:44:39 +0100 Subject: [PATCH 06/33] fix batch_norm and glu --- nn/conformer.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index eb046c51..6599a4df 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -40,7 +40,9 @@ class _ConformerConvBlock(nn.Module): FF -> GLU -> depthwise conv -> BN -> Swish -> FF """ - def __init__(self, dim_model: int, kernel_size: Tuple[int], l2: float = 0.0): + def __init__(self, dim_model: int, kernel_size: Tuple[int], l2: 
float = 0.0, batch_norm_eps: float = 1e-5, + batch_norm_momentum: float = 0.1, batch_norm_update_sample_only_in_training=True, + batch_norm_delay_sample_update=True, batch_norm_other_opts=None): """ :param dim_model: :param kernel_size: @@ -52,11 +54,23 @@ def __init__(self, dim_model: int, kernel_size: Tuple[int], l2: float = 0.0): self.depthwise_conv = nn.Conv(n_out=dim_model, filter_size=kernel_size, groups=dim_model, l2=l2, padding='same') self.positionwise_conv2 = nn.Linear(n_out=dim_model, l2=l2) + if batch_norm_other_opts is None: + batch_norm_other_opts = {} + + self.batch_norm = nn.BatchNorm(epsilon=batch_norm_eps, momentum=batch_norm_momentum, + update_sample_only_in_training=batch_norm_update_sample_only_in_training, + delay_sample_update=batch_norm_delay_sample_update, **batch_norm_other_opts) + + @staticmethod + def _glu(v: LayerRef): + a, b = nn.split(v, axis='F') + return a * nn.sigmoid(b) + def forward(self, inp: LayerRef) -> LayerRef: x_conv1 = self.positionwise_conv1(inp) - x_act = nn.glu(x_conv1) + x_act = self._glu(x_conv1) x_depthwise_conv = self.depthwise_conv(x_act) - x_bn = nn.batch_norm(x_depthwise_conv) + x_bn = self.batch_norm(x_depthwise_conv) x_swish = nn.swish(x_bn) x_conv2 = self.positionwise_conv2(x_swish) return x_conv2 From 86731d74e520b27eca7d94d934e72b69fefd1977 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 16:47:40 +0100 Subject: [PATCH 07/33] make conv kernel size int --- nn/conformer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 6599a4df..e4f1db0c 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -40,7 +40,7 @@ class _ConformerConvBlock(nn.Module): FF -> GLU -> depthwise conv -> BN -> Swish -> FF """ - def __init__(self, dim_model: int, kernel_size: Tuple[int], l2: float = 0.0, batch_norm_eps: float = 1e-5, + def __init__(self, dim_model: int, kernel_size: int, l2: float = 0.0, batch_norm_eps: float = 1e-5, batch_norm_momentum: float = 0.1, batch_norm_update_sample_only_in_training=True, batch_norm_delay_sample_update=True, batch_norm_other_opts=None): """ @@ -51,7 +51,7 @@ def __init__(self, dim_model: int, kernel_size: Tuple[int], l2: float = 0.0, bat super().__init__() self.positionwise_conv1 = nn.Linear(n_out=dim_model * 2, l2=l2) - self.depthwise_conv = nn.Conv(n_out=dim_model, filter_size=kernel_size, groups=dim_model, l2=l2, padding='same') + self.depthwise_conv = nn.Conv(n_out=dim_model, filter_size=(kernel_size,), groups=dim_model, l2=l2, padding='same') self.positionwise_conv2 = nn.Linear(n_out=dim_model, l2=l2) if batch_norm_other_opts is None: @@ -120,7 +120,7 @@ class ConformerEncoderLayer(nn.Module): Represents a conformer block """ - def __init__(self, conv_kernel_size: Tuple[int], activation_ff, dim_ff: int, dropout: float, att_dropout: float, + def __init__(self, conv_kernel_size: int, activation_ff, dim_ff: int, dropout: float, att_dropout: float, enc_key_dim: int, num_heads: int, l2: float): """ :param conv_kernel_size: @@ -176,7 +176,7 @@ class ConformerEncoder(nn.Module): Represents Conformer encoder architecture """ - def __init__(self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: Tuple[int, ...] 
= (32,), + def __init__(self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: int = 32, activation_ff=nn.swish, dim_ff: int = 512, dropout: float = 0.1, att_dropout: float = 0.1, enc_key_dim: int = 256, num_heads: int = 4, l2: float = 0.0): """ From 68946191ffae6a50ce5d4a376b6d251612a0bd10 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 16:48:47 +0100 Subject: [PATCH 08/33] fix bug --- nn/conformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nn/conformer.py b/nn/conformer.py index e4f1db0c..0c82c792 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -144,7 +144,7 @@ def __init__(self, conv_kernel_size: int, activation_ff, dim_ff: int, dropout: f self.conv_module = _ConformerConvBlock(dim_model=enc_key_dim, kernel_size=conv_kernel_size) - self.mhsa_module = self.conv_module #MultiheadAttention(enc_key_dim, num_heads, dropout=att_dropout) # TODO: to be implemented + self.mhsa_module = MultiheadAttention(enc_key_dim, num_heads, dropout=att_dropout) # TODO: to be implemented def forward(self, inp: LayerRef) -> LayerRef: # FFN From db58c9fe382626600dc0a7900e5b68329edae7ce Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 16:57:21 +0100 Subject: [PATCH 09/33] better batch_norm --- nn/conformer.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 0c82c792..a68664e0 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -41,8 +41,7 @@ class _ConformerConvBlock(nn.Module): """ def __init__(self, dim_model: int, kernel_size: int, l2: float = 0.0, batch_norm_eps: float = 1e-5, - batch_norm_momentum: float = 0.1, batch_norm_update_sample_only_in_training=True, - batch_norm_delay_sample_update=True, batch_norm_other_opts=None): + batch_norm_momentum: float = 0.1, batch_norm_other_opts=None): """ :param dim_model: :param kernel_size: @@ -56,10 +55,9 @@ def __init__(self, dim_model: int, kernel_size: int, l2: float = 0.0, batch_norm if batch_norm_other_opts is None: batch_norm_other_opts = {} - - self.batch_norm = nn.BatchNorm(epsilon=batch_norm_eps, momentum=batch_norm_momentum, - update_sample_only_in_training=batch_norm_update_sample_only_in_training, - delay_sample_update=batch_norm_delay_sample_update, **batch_norm_other_opts) + self.batch_norm = nn.BatchNorm( + epsilon=batch_norm_eps, momentum=batch_norm_momentum, update_sample_only_in_training=True, + delay_sample_update=True, **batch_norm_other_opts) @staticmethod def _glu(v: LayerRef): @@ -214,4 +212,4 @@ def forward(self, inp: LayerRef) -> LayerRef: x_linear = self.linear(x_subsample) x = nn.dropout(x_linear, dropout=self.dropout) x = self.conformer_blocks(x) - return x \ No newline at end of file + return x From 253405dbed4bed3414bde73ad118072a923be852 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 16:59:52 +0100 Subject: [PATCH 10/33] make separate calls --- nn/conformer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nn/conformer.py b/nn/conformer.py index a68664e0..7e4d8fe1 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -31,7 +31,11 @@ def __init__(self, dim_model: int, dim_ff: int, dropout: float, activation, l2: self.linear2 = nn.Linear(n_out=dim_model, l2=l2) def forward(self, inp: LayerRef) -> LayerRef: - return self.linear2(nn.dropout(self.activation(self.linear1(inp)), dropout=self.dropout)) + x_ff1 = self.linear1(inp) + x_act = self.activation(x_ff1) + x_drop = nn.dropout(x_act, dropout=self.dropout) + x_ff2 = 
self.linear2(x_drop) + return x_ff2 class _ConformerConvBlock(nn.Module): From 286c3e949c9ce8c9b1586e5ad42adf820644b993 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 17:01:35 +0100 Subject: [PATCH 11/33] remove l2 --- nn/conformer.py | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 7e4d8fe1..946de048 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -14,21 +14,20 @@ class _PositionwiseFeedForward(nn.Module): FF -> Activation -> Dropout -> FF """ - def __init__(self, dim_model: int, dim_ff: int, dropout: float, activation, l2: float = 0.0): + def __init__(self, dim_model: int, dim_ff: int, dropout: float, activation): """ :param dim_model: :param dim_ff: :param dropout: :param activation: - :param l2: """ super().__init__() self.dropout = dropout self.activation = activation - self.linear1 = nn.Linear(n_out=dim_ff, l2=l2) - self.linear2 = nn.Linear(n_out=dim_model, l2=l2) + self.linear1 = nn.Linear(n_out=dim_ff) + self.linear2 = nn.Linear(n_out=dim_model) def forward(self, inp: LayerRef) -> LayerRef: x_ff1 = self.linear1(inp) @@ -44,18 +43,17 @@ class _ConformerConvBlock(nn.Module): FF -> GLU -> depthwise conv -> BN -> Swish -> FF """ - def __init__(self, dim_model: int, kernel_size: int, l2: float = 0.0, batch_norm_eps: float = 1e-5, + def __init__(self, dim_model: int, kernel_size: int, batch_norm_eps: float = 1e-5, batch_norm_momentum: float = 0.1, batch_norm_other_opts=None): """ :param dim_model: :param kernel_size: - :param l2: """ super().__init__() - self.positionwise_conv1 = nn.Linear(n_out=dim_model * 2, l2=l2) - self.depthwise_conv = nn.Conv(n_out=dim_model, filter_size=(kernel_size,), groups=dim_model, l2=l2, padding='same') - self.positionwise_conv2 = nn.Linear(n_out=dim_model, l2=l2) + self.positionwise_conv1 = nn.Linear(n_out=dim_model * 2) + self.depthwise_conv = nn.Conv(n_out=dim_model, filter_size=(kernel_size,), groups=dim_model, padding='same') + self.positionwise_conv2 = nn.Linear(n_out=dim_model) if batch_norm_other_opts is None: batch_norm_other_opts = {} @@ -84,13 +82,12 @@ class _ConformerConvSubsampleLayer(nn.Module): """ def __init__(self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[Tuple[int, ...]], None], - channel_sizes: List[int], l2: float = 0.0, dropout: float = 0.3, activation: str = 'relu', + channel_sizes: List[int], dropout: float = 0.3, activation: str = 'relu', padding: str = 'same'): """ :param filter_sizes: :param pool_sizes: :param channel_sizes: - :param l2: :param dropout: :param activation: :param padding: @@ -103,7 +100,7 @@ def __init__(self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[T self.conv_layers = nn.ModuleList() for filter_size, channel_size in zip(filter_sizes, channel_sizes): self.conv_layers.append( - nn.Conv(l2=l2, activation=activation, filter_size=filter_size, n_out=channel_size, padding=padding)) + nn.Conv(activation=activation, filter_size=filter_size, n_out=channel_size, padding=padding)) def forward(self, inp: LayerRef) -> LayerRef: x = nn.split_dims(inp, axis='F', dims=(-1, 1)) @@ -123,7 +120,7 @@ class ConformerEncoderLayer(nn.Module): """ def __init__(self, conv_kernel_size: int, activation_ff, dim_ff: int, dropout: float, att_dropout: float, - enc_key_dim: int, num_heads: int, l2: float): + enc_key_dim: int, num_heads: int): """ :param conv_kernel_size: :param activation_ff: @@ -132,17 +129,16 @@ def __init__(self, conv_kernel_size: int, activation_ff, dim_ff: 
int, dropout: f :param att_dropout: :param enc_key_dim: :param num_heads: - :param l2: """ super().__init__() self.dropout = dropout self.ffn1 = _PositionwiseFeedForward( - dim_model=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff, l2=l2) + dim_model=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) self.ffn2 = _PositionwiseFeedForward( - dim_model=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff, l2=l2) + dim_model=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) self.conv_module = _ConformerConvBlock(dim_model=enc_key_dim, kernel_size=conv_kernel_size) @@ -180,7 +176,7 @@ class ConformerEncoder(nn.Module): def __init__(self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: int = 32, activation_ff=nn.swish, dim_ff: int = 512, dropout: float = 0.1, att_dropout: float = 0.1, enc_key_dim: int = 256, - num_heads: int = 4, l2: float = 0.0): + num_heads: int = 4): """ :param encoder_layer: :param num_blocks: @@ -191,7 +187,6 @@ def __init__(self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: :param att_dropout: :param enc_key_dim: :param att_n_heads: - :param l2: """ super().__init__() @@ -199,14 +194,14 @@ def __init__(self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: self.conv_subsample_layer = _ConformerConvSubsampleLayer( filter_sizes=[(3, 3), (3, 3)], pool_sizes=[(2, 2), (2, 2)], channel_sizes=[enc_key_dim, enc_key_dim], - l2=l2, dropout=dropout) + dropout=dropout) - self.linear = nn.Linear(n_out=enc_key_dim, l2=l2, with_bias=False) + self.linear = nn.Linear(n_out=enc_key_dim, with_bias=False) self.conformer_blocks = nn.Sequential([ encoder_layer( conv_kernel_size=conv_kernel_size, activation_ff=activation_ff, dim_ff=dim_ff, dropout=dropout, - att_dropout=att_dropout, enc_key_dim=enc_key_dim, num_heads=num_heads, l2=l2 + att_dropout=att_dropout, enc_key_dim=enc_key_dim, num_heads=num_heads ) for _ in range(num_blocks) ]) From fadd2b6c61858d26e566a4c55c09d729db79946c Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 17:04:23 +0100 Subject: [PATCH 12/33] use nn.LayerRef --- nn/conformer.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 946de048..63dbf32e 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -5,7 +5,6 @@ from typing import Tuple, List, Union from .. import nn -from . 
import LayerRef class _PositionwiseFeedForward(nn.Module): @@ -29,7 +28,7 @@ def __init__(self, dim_model: int, dim_ff: int, dropout: float, activation): self.linear1 = nn.Linear(n_out=dim_ff) self.linear2 = nn.Linear(n_out=dim_model) - def forward(self, inp: LayerRef) -> LayerRef: + def forward(self, inp: nn.LayerRef) -> nn.LayerRef: x_ff1 = self.linear1(inp) x_act = self.activation(x_ff1) x_drop = nn.dropout(x_act, dropout=self.dropout) @@ -62,11 +61,11 @@ def __init__(self, dim_model: int, kernel_size: int, batch_norm_eps: float = 1e- delay_sample_update=True, **batch_norm_other_opts) @staticmethod - def _glu(v: LayerRef): + def _glu(v: nn.LayerRef): a, b = nn.split(v, axis='F') return a * nn.sigmoid(b) - def forward(self, inp: LayerRef) -> LayerRef: + def forward(self, inp: nn.LayerRef) -> nn.LayerRef: x_conv1 = self.positionwise_conv1(inp) x_act = self._glu(x_conv1) x_depthwise_conv = self.depthwise_conv(x_act) @@ -102,7 +101,7 @@ def __init__(self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[T self.conv_layers.append( nn.Conv(activation=activation, filter_size=filter_size, n_out=channel_size, padding=padding)) - def forward(self, inp: LayerRef) -> LayerRef: + def forward(self, inp: nn.LayerRef) -> nn.LayerRef: x = nn.split_dims(inp, axis='F', dims=(-1, 1)) for i, conv_layer in enumerate(self.conv_layers): x = conv_layer(x) @@ -144,7 +143,7 @@ def __init__(self, conv_kernel_size: int, activation_ff, dim_ff: int, dropout: f self.mhsa_module = MultiheadAttention(enc_key_dim, num_heads, dropout=att_dropout) # TODO: to be implemented - def forward(self, inp: LayerRef) -> LayerRef: + def forward(self, inp: nn.LayerRef) -> nn.LayerRef: # FFN x_ffn1_ln = nn.layer_norm(inp) x_ffn1 = self.ffn1(x_ffn1_ln) @@ -206,7 +205,7 @@ def __init__(self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: for _ in range(num_blocks) ]) - def forward(self, inp: LayerRef) -> LayerRef: + def forward(self, inp: nn.LayerRef) -> nn.LayerRef: x_subsample = self.conv_subsample_layer(inp) x_linear = self.linear(x_subsample) x = nn.dropout(x_linear, dropout=self.dropout) From 7460c08298422ffa0e0964c9f2e7c591e49297d3 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 17:50:03 +0100 Subject: [PATCH 13/33] better sequential --- nn/conformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 63dbf32e..d4865a68 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -197,13 +197,13 @@ def __init__(self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: self.linear = nn.Linear(n_out=enc_key_dim, with_bias=False) - self.conformer_blocks = nn.Sequential([ + self.conformer_blocks = nn.Sequential( encoder_layer( conv_kernel_size=conv_kernel_size, activation_ff=activation_ff, dim_ff=dim_ff, dropout=dropout, att_dropout=att_dropout, enc_key_dim=enc_key_dim, num_heads=num_heads ) for _ in range(num_blocks) - ]) + ) def forward(self, inp: nn.LayerRef) -> nn.LayerRef: x_subsample = self.conv_subsample_layer(inp) From 04e534358bc819cce705d4be85800e16ef080ff0 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 17:58:02 +0100 Subject: [PATCH 14/33] better params naming --- nn/conformer.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index d4865a68..59875974 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -13,9 +13,9 @@ class _PositionwiseFeedForward(nn.Module): FF -> Activation -> Dropout -> FF """ - 
def __init__(self, dim_model: int, dim_ff: int, dropout: float, activation): + def __init__(self, out_dim: int, dim_ff: int, dropout: float, activation): """ - :param dim_model: + :param out_dim: :param dim_ff: :param dropout: :param activation: @@ -25,14 +25,14 @@ def __init__(self, dim_model: int, dim_ff: int, dropout: float, activation): self.dropout = dropout self.activation = activation - self.linear1 = nn.Linear(n_out=dim_ff) - self.linear2 = nn.Linear(n_out=dim_model) + self.linear_ff = nn.Linear(n_out=dim_ff) + self.linear_out = nn.Linear(n_out=out_dim) def forward(self, inp: nn.LayerRef) -> nn.LayerRef: - x_ff1 = self.linear1(inp) + x_ff1 = self.linear_ff(inp) x_act = self.activation(x_ff1) x_drop = nn.dropout(x_act, dropout=self.dropout) - x_ff2 = self.linear2(x_drop) + x_ff2 = self.linear_out(x_drop) return x_ff2 @@ -42,17 +42,17 @@ class _ConformerConvBlock(nn.Module): FF -> GLU -> depthwise conv -> BN -> Swish -> FF """ - def __init__(self, dim_model: int, kernel_size: int, batch_norm_eps: float = 1e-5, + def __init__(self, out_dim: int, kernel_size: int, batch_norm_eps: float = 1e-5, batch_norm_momentum: float = 0.1, batch_norm_other_opts=None): """ - :param dim_model: + :param out_dim: :param kernel_size: """ super().__init__() - self.positionwise_conv1 = nn.Linear(n_out=dim_model * 2) - self.depthwise_conv = nn.Conv(n_out=dim_model, filter_size=(kernel_size,), groups=dim_model, padding='same') - self.positionwise_conv2 = nn.Linear(n_out=dim_model) + self.positionwise_conv1 = nn.Linear(n_out=out_dim * 2) + self.depthwise_conv = nn.Conv(n_out=out_dim, filter_size=(kernel_size,), groups=out_dim, padding='same') + self.positionwise_conv2 = nn.Linear(n_out=out_dim) if batch_norm_other_opts is None: batch_norm_other_opts = {} @@ -134,14 +134,14 @@ def __init__(self, conv_kernel_size: int, activation_ff, dim_ff: int, dropout: f self.dropout = dropout self.ffn1 = _PositionwiseFeedForward( - dim_model=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) + out_dim=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) self.ffn2 = _PositionwiseFeedForward( - dim_model=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) + out_dim=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) - self.conv_module = _ConformerConvBlock(dim_model=enc_key_dim, kernel_size=conv_kernel_size) + self.conv_block = _ConformerConvBlock(out_dim=enc_key_dim, kernel_size=conv_kernel_size) - self.mhsa_module = MultiheadAttention(enc_key_dim, num_heads, dropout=att_dropout) # TODO: to be implemented + self.self_att = MultiheadAttention(enc_key_dim, num_heads, dropout=att_dropout) # TODO: to be implemented def forward(self, inp: nn.LayerRef) -> nn.LayerRef: # FFN @@ -151,12 +151,12 @@ def forward(self, inp: nn.LayerRef) -> nn.LayerRef: # MHSA x_mhsa_ln = nn.layer_norm(x_ffn1_out) - x_mhsa = self.mhsa_module(x_mhsa_ln) + x_mhsa = self.self_att(x_mhsa_ln) x_mhsa_out = x_mhsa + x_ffn1_out # Conv x_conv_ln = nn.layer_norm(x_mhsa_out) - x_conv = self.conv_module(x_conv_ln) + x_conv = self.conv_block(x_conv_ln) x_conv_out = nn.dropout(x_conv, dropout=self.dropout) + x_mhsa_out # FFN From 9b8f2e0e568bde6b3ff47c98e39d0c0ef17aecc2 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 4 Nov 2021 18:01:44 +0100 Subject: [PATCH 15/33] move and fix glu --- nn/conformer.py | 7 +------ nn/math_.py | 8 +++++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 59875974..eecf73ae 100644 --- 
a/nn/conformer.py +++ b/nn/conformer.py @@ -60,14 +60,9 @@ def __init__(self, out_dim: int, kernel_size: int, batch_norm_eps: float = 1e-5, epsilon=batch_norm_eps, momentum=batch_norm_momentum, update_sample_only_in_training=True, delay_sample_update=True, **batch_norm_other_opts) - @staticmethod - def _glu(v: nn.LayerRef): - a, b = nn.split(v, axis='F') - return a * nn.sigmoid(b) - def forward(self, inp: nn.LayerRef) -> nn.LayerRef: x_conv1 = self.positionwise_conv1(inp) - x_act = self._glu(x_conv1) + x_act = nn.glu(x_conv1) x_depthwise_conv = self.depthwise_conv(x_act) x_bn = self.batch_norm(x_depthwise_conv) x_swish = nn.swish(x_bn) diff --git a/nn/math_.py b/nn/math_.py index 64dbb0fd..793d40f1 100644 --- a/nn/math_.py +++ b/nn/math_.py @@ -35,9 +35,11 @@ def gelu(x: nn.LayerRef) -> nn.Layer: return _activation(x, activation="gelu") -def glu(x: LayerRef) -> Layer: - """GLU""" - return activation(x, activation='glu') +def glu(v: LayerRef): + """GLU https://arxiv.org/abs/1612.08083""" + from . import split + a, b = split(v, axis='F', num_splits=2) + return a * sigmoid(b) def exp(x: nn.LayerRef) -> nn.Layer: From 34ea6153aeb53107b2575af2b3ccd53a5a1f9fd8 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Fri, 5 Nov 2021 23:13:52 +0100 Subject: [PATCH 16/33] fix indent --- nn/conformer.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index eecf73ae..433d0476 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -42,8 +42,9 @@ class _ConformerConvBlock(nn.Module): FF -> GLU -> depthwise conv -> BN -> Swish -> FF """ - def __init__(self, out_dim: int, kernel_size: int, batch_norm_eps: float = 1e-5, - batch_norm_momentum: float = 0.1, batch_norm_other_opts=None): + def __init__( + self, out_dim: int, kernel_size: int, batch_norm_eps: float = 1e-5, batch_norm_momentum: float = 0.1, + batch_norm_other_opts=None): """ :param out_dim: :param kernel_size: @@ -75,9 +76,9 @@ class _ConformerConvSubsampleLayer(nn.Module): Conv 2D block with optional max-pooling """ - def __init__(self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[Tuple[int, ...]], None], - channel_sizes: List[int], dropout: float = 0.3, activation: str = 'relu', - padding: str = 'same'): + def __init__( + self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[Tuple[int, ...]], None], + channel_sizes: List[int], dropout: float = 0.3, activation: str = 'relu', padding: str = 'same'): """ :param filter_sizes: :param pool_sizes: @@ -92,6 +93,7 @@ def __init__(self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[T self.pool_sizes = pool_sizes self.conv_layers = nn.ModuleList() + assert len(filter_sizes) == len(channel_sizes) for filter_size, channel_size in zip(filter_sizes, channel_sizes): self.conv_layers.append( nn.Conv(activation=activation, filter_size=filter_size, n_out=channel_size, padding=padding)) @@ -113,8 +115,9 @@ class ConformerEncoderLayer(nn.Module): Represents a conformer block """ - def __init__(self, conv_kernel_size: int, activation_ff, dim_ff: int, dropout: float, att_dropout: float, - enc_key_dim: int, num_heads: int): + def __init__( + self, conv_kernel_size: int, activation_ff, dim_ff: int, dropout: float, att_dropout: float, enc_key_dim: int, + num_heads: int): """ :param conv_kernel_size: :param activation_ff: @@ -168,9 +171,9 @@ class ConformerEncoder(nn.Module): Represents Conformer encoder architecture """ - def __init__(self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: 
int = 32, - activation_ff=nn.swish, dim_ff: int = 512, dropout: float = 0.1, att_dropout: float = 0.1, enc_key_dim: int = 256, - num_heads: int = 4): + def __init__( + self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: int = 32, activation_ff=nn.swish, + dim_ff: int = 512, dropout: float = 0.1, att_dropout: float = 0.1, enc_key_dim: int = 256, num_heads: int = 4): """ :param encoder_layer: :param num_blocks: From c09bfcede5281d2f46d792aeeda43d462b54025d Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Fri, 5 Nov 2021 23:14:23 +0100 Subject: [PATCH 17/33] add return type for glu func --- nn/math_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nn/math_.py b/nn/math_.py index 793d40f1..7cb03a20 100644 --- a/nn/math_.py +++ b/nn/math_.py @@ -35,7 +35,7 @@ def gelu(x: nn.LayerRef) -> nn.Layer: return _activation(x, activation="gelu") -def glu(v: LayerRef): +def glu(v: LayerRef) -> Layer: """GLU https://arxiv.org/abs/1612.08083""" from . import split a, b = split(v, axis='F', num_splits=2) From c3891aad81a246f9884572b5fe3186302a5dd542 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Fri, 5 Nov 2021 23:25:32 +0100 Subject: [PATCH 18/33] use nn.activation instead of str for conv --- nn/conformer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 433d0476..6d4f094d 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -78,7 +78,7 @@ class _ConformerConvSubsampleLayer(nn.Module): def __init__( self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[Tuple[int, ...]], None], - channel_sizes: List[int], dropout: float = 0.3, activation: str = 'relu', padding: str = 'same'): + channel_sizes: List[int], dropout: float = 0.3, activation=nn.relu, padding: str = 'same'): """ :param filter_sizes: :param pool_sizes: @@ -91,17 +91,19 @@ def __init__( self.dropout = dropout self.pool_sizes = pool_sizes + self.activation = activation self.conv_layers = nn.ModuleList() assert len(filter_sizes) == len(channel_sizes) for filter_size, channel_size in zip(filter_sizes, channel_sizes): self.conv_layers.append( - nn.Conv(activation=activation, filter_size=filter_size, n_out=channel_size, padding=padding)) + nn.Conv(filter_size=filter_size, n_out=channel_size, padding=padding)) def forward(self, inp: nn.LayerRef) -> nn.LayerRef: x = nn.split_dims(inp, axis='F', dims=(-1, 1)) for i, conv_layer in enumerate(self.conv_layers): x = conv_layer(x) + x = self.activation(x) if self.pool_sizes and i < len(self.pool_sizes): x = nn.pool(x, pool_size=self.pool_sizes[i], padding='same', mode='max') if self.dropout: From e4887a5e00b99ede2f79b6a8d64e187124e4276a Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Fri, 5 Nov 2021 23:36:26 +0100 Subject: [PATCH 19/33] add param type for activation --- nn/conformer.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 6d4f094d..ad56d7e4 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -3,7 +3,7 @@ Ref: https://arxiv.org/abs/2005.08100 """ -from typing import Tuple, List, Union +from typing import Tuple, List, Union, Callable from .. 
import nn @@ -13,7 +13,7 @@ class _PositionwiseFeedForward(nn.Module): FF -> Activation -> Dropout -> FF """ - def __init__(self, out_dim: int, dim_ff: int, dropout: float, activation): + def __init__(self, out_dim: int, dim_ff: int, dropout: float, activation: Callable[[nn.LayerRef], nn.LayerRef]): """ :param out_dim: :param dim_ff: @@ -78,7 +78,8 @@ class _ConformerConvSubsampleLayer(nn.Module): def __init__( self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[Tuple[int, ...]], None], - channel_sizes: List[int], dropout: float = 0.3, activation=nn.relu, padding: str = 'same'): + channel_sizes: List[int], dropout: float = 0.3, activation: Callable[[nn.LayerRef], nn.LayerRef] = nn.relu, + padding: str = 'same'): """ :param filter_sizes: :param pool_sizes: @@ -118,8 +119,8 @@ class ConformerEncoderLayer(nn.Module): """ def __init__( - self, conv_kernel_size: int, activation_ff, dim_ff: int, dropout: float, att_dropout: float, enc_key_dim: int, - num_heads: int): + self, conv_kernel_size: int, activation_ff: Callable[[nn.LayerRef], nn.LayerRef], dim_ff: int, dropout: float, + att_dropout: float, enc_key_dim: int, num_heads: int): """ :param conv_kernel_size: :param activation_ff: @@ -174,8 +175,9 @@ class ConformerEncoder(nn.Module): """ def __init__( - self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: int = 32, activation_ff=nn.swish, - dim_ff: int = 512, dropout: float = 0.1, att_dropout: float = 0.1, enc_key_dim: int = 256, num_heads: int = 4): + self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: int = 32, + activation_ff: Callable[[nn.LayerRef], nn.LayerRef] = nn.swish, dim_ff: int = 512, dropout: float = 0.1, + att_dropout: float = 0.1, enc_key_dim: int = 256, num_heads: int = 4): """ :param encoder_layer: :param num_blocks: From 4226cb1c29e603fbbd418b527e4d538b2d11e301 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Sat, 6 Nov 2021 00:36:33 +0100 Subject: [PATCH 20/33] fix param types and indent --- nn/conformer.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index ad56d7e4..f515925b 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -3,7 +3,7 @@ Ref: https://arxiv.org/abs/2005.08100 """ -from typing import Tuple, List, Union, Callable +from typing import Tuple, List, Union, Callable, Optional from .. 
import nn @@ -43,8 +43,8 @@ class _ConformerConvBlock(nn.Module): """ def __init__( - self, out_dim: int, kernel_size: int, batch_norm_eps: float = 1e-5, batch_norm_momentum: float = 0.1, - batch_norm_other_opts=None): + self, out_dim: int, kernel_size: int, batch_norm_eps: float = 1e-5, batch_norm_momentum: float = 0.1, + batch_norm_other_opts=None): """ :param out_dim: :param kernel_size: @@ -77,9 +77,9 @@ class _ConformerConvSubsampleLayer(nn.Module): """ def __init__( - self, filter_sizes: List[Tuple[int, ...]], pool_sizes: Union[List[Tuple[int, ...]], None], - channel_sizes: List[int], dropout: float = 0.3, activation: Callable[[nn.LayerRef], nn.LayerRef] = nn.relu, - padding: str = 'same'): + self, filter_sizes: List[Tuple[int, int]], pool_sizes: Optional[List[Tuple[int, int]]], + channel_sizes: List[int], dropout: float = 0.3, activation: Callable[[nn.LayerRef], nn.LayerRef] = nn.relu, + padding: str = 'same'): """ :param filter_sizes: :param pool_sizes: @@ -119,8 +119,8 @@ class ConformerEncoderLayer(nn.Module): """ def __init__( - self, conv_kernel_size: int, activation_ff: Callable[[nn.LayerRef], nn.LayerRef], dim_ff: int, dropout: float, - att_dropout: float, enc_key_dim: int, num_heads: int): + self, conv_kernel_size: int, activation_ff: Callable[[nn.LayerRef], nn.LayerRef], dim_ff: int, dropout: float, + att_dropout: float, enc_key_dim: int, num_heads: int): """ :param conv_kernel_size: :param activation_ff: @@ -175,9 +175,9 @@ class ConformerEncoder(nn.Module): """ def __init__( - self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: int = 32, - activation_ff: Callable[[nn.LayerRef], nn.LayerRef] = nn.swish, dim_ff: int = 512, dropout: float = 0.1, - att_dropout: float = 0.1, enc_key_dim: int = 256, num_heads: int = 4): + self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: int = 32, + activation_ff: Callable[[nn.LayerRef], nn.LayerRef] = nn.swish, dim_ff: int = 512, dropout: float = 0.1, + att_dropout: float = 0.1, enc_key_dim: int = 256, num_heads: int = 4): """ :param encoder_layer: :param num_blocks: From fae6c9c4128f65a83110ecaf188de184ad52142a Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Sat, 6 Nov 2021 01:08:24 +0100 Subject: [PATCH 21/33] fix batch norm --- nn/conformer.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index f515925b..3287a159 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -3,8 +3,9 @@ Ref: https://arxiv.org/abs/2005.08100 """ -from typing import Tuple, List, Union, Callable, Optional +from typing import Tuple, List, Callable, Optional, Dict, Any from .. 
import nn +import copy class _PositionwiseFeedForward(nn.Module): @@ -42,9 +43,7 @@ class _ConformerConvBlock(nn.Module): FF -> GLU -> depthwise conv -> BN -> Swish -> FF """ - def __init__( - self, out_dim: int, kernel_size: int, batch_norm_eps: float = 1e-5, batch_norm_momentum: float = 0.1, - batch_norm_other_opts=None): + def __init__(self, out_dim: int, kernel_size: int, batch_norm_opts: Optional[Dict[str, Any]] = None): """ :param out_dim: :param kernel_size: @@ -55,11 +54,14 @@ def __init__( self.depthwise_conv = nn.Conv(n_out=out_dim, filter_size=(kernel_size,), groups=out_dim, padding='same') self.positionwise_conv2 = nn.Linear(n_out=out_dim) - if batch_norm_other_opts is None: - batch_norm_other_opts = {} + batch_norm_opts = copy.deepcopy(batch_norm_opts) + if batch_norm_opts is None: + batch_norm_opts = {} + epsilon = batch_norm_opts.pop('epsilon', 1e-5) + momentum = batch_norm_opts.pop('momentum', 0.1) self.batch_norm = nn.BatchNorm( - epsilon=batch_norm_eps, momentum=batch_norm_momentum, update_sample_only_in_training=True, - delay_sample_update=True, **batch_norm_other_opts) + epsilon=epsilon, momentum=momentum, update_sample_only_in_training=True, delay_sample_update=True, + **batch_norm_opts) def forward(self, inp: nn.LayerRef) -> nn.LayerRef: x_conv1 = self.positionwise_conv1(inp) @@ -120,7 +122,7 @@ class ConformerEncoderLayer(nn.Module): def __init__( self, conv_kernel_size: int, activation_ff: Callable[[nn.LayerRef], nn.LayerRef], dim_ff: int, dropout: float, - att_dropout: float, enc_key_dim: int, num_heads: int): + att_dropout: float, enc_key_dim: int, num_heads: int, batch_norm_opts: Optional[Dict[str, Any]]): """ :param conv_kernel_size: :param activation_ff: @@ -140,7 +142,8 @@ def __init__( self.ffn2 = _PositionwiseFeedForward( out_dim=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) - self.conv_block = _ConformerConvBlock(out_dim=enc_key_dim, kernel_size=conv_kernel_size) + self.conv_block = _ConformerConvBlock( + out_dim=enc_key_dim, kernel_size=conv_kernel_size, batch_norm_opts=batch_norm_opts) self.self_att = MultiheadAttention(enc_key_dim, num_heads, dropout=att_dropout) # TODO: to be implemented @@ -177,7 +180,8 @@ class ConformerEncoder(nn.Module): def __init__( self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: int = 32, activation_ff: Callable[[nn.LayerRef], nn.LayerRef] = nn.swish, dim_ff: int = 512, dropout: float = 0.1, - att_dropout: float = 0.1, enc_key_dim: int = 256, num_heads: int = 4): + att_dropout: float = 0.1, enc_key_dim: int = 256, num_heads: int = 4, + batch_norm_opts: Optional[Dict[str, Any]] = None): """ :param encoder_layer: :param num_blocks: @@ -202,7 +206,7 @@ def __init__( self.conformer_blocks = nn.Sequential( encoder_layer( conv_kernel_size=conv_kernel_size, activation_ff=activation_ff, dim_ff=dim_ff, dropout=dropout, - att_dropout=att_dropout, enc_key_dim=enc_key_dim, num_heads=num_heads + att_dropout=att_dropout, enc_key_dim=enc_key_dim, num_heads=num_heads, batch_norm_opts=batch_norm_opts ) for _ in range(num_blocks) ) From 614b9bb12d63e7685bb26d89ba523e1b9d018b87 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Sat, 6 Nov 2021 01:10:15 +0100 Subject: [PATCH 22/33] remove layer suffix from convsubsample module --- nn/conformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 3287a159..0d96edcb 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -73,7 +73,7 @@ def forward(self, inp: nn.LayerRef) -> nn.LayerRef: return 
x_conv2 -class _ConformerConvSubsampleLayer(nn.Module): +class _ConformerConvSubsample(nn.Module): """ Conv 2D block with optional max-pooling """ @@ -197,7 +197,7 @@ def __init__( self.dropout = dropout - self.conv_subsample_layer = _ConformerConvSubsampleLayer( + self.conv_subsample_layer = _ConformerConvSubsample( filter_sizes=[(3, 3), (3, 3)], pool_sizes=[(2, 2), (2, 2)], channel_sizes=[enc_key_dim, enc_key_dim], dropout=dropout) From 40e9ad1e0dae0f8c773417f2ad97fd2fd3c48cd2 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Sat, 6 Nov 2021 01:12:30 +0100 Subject: [PATCH 23/33] better list compr --- nn/conformer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 0d96edcb..1afa1345 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -206,10 +206,8 @@ def __init__( self.conformer_blocks = nn.Sequential( encoder_layer( conv_kernel_size=conv_kernel_size, activation_ff=activation_ff, dim_ff=dim_ff, dropout=dropout, - att_dropout=att_dropout, enc_key_dim=enc_key_dim, num_heads=num_heads, batch_norm_opts=batch_norm_opts - ) - for _ in range(num_blocks) - ) + att_dropout=att_dropout, enc_key_dim=enc_key_dim, num_heads=num_heads, batch_norm_opts=batch_norm_opts) + for _ in range(num_blocks)) def forward(self, inp: nn.LayerRef) -> nn.LayerRef: x_subsample = self.conv_subsample_layer(inp) From 6b1166be9ddaf0a0233a8ab4927b4622dbb33cec Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Sat, 6 Nov 2021 01:15:33 +0100 Subject: [PATCH 24/33] add axis as param for glu func --- nn/math_.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nn/math_.py b/nn/math_.py index 7cb03a20..16816171 100644 --- a/nn/math_.py +++ b/nn/math_.py @@ -35,10 +35,10 @@ def gelu(x: nn.LayerRef) -> nn.Layer: return _activation(x, activation="gelu") -def glu(v: LayerRef) -> Layer: +def glu(x: nn.LayerRef, axis: Optional[str] = "F") -> nn.Layer: """GLU https://arxiv.org/abs/1612.08083""" from . 
import split - a, b = split(v, axis='F', num_splits=2) + a, b = split(x, axis=axis, num_splits=2) return a * sigmoid(b) From a626bbbd983890882d158a680fcb018a21244d2e Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Sat, 6 Nov 2021 01:47:39 +0100 Subject: [PATCH 25/33] set None by default for optional params --- nn/conformer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 1afa1345..cf78fae0 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -79,9 +79,9 @@ class _ConformerConvSubsample(nn.Module): """ def __init__( - self, filter_sizes: List[Tuple[int, int]], pool_sizes: Optional[List[Tuple[int, int]]], - channel_sizes: List[int], dropout: float = 0.3, activation: Callable[[nn.LayerRef], nn.LayerRef] = nn.relu, - padding: str = 'same'): + self, filter_sizes: List[Tuple[int, int]], channel_sizes: List[int], + pool_sizes: Optional[List[Tuple[int, int]]] = None, dropout: float = 0.3, + activation: Callable[[nn.LayerRef], nn.LayerRef] = nn.relu, padding: str = 'same'): """ :param filter_sizes: :param pool_sizes: @@ -122,7 +122,7 @@ class ConformerEncoderLayer(nn.Module): def __init__( self, conv_kernel_size: int, activation_ff: Callable[[nn.LayerRef], nn.LayerRef], dim_ff: int, dropout: float, - att_dropout: float, enc_key_dim: int, num_heads: int, batch_norm_opts: Optional[Dict[str, Any]]): + att_dropout: float, enc_key_dim: int, num_heads: int, batch_norm_opts: Optional[Dict[str, Any]] = None): """ :param conv_kernel_size: :param activation_ff: From a0b398077b7e6333020cb57aec0abfa646e30b28 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Sat, 6 Nov 2021 18:02:11 +0100 Subject: [PATCH 26/33] remove default val for dropout --- nn/conformer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index cf78fae0..39daab78 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -79,9 +79,9 @@ class _ConformerConvSubsample(nn.Module): """ def __init__( - self, filter_sizes: List[Tuple[int, int]], channel_sizes: List[int], - pool_sizes: Optional[List[Tuple[int, int]]] = None, dropout: float = 0.3, - activation: Callable[[nn.LayerRef], nn.LayerRef] = nn.relu, padding: str = 'same'): + self, filter_sizes: List[Tuple[int, int]], channel_sizes: List[int], dropout: float, + pool_sizes: Optional[List[Tuple[int, int]]] = None, activation: Callable[[nn.LayerRef], nn.LayerRef] = nn.relu, + padding: str = 'same'): """ :param filter_sizes: :param pool_sizes: From 8ec724d84070a0a6ade7643a5fda2c476085a900 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Sun, 7 Nov 2021 12:14:08 +0100 Subject: [PATCH 27/33] fix bn + conformer encoder --- nn/conformer.py | 60 ++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 39daab78..7c8b995b 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -3,7 +3,7 @@ Ref: https://arxiv.org/abs/2005.08100 """ -from typing import Tuple, List, Callable, Optional, Dict, Any +from typing import Tuple, List, Callable, Optional, Dict, Any, Union from .. 
import nn import copy @@ -54,14 +54,12 @@ def __init__(self, out_dim: int, kernel_size: int, batch_norm_opts: Optional[Dic self.depthwise_conv = nn.Conv(n_out=out_dim, filter_size=(kernel_size,), groups=out_dim, padding='same') self.positionwise_conv2 = nn.Linear(n_out=out_dim) - batch_norm_opts = copy.deepcopy(batch_norm_opts) if batch_norm_opts is None: batch_norm_opts = {} - epsilon = batch_norm_opts.pop('epsilon', 1e-5) - momentum = batch_norm_opts.pop('momentum', 0.1) - self.batch_norm = nn.BatchNorm( - epsilon=epsilon, momentum=momentum, update_sample_only_in_training=True, delay_sample_update=True, - **batch_norm_opts) + batch_norm_opts = batch_norm_opts.copy() + batch_norm_opts.setdefault('epsilon', 1e-5) + batch_norm_opts.setdefault('momentum', 0.1) + self.batch_norm = nn.BatchNorm(update_sample_only_in_training=True, delay_sample_update=True, **batch_norm_opts) def forward(self, inp: nn.LayerRef) -> nn.LayerRef: x_conv1 = self.positionwise_conv1(inp) @@ -84,9 +82,9 @@ def __init__( padding: str = 'same'): """ :param filter_sizes: - :param pool_sizes: :param channel_sizes: :param dropout: + :param pool_sizes: :param activation: :param padding: """ @@ -121,29 +119,32 @@ class ConformerEncoderLayer(nn.Module): """ def __init__( - self, conv_kernel_size: int, activation_ff: Callable[[nn.LayerRef], nn.LayerRef], dim_ff: int, dropout: float, - att_dropout: float, enc_key_dim: int, num_heads: int, batch_norm_opts: Optional[Dict[str, Any]] = None): + self, conv_kernel_size: int = 32, activation_ff: Callable[[nn.LayerRef], nn.LayerRef] = nn.swish, + dim_ff: int = 2048, dropout: float = 0.1, att_dropout: float = 0.1, out_dim: int = 512, num_heads: int = 8, + batch_norm_opts: Optional[Dict[str, Any]] = None): """ :param conv_kernel_size: :param activation_ff: - :param ff_dim: + :param dim_ff: :param dropout: :param att_dropout: - :param enc_key_dim: + :param out_dim: :param num_heads: + :param batch_norm_opts: """ super().__init__() self.dropout = dropout + self.out_dim = out_dim self.ffn1 = _PositionwiseFeedForward( - out_dim=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) + out_dim=out_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) self.ffn2 = _PositionwiseFeedForward( - out_dim=enc_key_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) + out_dim=out_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) self.conv_block = _ConformerConvBlock( - out_dim=enc_key_dim, kernel_size=conv_kernel_size, batch_norm_opts=batch_norm_opts) + out_dim=out_dim, kernel_size=conv_kernel_size, batch_norm_opts=batch_norm_opts) self.self_att = MultiheadAttention(enc_key_dim, num_heads, dropout=att_dropout) # TODO: to be implemented @@ -178,40 +179,27 @@ class ConformerEncoder(nn.Module): """ def __init__( - self, encoder_layer: nn.Module, num_blocks: int, conv_kernel_size: int = 32, - activation_ff: Callable[[nn.LayerRef], nn.LayerRef] = nn.swish, dim_ff: int = 512, dropout: float = 0.1, - att_dropout: float = 0.1, enc_key_dim: int = 256, num_heads: int = 4, - batch_norm_opts: Optional[Dict[str, Any]] = None): + self, encoder_layer: Union[ConformerEncoderLayer, Any], num_blocks: int): """ :param encoder_layer: :param num_blocks: - :param conv_kernel_size: - :param ff_act: - :param ff_dim: - :param dropout: - :param att_dropout: - :param enc_key_dim: - :param att_n_heads: """ super().__init__() - self.dropout = dropout + self.dropout = getattr(encoder_layer, 'dropout') + out_dim = getattr(encoder_layer, 'out_dim') self.conv_subsample_layer = 
_ConformerConvSubsample( - filter_sizes=[(3, 3), (3, 3)], pool_sizes=[(2, 2), (2, 2)], channel_sizes=[enc_key_dim, enc_key_dim], - dropout=dropout) + filter_sizes=[(3, 3), (3, 3)], pool_sizes=[(2, 2), (2, 2)], channel_sizes=[out_dim, out_dim], + dropout=self.dropout) - self.linear = nn.Linear(n_out=enc_key_dim, with_bias=False) + self.linear = nn.Linear(n_out=out_dim, with_bias=False) - self.conformer_blocks = nn.Sequential( - encoder_layer( - conv_kernel_size=conv_kernel_size, activation_ff=activation_ff, dim_ff=dim_ff, dropout=dropout, - att_dropout=att_dropout, enc_key_dim=enc_key_dim, num_heads=num_heads, batch_norm_opts=batch_norm_opts) - for _ in range(num_blocks)) + self.layers = nn.Sequential(copy.deepcopy(encoder_layer) for _ in range(num_blocks)) def forward(self, inp: nn.LayerRef) -> nn.LayerRef: x_subsample = self.conv_subsample_layer(inp) x_linear = self.linear(x_subsample) x = nn.dropout(x_linear, dropout=self.dropout) - x = self.conformer_blocks(x) + x = self.layers(x) return x From 0f11719539baa63d2e0d1ea16372987874052640 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Sun, 7 Nov 2021 12:19:58 +0100 Subject: [PATCH 28/33] fix indent --- nn/conformer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 7c8b995b..812c74dd 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -178,8 +178,7 @@ class ConformerEncoder(nn.Module): Represents Conformer encoder architecture """ - def __init__( - self, encoder_layer: Union[ConformerEncoderLayer, Any], num_blocks: int): + def __init__(self, encoder_layer: Union[ConformerEncoderLayer, Any], num_blocks: int): """ :param encoder_layer: :param num_blocks: From c4d1a441fe5dd10d85a178cf2f6d595f6c85ae33 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 8 Nov 2021 12:09:31 +0100 Subject: [PATCH 29/33] add docs + make classes public --- nn/conformer.py | 73 +++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 812c74dd..f10a451a 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -8,7 +8,7 @@ import copy -class _PositionwiseFeedForward(nn.Module): +class ConformerPositionwiseFeedForward(nn.Module): """ Conformer position-wise feedforward neural network layer FF -> Activation -> Dropout -> FF @@ -16,10 +16,10 @@ class _PositionwiseFeedForward(nn.Module): def __init__(self, out_dim: int, dim_ff: int, dropout: float, activation: Callable[[nn.LayerRef], nn.LayerRef]): """ - :param out_dim: - :param dim_ff: - :param dropout: - :param activation: + :param out_dim: output feature dimension + :param dim_ff: dimension of the feed-forward layers + :param dropout: dropout value + :param activation: activation function """ super().__init__() @@ -37,7 +37,7 @@ def forward(self, inp: nn.LayerRef) -> nn.LayerRef: return x_ff2 -class _ConformerConvBlock(nn.Module): +class ConformerConvBlock(nn.Module): """ Conformer convolution block FF -> GLU -> depthwise conv -> BN -> Swish -> FF @@ -45,8 +45,9 @@ class _ConformerConvBlock(nn.Module): def __init__(self, out_dim: int, kernel_size: int, batch_norm_opts: Optional[Dict[str, Any]] = None): """ - :param out_dim: - :param kernel_size: + :param out_dim: output feature dimension + :param kernel_size: kernel size of depthwise convolution + :param batch_norm_opts: batch norm options """ super().__init__() @@ -71,7 +72,7 @@ def forward(self, inp: nn.LayerRef) -> nn.LayerRef: return x_conv2 -class _ConformerConvSubsample(nn.Module): +class 
ConformerConvSubsample(nn.Module): """ Conv 2D block with optional max-pooling """ @@ -81,12 +82,12 @@ def __init__( pool_sizes: Optional[List[Tuple[int, int]]] = None, activation: Callable[[nn.LayerRef], nn.LayerRef] = nn.relu, padding: str = 'same'): """ - :param filter_sizes: - :param channel_sizes: - :param dropout: - :param pool_sizes: - :param activation: - :param padding: + :param filter_sizes: a list of filter sizes for the conv layer + :param channel_sizes: the number of output channels + :param dropout: the dropout value + :param pool_sizes: a list of pooling factors applied after conv layer + :param activation: the activation function + :param padding: 'same' or 'valid' """ super().__init__() @@ -123,30 +124,30 @@ def __init__( dim_ff: int = 2048, dropout: float = 0.1, att_dropout: float = 0.1, out_dim: int = 512, num_heads: int = 8, batch_norm_opts: Optional[Dict[str, Any]] = None): """ - :param conv_kernel_size: - :param activation_ff: - :param dim_ff: - :param dropout: - :param att_dropout: - :param out_dim: - :param num_heads: - :param batch_norm_opts: + :param conv_kernel_size: the kernel size of depthwise convolution + :param activation_ff: activation funtion for feed-forward network + :param dim_ff: the dimension of feed-forward layers + :param dropout: the dropout value + :param att_dropout: attention dropout value + :param out_dim: the output feature dimension + :param num_heads: the number of attention heads + :param batch_norm_opts: batch norm options """ super().__init__() self.dropout = dropout self.out_dim = out_dim - self.ffn1 = _PositionwiseFeedForward( + self.ffn1 = ConformerPositionwiseFeedForward( out_dim=out_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) - self.ffn2 = _PositionwiseFeedForward( + self.ffn2 = ConformerPositionwiseFeedForward( out_dim=out_dim, dim_ff=dim_ff, dropout=dropout, activation=activation_ff) - self.conv_block = _ConformerConvBlock( + self.conv_block = ConformerConvBlock( out_dim=out_dim, kernel_size=conv_kernel_size, batch_norm_opts=batch_norm_opts) - self.self_att = MultiheadAttention(enc_key_dim, num_heads, dropout=att_dropout) # TODO: to be implemented + self.self_att = MultiheadAttention(out_dim, num_heads, dropout=att_dropout) # TODO: to be implemented def forward(self, inp: nn.LayerRef) -> nn.LayerRef: # FFN @@ -178,23 +179,23 @@ class ConformerEncoder(nn.Module): Represents Conformer encoder architecture """ - def __init__(self, encoder_layer: Union[ConformerEncoderLayer, Any], num_blocks: int): + def __init__(self, encoder_layer: ConformerEncoderLayer, num_layers: int): """ - :param encoder_layer: - :param num_blocks: + :param encoder_layer: an instance of `class:ConformerEncoderLayer` + :param num_layers: the number of encoder layers """ super().__init__() - self.dropout = getattr(encoder_layer, 'dropout') - out_dim = getattr(encoder_layer, 'out_dim') + self.dropout = encoder_layer.dropout + self.out_dim = encoder_layer.out_dim - self.conv_subsample_layer = _ConformerConvSubsample( - filter_sizes=[(3, 3), (3, 3)], pool_sizes=[(2, 2), (2, 2)], channel_sizes=[out_dim, out_dim], + self.conv_subsample_layer = ConformerConvSubsample( + filter_sizes=[(3, 3), (3, 3)], pool_sizes=[(2, 2), (2, 2)], channel_sizes=[self.out_dim, self.out_dim], dropout=self.dropout) - self.linear = nn.Linear(n_out=out_dim, with_bias=False) + self.linear = nn.Linear(n_out=self.out_dim, with_bias=False) - self.layers = nn.Sequential(copy.deepcopy(encoder_layer) for _ in range(num_blocks)) + self.layers = 
nn.Sequential(copy.deepcopy(encoder_layer) for _ in range(num_layers)) def forward(self, inp: nn.LayerRef) -> nn.LayerRef: x_subsample = self.conv_subsample_layer(inp) From ef512664e32ed9da45bb14bd80f07d51bb407d67 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 8 Nov 2021 13:49:53 +0100 Subject: [PATCH 30/33] fix docs --- nn/conformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index f10a451a..d810990f 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -131,7 +131,7 @@ def __init__( :param att_dropout: attention dropout value :param out_dim: the output feature dimension :param num_heads: the number of attention heads - :param batch_norm_opts: batch norm options + :param batch_norm_opts: passed to :class:`nn.BatchNorm` """ super().__init__() @@ -181,7 +181,7 @@ class ConformerEncoder(nn.Module): def __init__(self, encoder_layer: ConformerEncoderLayer, num_layers: int): """ - :param encoder_layer: an instance of `class:ConformerEncoderLayer` + :param encoder_layer: an instance of :class:`ConformerEncoderLayer` :param num_layers: the number of encoder layers """ super().__init__() From 687a6f7c8bb1c43b2e7f1ec9a904afbab261aba9 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 9 Nov 2021 19:57:52 +0100 Subject: [PATCH 31/33] add self att --- nn/conformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nn/conformer.py b/nn/conformer.py index d810990f..59103b99 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -147,7 +147,7 @@ def __init__( self.conv_block = ConformerConvBlock( out_dim=out_dim, kernel_size=conv_kernel_size, batch_norm_opts=batch_norm_opts) - self.self_att = MultiheadAttention(out_dim, num_heads, dropout=att_dropout) # TODO: to be implemented + self.self_att = nn.SelfAttention(axis='T', key_dim_total=out_dim, value_dim_total=out_dim, num_heads=num_heads) def forward(self, inp: nn.LayerRef) -> nn.LayerRef: # FFN From c3d605fbf268deb46d9d2e2eaacba2bc55db02e2 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 9 Nov 2021 19:58:19 +0100 Subject: [PATCH 32/33] cleanup --- nn/conformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nn/conformer.py b/nn/conformer.py index 59103b99..1eaa413c 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -3,7 +3,7 @@ Ref: https://arxiv.org/abs/2005.08100 """ -from typing import Tuple, List, Callable, Optional, Dict, Any, Union +from typing import Tuple, List, Callable, Optional, Dict, Any from .. import nn import copy From 89bc71b79738c6df9133d4f3e4977623d4391561 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 9 Nov 2021 20:45:48 +0100 Subject: [PATCH 33/33] Update nn/conformer.py --- nn/conformer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nn/conformer.py b/nn/conformer.py index 1eaa413c..45b11277 100644 --- a/nn/conformer.py +++ b/nn/conformer.py @@ -57,9 +57,6 @@ def __init__(self, out_dim: int, kernel_size: int, batch_norm_opts: Optional[Dic if batch_norm_opts is None: batch_norm_opts = {} - batch_norm_opts = batch_norm_opts.copy() - batch_norm_opts.setdefault('epsilon', 1e-5) - batch_norm_opts.setdefault('momentum', 0.1) self.batch_norm = nn.BatchNorm(update_sample_only_in_training=True, delay_sample_update=True, **batch_norm_opts) def forward(self, inp: nn.LayerRef) -> nn.LayerRef:
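
A few usage notes on the code as it stands at the end of this series.

The glu helper (with the axis parameter added above) splits its input into two equal halves along the given axis and gates one half with the sigmoid of the other, following Dauphin et al., https://arxiv.org/abs/1612.08083. The standalone NumPy sketch below only illustrates the arithmetic; the real helper operates on nn.LayerRef objects via split and sigmoid, and it requires an even size along the split axis:

    import numpy as np

    def glu_reference(x: np.ndarray, axis: int = -1) -> np.ndarray:
        """Gated linear unit: split x in two along `axis` and return a * sigmoid(b)."""
        a, b = np.split(x, 2, axis=axis)
        return a * (1.0 / (1.0 + np.exp(-b)))

    # (batch, time, feature) input with an even feature dim of 8 -> output feature dim 4
    x = np.random.randn(3, 10, 8)
    assert glu_reference(x).shape == (3, 10, 4)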
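
The subsampling front end wired into ConformerEncoder uses ConformerConvSubsample with pool_sizes=[(2, 2), (2, 2)] and padding='same', so the time axis (and the split-out feature axis) is halved twice, i.e. the conformer blocks run on sequences subsampled by a factor of 4 in time. A small sketch of the length arithmetic, assuming TensorFlow-style 'same' pooling where the output length is the ceiling of the input length over the stride:

    import math

    def subsampled_time_len(num_frames: int, pool_factors=(2, 2)) -> int:
        """Frames left after the two (2, 2) max-pooling steps, time axis only."""
        for factor in pool_factors:
            num_frames = math.ceil(num_frames / factor)
        return num_frames

    assert subsampled_time_len(1000) == 250
    assert subsampled_time_len(999) == 250  # odd lengths round up at each step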
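
Putting it together, the public entry points are ConformerEncoderLayer (defaults out_dim=512, num_heads=8, dim_ff=2048, conv_kernel_size=32, dropout and att_dropout 0.1) and ConformerEncoder, which deep-copies the given layer num_layers times behind the conv subsampling front end and a bias-free linear projection. A minimal usage sketch follows; the import path, the name data for the input nn.LayerRef, the 12-layer depth and the batch_norm_opts values are illustrative assumptions, not anything fixed by this module (the epsilon/momentum values simply reproduce what earlier revisions in this series set as defaults):

    # import path assumed; adjust to wherever this nn package is installed
    from returnn_common.nn.conformer import ConformerEncoderLayer, ConformerEncoder

    # one template block; ConformerEncoder deep-copies it num_layers times
    block = ConformerEncoderLayer(
        out_dim=512, num_heads=8, dim_ff=2048, conv_kernel_size=32,
        dropout=0.1, att_dropout=0.1,
        batch_norm_opts={'epsilon': 1e-5, 'momentum': 0.1})  # forwarded to nn.BatchNorm

    encoder = ConformerEncoder(encoder_layer=block, num_layers=12)

    enc_out = encoder(data)  # data: an existing nn.LayerRef with the input features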