diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py
index 26a354565a8..6c416088bcd 100644
--- a/paddlespeech/t2s/modules/conformer/encoder_layer.py
+++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py
@@ -113,7 +113,6 @@ def forward(self, x_input, mask, cache=None):
             x, pos_emb = x_input[0], x_input[1]
         else:
             x, pos_emb = x_input, None
-
         skip_layer = False
         # with stochastic depth, residual connection `x + f(x)` becomes
         # `x <- x + 1 / (1 - p) * f(x)` at training time.
@@ -121,14 +120,12 @@ def forward(self, x_input, mask, cache=None):
         if self.training and self.stochastic_depth_rate > 0:
             skip_layer = paddle.rand(1).item() < self.stochastic_depth_rate
             stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
-
         if skip_layer:
             if cache is not None:
                 x = paddle.concat([cache, x], axis=1)
             if pos_emb is not None:
                 return (x, pos_emb), mask
             return x, mask
-
         # whether to use macaron style
         if self.feed_forward_macaron is not None:
             residual = x
@@ -138,7 +135,6 @@ def forward(self, x_input, mask, cache=None):
                 self.feed_forward_macaron(x))
             if not self.normalize_before:
                 x = self.norm_ff_macaron(x)
-
         # multi-headed self-attention module
         residual = x
         if self.normalize_before:
diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py
index e3c9a992ad2..3237be1b6bd 100644
--- a/paddlespeech/t2s/modules/transformer/attention.py
+++ b/paddlespeech/t2s/modules/transformer/attention.py
@@ -103,7 +103,7 @@ def forward_attention(self, value, scores, mask=None):
             mask = paddle.logical_not(mask)
             # assume scores.dtype==paddle.float32, we only use "float32" here
             dtype = str(scores.dtype).split(".")[-1]
-            min_value = numpy.finfo(dtype).min
+            min_value = float(numpy.finfo(dtype).min)
             scores = masked_fill(scores, mask, min_value)
         # (batch, head, time1, time2)
         self.attn = softmax(scores)
@@ -192,12 +192,11 @@ def rel_shift(self, x):
         x_padded = paddle.concat([zero_pad, x], axis=-1)
         x_padded = x_padded.reshape([b, h, t2 + 1, t1])
         # only keep the positions from 0 to time2
-        x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1]
-
+        new_t = paddle.cast(paddle.floor(t2 / 2) + 1, dtype='int32')
+        x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :new_t]
         if self.zero_triu:
             ones = paddle.ones((t1, t2))
             x = x * paddle.tril(ones, t2 - t1)[None, None, :, :]
-
         return x
 
     def forward(self, query, key, value, pos_emb, mask):
@@ -221,7 +220,6 @@ def forward(self, query, key, value, pos_emb, mask):
         q, k, v = self.forward_qkv(query, key, value)
         # (batch, time1, head, d_k)
         q = q.transpose([0, 2, 1, 3])
-
         n_batch_pos = paddle.shape(pos_emb)[0]
         p = self.linear_pos(pos_emb).reshape(
             [n_batch_pos, -1, self.h, self.d_k])
diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py
index 7ba301cbd6a..f90eb44a455 100644
--- a/paddlespeech/t2s/modules/transformer/embedding.py
+++ b/paddlespeech/t2s/modules/transformer/embedding.py
@@ -198,7 +198,8 @@ def forward(self, x: paddle.Tensor):
         x = x * self.xscale
         T = paddle.shape(x)[1]
         pe_size = paddle.shape(self.pe)
-        pos_emb = self.pe[:, pe_size[1] // 2 - T + 1:pe_size[1] // 2 + T, ]
+        tmp = paddle.cast(paddle.floor(pe_size[1] / 2), dtype='int32')
+        pos_emb = self.pe[:, tmp - T + 1:tmp + T, ]
         return self.dropout(x), self.dropout(pos_emb)
 
 
diff --git a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
index 91d67ca5837..a322becd072 100644
--- a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
+++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
@@ -69,8 +69,8 @@ def forward(self, x):
             Tensor: Batch of output tensors (B, T, in_chans).
         """
         x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
-        return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
-            [0, 2, 1])
+        out = self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose([0, 2, 1])
+        return out
 
 
 class Conv1dLinear(nn.Layer):
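
Note (not part of the patch): the recurring change above replaces Python-level floor division on shape values (`t2 // 2 + 1`, `pe_size[1] // 2`) with tensor ops via `paddle.cast(paddle.floor(... / 2), dtype='int32')`, and passes the numpy mask-fill scalar on as a plain Python float. Below is a minimal sketch checking that both substitutions are value-preserving; it assumes Paddle 2.x semantics where `/` on an integer tensor returns a float tensor (the same behaviour the patched code relies on), and the shapes used are made up for illustration.

import numpy
import paddle

# Illustration only -- not part of the patch; shapes are arbitrary.

# 1) Old Python-level index `t2 // 2 + 1` vs. the patched tensor-level
#    `paddle.cast(paddle.floor(t2 / 2) + 1, dtype='int32')`.
t2 = paddle.shape(paddle.zeros([1, 4, 5, 9]))[-1]             # int32 Tensor holding 9
new_t = paddle.cast(paddle.floor(t2 / 2) + 1, dtype='int32')
assert new_t.item() == t2.item() // 2 + 1                     # both evaluate to 5

# 2) Old `numpy.finfo(dtype).min` (a numpy scalar) vs. the patched
#    `float(numpy.finfo(dtype).min)` (a plain Python float with the same value).
dtype = "float32"
min_value = float(numpy.finfo(dtype).min)
assert min_value == numpy.finfo(dtype).min and isinstance(min_value, float)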