
decoder

Origin: Sashi Novitasari · Modification: Heli Qi · Affiliation: NAIST · Date: 2022.07

Conv1dEv

Bases: Module

A 1D convolutional layer with support for different padding modes.

Attributes:

    cutoff (bool):
        Indicates whether the output should be cut off for the 'same' padding mode.
    causal_padding (int):
        Additional padding required for the 'causal' padding mode.
    dilation (int):
        The dilation rate of the convolutional layer.
    conv_lyr (torch.nn.Conv1d):
        The 1D convolutional layer.

Source code in speechain/module/prenet/conv1d.py
class Conv1dEv(torch.nn.Module):
    """A 1D convolutional layer with support for different padding modes.

    Attributes:
        cutoff (bool):
            Indicates whether the output should be cut off for the 'same' padding mode.
        causal_padding (int):
            Additional padding required for the 'causal' padding mode.
        dilation (int):
            The dilation rate of the convolutional layer.
        conv_lyr (torch.nn.Conv1d):
            The 1D convolutional layer.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        padding_mode: str = "same",
        bias: bool = True,
        use_weight_norm: bool = False,
        groups: int = 1,
    ):
        """Initializes the Conv1dEv module with the specified parameters.

        Args:
            in_channels (int):
                Number of channels in the input feature.
            out_channels (int):
                Number of channels produced by the convolution.
            kernel_size (int):
                Size of the convolutional kernel.
            stride (int, optional):
                Stride of the convolution. Defaults to 1.
            dilation (int, optional):
                The dilation rate of the kernel. Defaults to 1.
            padding_mode (str, optional):
                Padding mode. Supported values are 'valid', 'full', 'same' and 'causal'. Defaults to 'same'.
            bias (bool, optional):
                If True, adds a learnable bias to the output. Defaults to True.
            use_weight_norm (bool, optional):
                If True, applies weight normalization to the convolutional layer. Defaults to False.
            groups (int, optional):
                Number of blocked connections from input channels to output channels. Defaults to 1.

        Raises:
            ValueError: If an unsupported padding mode is specified.
        """
        super().__init__()

        self.cutoff = False
        self.causal_padding = 0
        self.dilation = dilation

        # no padding is used
        if padding_mode == "valid":
            padding = 0
        # full padding
        elif padding_mode == "full":
            padding = dilation * (kernel_size - 1)
        # 'same' padding: the output has the same length as the input
        elif padding_mode == "same":
            assert stride == 1, "Stride should be 1 for 'same' padding mode"
            if kernel_size % 2 == 0:
                padding = dilation * kernel_size // 2
                self.cutoff = True
            else:
                padding = dilation * (kernel_size - 1) // 2
        # causal padding
        elif padding_mode == "causal":
            padding = 0
            self.causal_padding = dilation * (kernel_size - 1)
        else:
            raise ValueError(
                "Unsupported padding mode. Supported modes are 'valid', 'full', 'same' and 'causal'."
            )

        self.conv_lyr = torch.nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding,
            bias=bias,
            groups=groups,
        )
        if use_weight_norm:
            self.conv_lyr = weight_norm(self.conv_lyr)

    def forward(self, feat: torch.Tensor):
        """Performs a forward pass through the convolutional layer.

        Args:
            feat (torch.Tensor):
                The input feature tensor. Shape: (batch, feat_dim, feat_maxlen).

        Returns:
            torch.Tensor:
                The output tensor. Shape: (batch, out_channels, output_len).
        """
        # prepend additional padding at the start of the sequence for the 'causal' padding mode
        if self.causal_padding > 0:
            feat = F.pad(feat, (self.causal_padding, 0))
        output = self.conv_lyr(feat)
        # cut off the redundant tails for the 'same' padding mode
        if self.cutoff:
            output = output[:, :, : -self.dilation]
        return output
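
As a quick illustration of the padding modes above, here is a minimal usage sketch (the import path follows the source location shown above; the tensor sizes are arbitrary):

import torch
from speechain.module.prenet.conv1d import Conv1dEv

feat = torch.randn(8, 80, 100)  # (batch, feat_dim, feat_maxlen)

# 'same' padding keeps the time dimension unchanged, even for an even kernel size
same_conv = Conv1dEv(in_channels=80, out_channels=256, kernel_size=4, padding_mode="same")
print(same_conv(feat).shape)    # torch.Size([8, 256, 100])

# 'causal' padding also preserves the length, but each output frame only sees current and past inputs
causal_conv = Conv1dEv(in_channels=80, out_channels=256, kernel_size=3, padding_mode="causal")
print(causal_conv(feat).shape)  # torch.Size([8, 256, 100])

# 'valid' applies no padding, so the time dimension shrinks by dilation * (kernel_size - 1)
valid_conv = Conv1dEv(in_channels=80, out_channels=256, kernel_size=3, padding_mode="valid")
print(valid_conv(feat).shape)   # torch.Size([8, 256, 98])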

__init__(in_channels, out_channels, kernel_size, stride=1, dilation=1, padding_mode='same', bias=True, use_weight_norm=False, groups=1)

Initializes the Conv1dEv module with the specified parameters.

Parameters:

    in_channels (int, required):
        Number of channels in the input feature.
    out_channels (int, required):
        Number of channels produced by the convolution.
    kernel_size (int, required):
        Size of the convolutional kernel.
    stride (int, default 1):
        Stride of the convolution.
    dilation (int, default 1):
        The dilation rate of the kernel.
    padding_mode (str, default 'same'):
        Padding mode. Supported values are 'valid', 'full', 'same' and 'causal'.
    bias (bool, default True):
        If True, adds a learnable bias to the output.
    use_weight_norm (bool, default False):
        If True, applies weight normalization to the convolutional layer.
    groups (int, default 1):
        Number of blocked connections from input channels to output channels.

Raises:

    ValueError:
        If an unsupported padding mode is specified.

Source code in speechain/module/prenet/conv1d.py
def __init__(
    self,
    in_channels: int,
    out_channels: int,
    kernel_size: int,
    stride: int = 1,
    dilation: int = 1,
    padding_mode: str = "same",
    bias: bool = True,
    use_weight_norm: bool = False,
    groups: int = 1,
):
    """Initializes the Conv1dEv module with the specified parameters.

    Args:
        in_channels (int):
            Number of channels in the input feature.
        out_channels (int):
            Number of channels produced by the convolution.
        kernel_size (int):
            Size of the convolutional kernel.
        stride (int, optional):
            Stride of the convolution. Defaults to 1.
        dilation (int, optional):
            The dilation rate of the kernel. Defaults to 1.
        padding_mode (str, optional):
            Padding mode. Supported values are 'valid', 'full', 'same' and 'causal'. Defaults to 'same'.
        bias (bool, optional):
            If True, adds a learnable bias to the output. Defaults to True.
        use_weight_norm (bool, optional):
            If True, applies weight normalization to the convolutional layer. Defaults to False.
        groups (int, optional):
            Number of blocked connections from input channels to output channels. Defaults to 1.

    Raises:
        ValueError: If an unsupported padding mode is specified.
    """
    super().__init__()

    self.cutoff = False
    self.causal_padding = 0
    self.dilation = dilation

    # no padding is used
    if padding_mode == "valid":
        padding = 0
    # full padding
    elif padding_mode == "full":
        padding = dilation * (kernel_size - 1)
    # 'same' padding: the output has the same length as the input
    elif padding_mode == "same":
        assert stride == 1, "Stride should be 1 for 'same' padding mode"
        if kernel_size % 2 == 0:
            padding = dilation * kernel_size // 2
            self.cutoff = True
        else:
            padding = dilation * (kernel_size - 1) // 2
    # causal padding
    elif padding_mode == "causal":
        padding = 0
        self.causal_padding = dilation * (kernel_size - 1)
    else:
        raise ValueError(
            "Unsupported padding mode. Supported modes are 'valid', 'full', 'same' and 'causal'."
        )

    self.conv_lyr = torch.nn.Conv1d(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=kernel_size,
        stride=stride,
        dilation=dilation,
        padding=padding,
        bias=bias,
        groups=groups,
    )
    if use_weight_norm:
        self.conv_lyr = weight_norm(self.conv_lyr)

forward(feat)

Performs a forward pass through the convolutional layer.

Parameters:

    feat (torch.Tensor, required):
        The input feature tensor. Shape: (batch, feat_dim, feat_maxlen).

Returns:

    torch.Tensor:
        The output tensor. Shape: (batch, out_channels, output_len).

Source code in speechain/module/prenet/conv1d.py
def forward(self, feat: torch.Tensor):
    """Performs a forward pass through the convolutional layer.

    Args:
        feat (torch.Tensor):
            The input feature tensor. Shape: (batch, feat_dim, feat_maxlen).

    Returns:
        torch.Tensor:
            The output tensor. Shape: (batch, out_channels, output_len).
    """
    # prepend additional padding at the start of the sequence for the 'causal' padding mode
    if self.causal_padding > 0:
        feat = F.pad(feat, (self.causal_padding, 0))
    output = self.conv_lyr(feat)
    # cut off the redundant tails for the 'same' padding mode
    if self.cutoff:
        output = output[:, :, : -self.dilation]
    return output

MultiHeadedAttention

Bases: Module

A Multi-Head Attention layer has:

· Query linear layer
· Key linear layer
· Value linear layer
· Softmax layer
· Attention Dropout layer
· Output linear layer

Implementation modified from OpenNMT-py. https://github.com/OpenNMT/OpenNMT-py

Source code in speechain/module/transformer/attention.py
class MultiHeadedAttention(Module):
    """
    A Multi-Head Attention layer has:
        · Query linear layer
        · Key linear layer
        · Value linear layer
        · Softmax layer
        · Attention Dropout layer
        · Output linear layer

    Implementation modified from OpenNMT-py.
    https://github.com/OpenNMT/OpenNMT-py
    """

    def module_init(
        self,
        num_heads: int,
        d_model: int,
        dropout: float = 0.1,
        scale_dp_by_head: bool = False,
    ):
        """Create a multi-headed attention layer.

        Args:
            num_heads:
                The number of heads
            d_model:
                Model size (must be divisible by num_heads)
            dropout:
                The dropout rate of the Dropout layer after the softmax operation
            scale_dp_by_head:
                If True, scale the attention scores by 1 / sqrt(d_model / num_heads) instead of 1 / sqrt(d_model)
        """
        assert d_model % num_heads == 0, "d_model is not divisible by num_heads!"

        self.head_size = d_model // num_heads
        self.d_model = d_model
        self.num_heads = num_heads

        self.k_layer = nn.Linear(d_model, num_heads * self.head_size)
        self.v_layer = nn.Linear(d_model, num_heads * self.head_size)
        self.q_layer = nn.Linear(d_model, num_heads * self.head_size)

        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(dropout)
        self.output_layer = nn.Linear(d_model, d_model)

        self.scale = (
            1 / math.sqrt(self.head_size)
            if scale_dp_by_head
            else 1 / math.sqrt(self.d_model)
        )

    def kvq_forward(self, k: torch.Tensor, v: torch.Tensor, q: torch.Tensor):

        batch_size = k.size(0)

        # project the queries (q), keys (k), and values (v)
        k = self.k_layer(k)
        v = self.v_layer(v)
        q = self.q_layer(q)

        # separate all heads of q, k, v
        k = k.view(batch_size, -1, self.num_heads, self.head_size).transpose(1, 2)
        v = v.view(batch_size, -1, self.num_heads, self.head_size).transpose(1, 2)
        q = q.view(batch_size, -1, self.num_heads, self.head_size).transpose(1, 2)

        return k, v, q

    def attention_forward(
        self, v: torch.Tensor, scores: torch.Tensor, mask: torch.Tensor
    ):

        # apply the mask (if we have one)
        # the [B, 1, M] mask gets a head dimension so it broadcasts as [B, 1, 1, M]
        if mask is not None:
            scores = scores.masked_fill(~mask.unsqueeze(1), float("-inf"))

        # apply attention dropout and compute context vectors.
        attention = self.softmax(scores)
        score_soft = attention.clone()
        attention = self.dropout(attention)

        # get context vector (select values with attention) and reshape
        # back to [B, M, D]
        context = torch.matmul(attention, v)
        context = (
            context.transpose(1, 2)
            .contiguous()
            .view(v.size(0), -1, self.num_heads * self.head_size)
        )

        output = self.output_layer(context)

        return output, score_soft

    def forward(
        self,
        k: torch.Tensor,
        v: torch.Tensor,
        q: torch.Tensor,
        mask: torch.Tensor = None,
    ):
        """Computes multi-headed attention.

        Args:
            k: keys   [B, M, D] with M being the sentence length.
            v: values [B, M, D]
            q: query  [B, M, D]
            mask: optional mask [B, 1, M]

        Returns:
            A tuple of the attention output with shape [B, M, D] and the softmax attention weights.
        """

        k, v, q = self.kvq_forward(k, v, q)

        # compute scaled attention scores
        scores = torch.matmul(q, k.transpose(2, 3)) * self.scale

        return self.attention_forward(v, scores, mask)
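
The snippet below is a plain-PyTorch sketch of the computation performed by kvq_forward and attention_forward: head splitting, scaled dot-product scores (with the default 1 / sqrt(d_model) scale), masked softmax, and merging the heads back. It does not use the speechain Module base class, and the tensor sizes are arbitrary:

import math
import torch

B, M, d_model, num_heads = 2, 5, 512, 8
head_size = d_model // num_heads

q, k, v = (torch.randn(B, M, d_model) for _ in range(3))
mask = torch.ones(B, 1, M, dtype=torch.bool)   # [B, 1, M], True marks valid positions

# split the model dimension into heads: [B, M, D] -> [B, num_heads, M, head_size]
split = lambda x: x.view(B, -1, num_heads, head_size).transpose(1, 2)
q, k, v = split(q), split(k), split(v)

# scaled dot-product scores: [B, num_heads, M, M]
scores = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(d_model)
scores = scores.masked_fill(~mask.unsqueeze(1), float("-inf"))

# attention weights, context vectors, and merging the heads back to [B, M, D]
attention = torch.softmax(scores, dim=-1)
context = torch.matmul(attention, v).transpose(1, 2).contiguous().view(B, M, d_model)
print(context.shape)  # torch.Size([2, 5, 512])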

forward(k, v, q, mask=None)

Computes multi-headed attention.

Parameters:

    k (torch.Tensor, required):
        Keys [B, M, D] with M being the sentence length.
    v (torch.Tensor, required):
        Values [B, M, D].
    q (torch.Tensor, required):
        Queries [B, M, D].
    mask (torch.Tensor, default None):
        Optional mask [B, 1, M].

Returns:

    A tuple of the attention output with shape [B, M, D] and the softmax attention weights.

Source code in speechain/module/transformer/attention.py
def forward(
    self,
    k: torch.Tensor,
    v: torch.Tensor,
    q: torch.Tensor,
    mask: torch.Tensor = None,
):
    """Computes multi-headed attention.

    Args:
        k: keys   [B, M, D] with M being the sentence length.
        v: values [B, M, D]
        q: query  [B, M, D]
        mask: optional mask [B, 1, M]

    Returns:
        A tuple of the attention output with shape [B, M, D] and the softmax attention weights.
    """

    k, v, q = self.kvq_forward(k, v, q)

    # compute scaled attention scores
    scores = torch.matmul(q, k.transpose(2, 3)) * self.scale

    return self.attention_forward(v, scores, mask)

module_init(num_heads, d_model, dropout=0.1, scale_dp_by_head=False)

Create a multi-headed attention layer.

Parameters:

    num_heads (int, required):
        The number of heads.
    d_model (int, required):
        Model size (must be divisible by num_heads).
    dropout (float, default 0.1):
        The dropout rate of the Dropout layer after the softmax operation.
    scale_dp_by_head (bool, default False):
        If True, the attention scores are scaled by 1 / sqrt(d_model / num_heads) instead of 1 / sqrt(d_model).
Source code in speechain/module/transformer/attention.py
def module_init(
    self,
    num_heads: int,
    d_model: int,
    dropout: float = 0.1,
    scale_dp_by_head: bool = False,
):
    """Create a multi-headed attention layer.

    Args:
        num_heads:
            The number of heads
        d_model:
            Model size (must be divisible by num_heads)
        dropout:
            The dropout rate of the Dropout layer after the softmax operation
        scale_dp_by_head:
            If True, scale the attention scores by 1 / sqrt(d_model / num_heads) instead of 1 / sqrt(d_model)
    """
    assert d_model % num_heads == 0, "d_model is not divisible by num_heads!"

    self.head_size = d_model // num_heads
    self.d_model = d_model
    self.num_heads = num_heads

    self.k_layer = nn.Linear(d_model, num_heads * self.head_size)
    self.v_layer = nn.Linear(d_model, num_heads * self.head_size)
    self.q_layer = nn.Linear(d_model, num_heads * self.head_size)

    self.softmax = nn.Softmax(dim=-1)
    self.dropout = nn.Dropout(dropout)
    self.output_layer = nn.Linear(d_model, d_model)

    self.scale = (
        1 / math.sqrt(self.head_size)
        if scale_dp_by_head
        else 1 / math.sqrt(self.d_model)
    )

PositionalEncoding

Bases: Module

Pre-compute position encodings (PE).

In forward pass, this module adds the positional encodings to the embedded feature vectors to make the Transformer aware of the positional information of the sequences.

Source code in speechain/module/transformer/pos_enc.py
class PositionalEncoding(Module):
    """Pre-compute position encodings (PE).

    In forward pass, this module adds the positional encodings to the embedded feature
    vectors to make the Transformer aware of the positional information of the
    sequences.
    """

    def module_init(
        self,
        posenc_type: str = "mix",
        d_model: int = 512,
        emb_scale: bool = False,
        emb_layernorm: bool = False,
        posenc_scale: bool = False,
        init_alpha: float = 1.0,
        max_len: int = 5000,
        dropout: float = 0.0,
    ):
        """Positional Encoding with maximum length max_len.

        Args:
            posenc_type: str
                The type of positional encoding (must be either 'mix' or 'sep').
                For the 'mix' type, sin is applied to the odd dimensions and cos is applied to the even dimensions.
                The equations are as below:
                    PE(pos, 2i) = sin(pos / 10000^{2i / d_model}), i ∈ {0, ..., d_model / 2 - 1}
                    PE(pos, 2i + 1) = cos(pos / 10000^{2i / d_model}), i ∈ {0, ..., d_model / 2 - 1}
                    Reference:
                        'Attention Is All You Need'
                        https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
                For the 'sep' type, sin is applied to the first half of dimensions and cos is applied to the second half
                of dimensions. The equations are as below:
                    PE(pos, i) = sin(pos / 10000^{2i / d_model}), i ∈ {0, ..., d_model / 2 - 1}
                    PE(pos, i) = cos(pos / 10000^{2i / d_model}), i ∈ {d_model / 2, ..., d_model - 1}
                    Reference:
                        'Speech-transformer: a no-recurrence sequence-to-sequence model for speech recognition'
                        https://ieeexplore.ieee.org/abstract/document/8462506/
            d_model: int
                The dimension of the hidden feature vectors of the Transformer layers.
            emb_scale: bool
                Controls whether the embedding vectors are scaled up by sqrt(d_model) before adding into the positional
                encoding or not.
                References:
                    Section 3.4 in 'Attention Is All You Need'
                    https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
                In most cases, we don't recommend you to turn it on especially when you don't have a large training set
                (e.g. LibriSpeech-train_clean_100) because it may make your model hard to converge. Please consider it
                only when you want to emphasize the embedded features over the positional encodings.
            emb_layernorm: bool
                Controls whether the embedding vectors are normalized by LayerNorm before adding into the positional
                encoding or not.
            posenc_scale: bool
                Controls whether the positional encodings are scaled up by a trainable scalar before adding into the
                embedded features or not.
                Reference:
                    'Neural Speech Synthesis with Transformer Network'
                    https://ojs.aaai.org/index.php/AAAI/article/view/4642/4520
            init_alpha: float
                The initial value of the alpha used for positional encoding scaling.
                Only effective when posenc_scale is True.
            max_len: int
                The maximum length of the input feature sequences.
            dropout: float
                The dropout rate for the Dropout layer after adding the positional encoding to the input
        """

        assert posenc_type in [
            "mix",
            "sep",
        ], f"The type of PositionalEncoding layer must be either 'mix' or 'sep', but got type={posenc_type}!"
        assert (
            d_model % 2 == 0
        ), f"Cannot apply sin/cos positional encoding to the vectors with odd dimensions (got d_model={d_model:d})."

        self.posenc_type = posenc_type
        self.d_model = d_model
        self.emb_scale = emb_scale
        if emb_layernorm:
            self.emb_layernorm = torch.nn.LayerNorm(d_model)

        self.init_alpha = (
            init_alpha if isinstance(init_alpha, float) else float(init_alpha)
        )
        if posenc_scale:
            self.alpha = torch.nn.Parameter(torch.tensor(self.init_alpha))

        # positional encoding matrix
        self.update_posenc(max_len)

        # positional encoding Dropout layer
        self.dropout = torch.nn.Dropout(p=dropout)

    def reset_parameters(self):
        """Make sure that the scalar value is not influenced by different model
        initialization methods."""
        if hasattr(self, "alpha"):
            self.alpha.data = torch.tensor(self.init_alpha)

    def update_posenc(self, max_len: int):
        """

        Args:
            max_len:

        """

        # positional encoding calculation
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float)
            * (math.log(10000.0) / self.d_model)
        )
        posenc = torch.zeros(max_len, self.d_model)

        # 'mix' positional encoding: sine functions and cosine functions mix up with each other
        if self.posenc_type == "mix":
            posenc[:, 0::2] = torch.sin(position / div_term)
            posenc[:, 1::2] = torch.cos(position / div_term)
        # 'sep' positional encoding: sine functions and cosine functions occupy the positional encoding separately
        elif self.posenc_type == "sep":
            div_term_ext = torch.exp(
                torch.arange(self.d_model, self.d_model * 2, 2, dtype=torch.float)
                * (math.log(10000.0) / self.d_model)
            )
            posenc[:, : int(self.d_model / 2)] = torch.sin(position / div_term)
            posenc[:, int(self.d_model / 2) :] = torch.cos(position / div_term_ext)

        # posenc = posenc.unsqueeze(0) does not put posenc into the buffer
        # here register_buffer() allows posenc to be automatically put onto GPUs as a buffer member
        self.register_buffer("posenc", posenc.unsqueeze(0))

    def forward(self, emb_feat: torch.Tensor):
        """Embedded feature.

            -> LayerNorm(Embedded feature)
                -> LayerNorm(Embedded feature) * sqrt(d_model)
                    -> LayerNorm(Embedded feature) * sqrt(d_model) + Positional Encoding * learnable scalar
                        -> Dropout(LayerNorm(Embedded feature) * sqrt(d_model) + Positional Encoding * learnable scalar)

        Args:
            emb_feat: (batch_size, seq_len, d_model)
                Embedded input feature sequences

        Returns:
            Embedded input feature sequences with positional encoding
        """
        # in case that the input sequence is longer than the preset max_len
        if emb_feat.size(1) > self.posenc.size(1):
            self.update_posenc(emb_feat.size(1))

        # 1. (optional) normalize the embedded feature by LayerNorm
        if hasattr(self, "emb_layernorm"):
            emb_feat = self.emb_layernorm(emb_feat)

        # 2. (optional) scale the embedded feature up by sqrt(d_model)
        if self.emb_scale:
            emb_feat *= math.sqrt(self.d_model)

        # 3. (optional) scale the positional encoding vectors
        posenc = self.posenc[:, : emb_feat.size(1)]
        if hasattr(self, "alpha"):
            # avoid posenc *= self.alpha to protect the original positional encoding
            posenc = posenc * self.alpha

        # 4. (mandatory) add positional encoding into embedded feature and apply the dropout
        return self.dropout(emb_feat + posenc)

    def get_recordable_para(self) -> Dict or None:
        if hasattr(self, "alpha"):
            return dict(alpha=self.alpha)
        else:
            return None

    def extra_repr(self) -> str:
        return f"emb_scale={self.emb_scale}\n" f"posenc_scale={hasattr(self, 'alpha')}"

forward(emb_feat)

Embedded feature.

-> LayerNorm(Embedded feature)
    -> LayerNorm(Embedded feature) * sqrt(d_model)
        -> LayerNorm(Embedded feature) * sqrt(d_model) + Positional Encoding * learnable scalar
            -> Dropout(LayerNorm(Embedded feature) * sqrt(d_model) + Positional Encoding * learnable scalar)

Parameters:

    emb_feat (torch.Tensor, required):
        (batch_size, seq_len, d_model) Embedded input feature sequences.

Returns:

    Embedded input feature sequences with positional encoding.

Source code in speechain/module/transformer/pos_enc.py
def forward(self, emb_feat: torch.Tensor):
    """Embedded feature.

        -> LayerNorm(Embedded feature)
            -> LayerNorm(Embedded feature) * sqrt(d_model)
                -> LayerNorm(Embedded feature) * sqrt(d_model) + Positional Encoding * learnable scalar
                    -> Dropout(LayerNorm(Embedded feature) * sqrt(d_model) + Positional Encoding * learnable scalar)

    Args:
        emb_feat: (batch_size, seq_len, d_model)
            Embedded input feature sequences

    Returns:
        Embedded input feature sequences with positional encoding
    """
    # in case that the input sequence is longer than the preset max_len
    if emb_feat.size(1) > self.posenc.size(1):
        self.update_posenc(emb_feat.size(1))

    # 1. (optional) normalize the embedded feature by LayerNorm
    if hasattr(self, "emb_layernorm"):
        emb_feat = self.emb_layernorm(emb_feat)

    # 2. (optional) scale the embedded feature up by sqrt(d_model)
    if self.emb_scale:
        emb_feat *= math.sqrt(self.d_model)

    # 3. (optional) scale the positional encoding vectors
    posenc = self.posenc[:, : emb_feat.size(1)]
    if hasattr(self, "alpha"):
        # avoid posenc *= self.alpha to protect the original positional encoding
        posenc = posenc * self.alpha

    # 4. (mandatory) add positional encoding into embedded feature and apply the dropout
    return self.dropout(emb_feat + posenc)

module_init(posenc_type='mix', d_model=512, emb_scale=False, emb_layernorm=False, posenc_scale=False, init_alpha=1.0, max_len=5000, dropout=0.0)

Positional Encoding with maximum length max_len.

Parameters:

    posenc_type (str, default 'mix'):
        The type of positional encoding (must be either 'mix' or 'sep').
        For the 'mix' type, sin is applied to the even-indexed dimensions and cos to the odd-indexed dimensions:
            PE(pos, 2i) = sin(pos / 10000^{2i / d_model}), i ∈ {0, ..., d_model / 2 - 1}
            PE(pos, 2i + 1) = cos(pos / 10000^{2i / d_model}), i ∈ {0, ..., d_model / 2 - 1}
        Reference: 'Attention Is All You Need', https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
        For the 'sep' type, sin is applied to the first half of the dimensions and cos to the second half:
            PE(pos, i) = sin(pos / 10000^{2i / d_model}), i ∈ {0, ..., d_model / 2 - 1}
            PE(pos, i) = cos(pos / 10000^{2i / d_model}), i ∈ {d_model / 2, ..., d_model - 1}
        Reference: 'Speech-transformer: a no-recurrence sequence-to-sequence model for speech recognition', https://ieeexplore.ieee.org/abstract/document/8462506/
    d_model (int, default 512):
        The dimension of the hidden feature vectors of the Transformer layers.
    emb_scale (bool, default False):
        Controls whether the embedding vectors are scaled up by sqrt(d_model) before the positional encoding is added.
        Reference: Section 3.4 in 'Attention Is All You Need'.
        In most cases, we don't recommend turning it on, especially when you don't have a large training set (e.g. LibriSpeech train-clean-100), because it may make your model hard to converge. Consider it only when you want to emphasize the embedded features over the positional encodings.
    emb_layernorm (bool, default False):
        Controls whether the embedding vectors are normalized by LayerNorm before the positional encoding is added.
    posenc_scale (bool, default False):
        Controls whether the positional encodings are scaled up by a trainable scalar before being added to the embedded features.
        Reference: 'Neural Speech Synthesis with Transformer Network', https://ojs.aaai.org/index.php/AAAI/article/view/4642/4520
    init_alpha (float, default 1.0):
        The initial value of the alpha used for positional encoding scaling. Only effective when posenc_scale is True.
    max_len (int, default 5000):
        The maximum length of the input feature sequences.
    dropout (float, default 0.0):
        The dropout rate for the Dropout layer applied after the positional encoding is added to the input.
Source code in speechain/module/transformer/pos_enc.py
def module_init(
    self,
    posenc_type: str = "mix",
    d_model: int = 512,
    emb_scale: bool = False,
    emb_layernorm: bool = False,
    posenc_scale: bool = False,
    init_alpha: float = 1.0,
    max_len: int = 5000,
    dropout: float = 0.0,
):
    """Positional Encoding with maximum length max_len.

    Args:
        posenc_type: str
            The type of positional encoding (must be either 'mix' or 'sep').
            For the 'mix' type, sin is applied to the odd dimensions and cos is applied to the even dimensions.
            The equations are as below:
                PE(pos, 2i) = sin(pos / 10000^{2i / d_model}), i ∈ {0, ..., d_model / 2 - 1}
                PE(pos, 2i + 1) = cos(pos / 10000^{2i / d_model}), i ∈ {0, ..., d_model / 2 - 1}
                Reference:
                    'Attention Is All You Need'
                    https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
            For the 'sep' type, sin is applied to the first half of dimensions and cos is applied to the second half
            of dimensions. The equations are as below:
                PE(pos, i) = sin(pos / 10000^{2i / d_model}), i ∈ {0, ..., d_model / 2 - 1}
                PE(pos, i) = cos(pos / 10000^{2i / d_model}), i ∈ {d_model / 2, ..., d_model - 1}
                Reference:
                    'Speech-transformer: a no-recurrence sequence-to-sequence model for speech recognition'
                    https://ieeexplore.ieee.org/abstract/document/8462506/
        d_model: int
            The dimension of the hidden feature vectors of the Transformer layers.
        emb_scale: bool
            Controls whether the embedding vectors are scaled up by sqrt(d_model) before adding into the positional
            encoding or not.
            References:
                Section 3.4 in 'Attention Is All You Need'
                https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
            In most cases, we don't recommend you to turn it on especially when you don't have a large training set
            (e.g. LibriSpeech-train_clean_100) because it may make your model hard to converge. Please consider it
            only when you want to emphasize the embedded features over the positional encodings.
        emb_layernorm: bool
            Controls whether the embedding vectors are normalized by LayerNorm before adding into the positional
            encoding or not.
        posenc_scale: bool
            Controls whether the positional encodings are scaled up by a trainable scalar before adding into the
            embedded features or not.
            Reference:
                'Neural Speech Synthesis with Transformer Network'
                https://ojs.aaai.org/index.php/AAAI/article/view/4642/4520
        init_alpha: float
            The initial value of the alpha used for positional encoding scaling.
            Only effective when posenc_scale is True.
        max_len: int
            The maximum length of the input feature sequences.
        dropout: float
            The dropout rate for the Dropout layer after adding the positional encoding to the input
    """

    assert posenc_type in [
        "mix",
        "sep",
    ], f"The type of PositionalEncoding layer must be either 'mix' or 'sep', but got type={posenc_type}!"
    assert (
        d_model % 2 == 0
    ), f"Cannot apply sin/cos positional encoding to the vectors with odd dimensions (got d_model={d_model:d})."

    self.posenc_type = posenc_type
    self.d_model = d_model
    self.emb_scale = emb_scale
    if emb_layernorm:
        self.emb_layernorm = torch.nn.LayerNorm(d_model)

    self.init_alpha = (
        init_alpha if isinstance(init_alpha, float) else float(init_alpha)
    )
    if posenc_scale:
        self.alpha = torch.nn.Parameter(torch.tensor(self.init_alpha))

    # positional encoding matrix
    self.update_posenc(max_len)

    # positional encoding Dropout layer
    self.dropout = torch.nn.Dropout(p=dropout)

reset_parameters()

Make sure that the scalar value is not influenced by different model initialization methods.

Source code in speechain/module/transformer/pos_enc.py
def reset_parameters(self):
    """Make sure that the scalar value is not influenced by different model
    initialization methods."""
    if hasattr(self, "alpha"):
        self.alpha.data = torch.tensor(self.init_alpha)

update_posenc(max_len)

Pre-computes the positional encoding matrix for sequences up to max_len and registers it as a buffer.

Parameters:

    max_len (int, required):
        The maximum sequence length for which positional encodings are pre-computed.
Source code in speechain/module/transformer/pos_enc.py
def update_posenc(self, max_len: int):
    """

    Args:
        max_len:

    """

    # positional encoding calculation
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, self.d_model, 2, dtype=torch.float)
        * (math.log(10000.0) / self.d_model)
    )
    posenc = torch.zeros(max_len, self.d_model)

    # 'mix' positional encoding: sine functions and cosine functions mix up with each other
    if self.posenc_type == "mix":
        posenc[:, 0::2] = torch.sin(position / div_term)
        posenc[:, 1::2] = torch.cos(position / div_term)
    # 'sep' positional encoding: sine functions and cosine functions occupy the positional encoding separately
    elif self.posenc_type == "sep":
        div_term_ext = torch.exp(
            torch.arange(self.d_model, self.d_model * 2, 2, dtype=torch.float)
            * (math.log(10000.0) / self.d_model)
        )
        posenc[:, : int(self.d_model / 2)] = torch.sin(position / div_term)
        posenc[:, int(self.d_model / 2) :] = torch.cos(position / div_term_ext)

    # posenc = posenc.unsqueeze(0) does not put posenc into the buffer
    # here register_buffer() allows posenc to be automatically put onto GPUs as a buffer member
    self.register_buffer("posenc", posenc.unsqueeze(0))

PositionwiseFeedForward

Bases: Module

Position-wise feed-forward layer. Projects the output vectors of the multi-head attention layer to fdfwd_dim and then back to d_model.

Source code in speechain/module/transformer/feed_forward.py
class PositionwiseFeedForward(Module):
    """Position-wise Feed-forward layer Projects the output vectors of multi- head
    attention layer to fdfwd_dim and then back to d_model."""

    def module_init(
        self,
        d_model: int = 512,
        fdfwd_dim: int = 2048,
        fdfwd_type: str = "linear",
        fdfwd_activation: str = "ReLU",
        fdfwd_args: Dict[str, Any] = {},
        dropout=0.1,
    ):
        """Initializes position-wise feed-forward layer.

        Args:
            d_model: int
                The dimension of the hidden feature vector in each Transformer layer
            fdfwd_dim: int
                The value of the out_features of the first linear feedforward layer and the in_features of the second
                linear feedforward layer
            fdfwd_type: str
                The type of the feed-forward layer. 'linear' means the Linear layer while 'conv' means the Conv1d layer.
            fdfwd_activation: str
                The name of the activation function of feedforward layers. Should be the name of functions in 'torch.nn'.
            fdfwd_args: Dict[str, Any]
                Additional keyword arguments passed to the feed-forward layers (e.g. kernel_size for the Conv1d layer).
                Defaults to dict(kernel_size=3) when fdfwd_type == 'conv' and no arguments are given.
            dropout: float
                The dropout rate for the Dropout layer after the first linear feedforward layer
        """
        if len(fdfwd_args) == 0:
            if fdfwd_type == "conv":
                fdfwd_args = dict(kernel_size=3)

        # In-layer at the beginning
        if fdfwd_type == "linear":
            self.in_layer = nn.Linear(d_model, fdfwd_dim, **fdfwd_args)
        elif fdfwd_type == "conv":
            self.in_layer = Conv1dEv(d_model, fdfwd_dim, **fdfwd_args)
        else:
            raise NotImplementedError(
                f"Currently, fdfwd_type can only be one of 'linear' and 'conv'. "
                f"But got {fdfwd_type}!"
            )

        # activation and Dropout layers in the middle
        self.activation = getattr(torch.nn, fdfwd_activation)()
        self.dropout = nn.Dropout(dropout)

        # Out-layer at the end
        if fdfwd_type == "linear":
            self.out_layer = nn.Linear(fdfwd_dim, d_model, **fdfwd_args)
        elif fdfwd_type == "conv":
            self.out_layer = Conv1dEv(fdfwd_dim, d_model, **fdfwd_args)
        else:
            raise NotImplementedError(
                f"Currently, fdfwd_type can only be one of 'linear' and 'conv'. "
                f"But got {fdfwd_type}!"
            )

    def forward(self, x: torch.Tensor):
        """

        Args:
            x: (batch, seq_maxlen, d_model)

        Returns:
            (batch, seq_maxlen, d_model)
                The transformed feature sequences, with the same shape as the input.
        """
        # for the Conv1d in-layer, move the feature dimension to the channel axis
        if isinstance(self.in_layer, Conv1dEv):
            # (batch, seq_maxlen, d_model) -> (batch, d_model, seq_maxlen)
            x = x.transpose(1, 2)
        # pass the in-layer at the beginning
        # (batch, d_model, seq_maxlen) -> (batch, fdfwd_dim, seq_maxlen) or
        # (batch, seq_maxlen, d_model) -> (batch, seq_maxlen, fdfwd_dim)
        x = self.in_layer(x)

        # pass the middle layers
        x = self.dropout(self.activation(x))

        # pass the out-layer at the end
        # (batch, fdfwd_dim, seq_maxlen) -> (batch, d_model, seq_maxlen) or
        # (batch, seq_maxlen, fdfwd_dim) -> (batch, seq_maxlen, d_model)
        x = self.out_layer(x)
        # for the Conv1d out-layer, move the channel axis back to the feature dimension
        if isinstance(self.out_layer, Conv1dEv):
            # (batch, d_model, seq_maxlen) -> (batch, seq_maxlen, d_model)
            x = x.transpose(1, 2)
        return x
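
For reference, here is a minimal plain-PyTorch sketch of the shape flow for the default 'linear' configuration (Linear -> activation -> Dropout -> Linear); it does not use the speechain Module base class:

import torch
import torch.nn as nn

d_model, fdfwd_dim = 512, 2048
in_layer = nn.Linear(d_model, fdfwd_dim)
activation = nn.ReLU()
dropout = nn.Dropout(0.1)
out_layer = nn.Linear(fdfwd_dim, d_model)

x = torch.randn(4, 30, d_model)                  # (batch, seq_maxlen, d_model)
y = out_layer(dropout(activation(in_layer(x))))  # applied position-wise, so the shape is preserved
print(y.shape)                                   # torch.Size([4, 30, 512])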

forward(x)

Parameters:

    x (torch.Tensor, required):
        (batch, seq_maxlen, d_model)

Returns:

    (batch, seq_maxlen, d_model) The transformed feature sequences, with the same shape as the input.

Source code in speechain/module/transformer/feed_forward.py
def forward(self, x: torch.Tensor):
    """

    Args:
        x: (batch, seq_maxlen, d_model)

    Returns:
        (batch, seq_maxlen, d_model)
            The transformed feature sequences, with the same shape as the input.
    """
    # for the Conv1d in-layer, move the feature dimension to the channel axis
    if isinstance(self.in_layer, Conv1dEv):
        # (batch, seq_maxlen, d_model) -> (batch, d_model, seq_maxlen)
        x = x.transpose(1, 2)
    # pass the in-layer at the beginning
    # (batch, d_model, seq_maxlen) -> (batch, fdfwd_dim, seq_maxlen) or
    # (batch, seq_maxlen, d_model) -> (batch, seq_maxlen, fdfwd_dim)
    x = self.in_layer(x)

    # pass the middle layers
    x = self.dropout(self.activation(x))

    # pass the out-layer at the end
    # (batch, fdfwd_dim, seq_maxlen) -> (batch, d_model, seq_maxlen) or
    # (batch, seq_maxlen, fdfwd_dim) -> (batch, seq_maxlen, d_model)
    x = self.out_layer(x)
    # for the Conv1d out-layer, move the channel axis back to the feature dimension
    if isinstance(self.out_layer, Conv1dEv):
        # (batch, d_model, seq_maxlen) -> (batch, seq_maxlen, d_model)
        x = x.transpose(1, 2)
    return x

module_init(d_model=512, fdfwd_dim=2048, fdfwd_type='linear', fdfwd_activation='ReLU', fdfwd_args={}, dropout=0.1)

Initializes position-wise feed-forward layer.

Parameters:

    d_model (int, default 512):
        The dimension of the hidden feature vector in each Transformer layer.
    fdfwd_dim (int, default 2048):
        The out_features of the first feedforward layer and the in_features of the second feedforward layer.
    fdfwd_type (str, default 'linear'):
        The type of the feed-forward layer. 'linear' means the Linear layer while 'conv' means the Conv1d layer.
    fdfwd_activation (str, default 'ReLU'):
        The name of the activation function of the feedforward layers. Should be the name of a class in 'torch.nn'.
    fdfwd_args (Dict[str, Any], default {}):
        Additional keyword arguments passed to the feed-forward layers (e.g. kernel_size for the Conv1d layer).
        Defaults to dict(kernel_size=3) when fdfwd_type == 'conv' and no arguments are given.
    dropout (float, default 0.1):
        The dropout rate for the Dropout layer after the first feedforward layer.
Source code in speechain/module/transformer/feed_forward.py
def module_init(
    self,
    d_model: int = 512,
    fdfwd_dim: int = 2048,
    fdfwd_type: str = "linear",
    fdfwd_activation: str = "ReLU",
    fdfwd_args: Dict[str, Any] = {},
    dropout=0.1,
):
    """Initializes position-wise feed-forward layer.

    Args:
        d_model: int
            The dimension of the hidden feature vector in each Transformer layer
        fdfwd_dim: int
            The value of the out_features of the first linear feedforward layer and the in_features of the second
            linear feedforward layer
        fdfwd_type: str
            The type of the feed-forward layer. 'linear' means the Linear layer while 'conv' means the Conv1d layer.
        fdfwd_activation: str
            The name of the activation function of feedforward layers. Should be the name of functions in 'torch.nn'.
        fdfwd_args: Dict[str, Any]
            Additional keyword arguments passed to the feed-forward layers (e.g. kernel_size for the Conv1d layer).
            Defaults to dict(kernel_size=3) when fdfwd_type == 'conv' and no arguments are given.
        dropout: float
            The dropout rate for the Dropout layer after the first linear feedforward layer
    """
    if len(fdfwd_args) == 0:
        if fdfwd_type == "conv":
            fdfwd_args = dict(kernel_size=3)

    # In-layer at the beginning
    if fdfwd_type == "linear":
        self.in_layer = nn.Linear(d_model, fdfwd_dim, **fdfwd_args)
    elif fdfwd_type == "conv":
        self.in_layer = Conv1dEv(d_model, fdfwd_dim, **fdfwd_args)
    else:
        raise NotImplementedError(
            f"Currently, fdfwd_type can only be one of 'linear' and 'conv'. "
            f"But got {fdfwd_type}!"
        )

    # activation and Dropout layers in the middle
    self.activation = getattr(torch.nn, fdfwd_activation)()
    self.dropout = nn.Dropout(dropout)

    # Out-layer at the end
    if fdfwd_type == "linear":
        self.out_layer = nn.Linear(fdfwd_dim, d_model, **fdfwd_args)
    elif fdfwd_type == "conv":
        self.out_layer = Conv1dEv(fdfwd_dim, d_model, **fdfwd_args)
    else:
        raise NotImplementedError(
            f"Currently, fdfwd_type can only be one of 'linear' and 'conv'. "
            f"But got {fdfwd_type}!"
        )

TransformerDecoder

Bases: Module

Source code in speechain/module/transformer/decoder.py
class TransformerDecoder(Module):
    def module_init(
        self,
        posenc_type: str = "mix",
        posenc_maxlen: int = 5000,
        posenc_dropout: float = 0.1,
        posenc_scale: bool = False,
        posenc_init_alpha: float = 1.0,
        emb_layernorm: bool = False,
        emb_scale: bool = True,
        d_model: int = 512,
        num_heads: int = 4,
        num_layers: int = 8,
        scale_dp_by_head: bool = False,
        fdfwd_dim: int = 2048,
        fdfwd_activation: str = "ReLU",
        fdfwd_dropout: float = 0.1,
        att_dropout: float = 0.1,
        res_dropout: float = 0.1,
        layernorm_first: bool = True,
    ):
        """

        Args:
            posenc_type: str
                Specify the positional encoding type you would like to use in your Transformer blocks.
            posenc_maxlen: int
                Maximal length when calculating the positional encoding.
                Usually, the default value of this argument is enough for the research.
            posenc_dropout: float
                The dropout rate for the Dropout layer after adding the positional encoding to the input
            posenc_scale: bool
                Controls whether the positional encodings are scaled up by a trainable scalar before adding into the
                embedded features or not.
                Reference:
                    'Neural Speech Synthesis with Transformer Network'
                    https://ojs.aaai.org/index.php/AAAI/article/view/4642/4520
            posenc_init_alpha: float = 1.0
                The initial value of the alpha used for positional encoding scaling.
                Only effective when posenc_scale is True.
            emb_layernorm: bool
                Controls whether the embedding vectors are normalized by LayerNorm before adding into the positional
                encoding or not.
            emb_scale: bool
                Controls whether the embedding vectors are scaled up by sqrt(d_model) before adding into the positional
                encoding or not.
            d_model: int
                The dimension of the hidden feature vector in each Transformer layer
            num_heads: int
                The number of attention heads in each Transformer layer
            num_layers: int
                The number of Transformer layers
            scale_dp_by_head: bool
                Controls whether the attention scores are scaled by 1 / sqrt(d_model / num_heads) instead of 1 / sqrt(d_model)
            att_dropout: float
                The dropout rate for the Dropout layer after calculating the weights in each Transformer layer
            fdfwd_dim: int
                The value of the out_features of the first linear feedforward layer and the in_features of the second
                linear feedforward layer in each Transformer layer.
            fdfwd_activation: str
                The name of the activation function of feedforward layers. Should be the name of functions in 'torch.nn'.
            fdfwd_dropout: float
                The dropout rate for the Dropout layer after the first linear feedforward layer in each Transformer layer
            res_dropout: float
                The dropout rate for the Dropout layer before adding the output of each Transformer layer into its input
            layernorm_first: bool
                controls whether the LayerNorm layer appears at the beginning or at the end of each Transformer layer.
                True means the LayerNorm layer appears at the beginning; False means the LayerNorm layer appears at the end.

        """

        # input_size and output_size initialization
        if self.input_size is not None:
            d_model = self.input_size
        self.output_size = d_model

        # para recording
        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.layernorm_first = layernorm_first

        # initialize the positional encoding layer
        self.posenc = PositionalEncoding(
            posenc_type=posenc_type,
            d_model=d_model,
            emb_scale=emb_scale,
            emb_layernorm=emb_layernorm,
            posenc_scale=posenc_scale,
            init_alpha=posenc_init_alpha,
            max_len=posenc_maxlen,
            dropout=posenc_dropout,
        )

        # create num_layers decoder layers and put them in a list
        self.trfm_layers = torch.nn.ModuleList(
            [
                TransformerDecoderLayer(
                    d_model=d_model,
                    num_heads=num_heads,
                    scale_dp_by_head=scale_dp_by_head,
                    att_dropout=att_dropout,
                    fdfwd_dim=fdfwd_dim,
                    fdfwd_activation=fdfwd_activation,
                    fdfwd_dropout=fdfwd_dropout,
                    res_dropout=res_dropout,
                )
                for _ in range(num_layers)
            ]
        )

        # initialize layernorm layer if necessary
        if self.layernorm_first:
            self.layernorm = nn.LayerNorm(d_model, eps=1e-6)

    @staticmethod
    def subsequent_mask(batch_size, maxlen: int) -> torch.Tensor:
        """Mask out subsequent positions (to prevent attending to future positions)
        Transformer helper function.

        Args:
            batch_size:
            maxlen: int
                size of mask (2nd and 3rd dim)

        Returns:
            A boolean mask of shape (batch_size, maxlen, maxlen) that is True at the positions each target step may attend to.
        """
        return ~torch.triu(
            torch.ones(batch_size, maxlen, maxlen, dtype=torch.bool), diagonal=1
        )

    def forward(
        self,
        tgt: torch.Tensor,
        src: torch.Tensor,
        tgt_mask: torch.Tensor,
        src_mask: torch.Tensor,
        return_att: bool = False,
        return_hidden: bool = False,
    ):
        """Transformer decoder forward pass.

        Args:
            tgt: (batch, tgt_maxlen, d_model)
                embedded targets
            src: (batch, src_maxlen, d_model)
                source representations
            tgt_mask: (batch, 1, tgt_maxlen)
                to mask out target paddings
                Note that a subsequent mask is applied here.
            src_mask: (batch, 1, src_maxlen)
                to mask out source paddings
            return_att:
            return_hidden:

        Returns:
            The output of the Transformer decoder.
            The outputs of each Transformer decoder layer will be returned as a List.
            The attention matrix (self and enc-dec) of each Transformer decoder layer will also be returned as a List.
        """
        assert tgt_mask is not None, "tgt_mask is required for Transformer!"

        # pass the positional encoding layer
        tgt = self.posenc(tgt)

        # combine the padding mask with the look-ahead (subsequent) mask for the self-attention layers
        batch_size, _, tgt_maxlen = tgt_mask.size()
        tgt_mask = torch.logical_and(
            tgt_mask.repeat(1, tgt_maxlen, 1),
            self.subsequent_mask(batch_size, tgt_maxlen).to(tgt_mask.device),
        )

        # pass the transformer layers
        self_attmat, encdec_attmat, hidden = [], [], []
        for layer in self.trfm_layers:
            tgt, _self_attmat, _encdec_attmat = layer(
                tgt=tgt, tgt_mask=tgt_mask, src=src, src_mask=src_mask
            )
            self_attmat.append(_self_attmat)
            encdec_attmat.append(_encdec_attmat)
            hidden.append(tgt.clone())

        # pass the layernorm layer if necessary
        if self.layernorm_first:
            tgt = self.layernorm(tgt)

        return tgt, self_attmat, encdec_attmat, hidden
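
The combination of the padding mask and the subsequent (look-ahead) mask inside forward can be illustrated with plain tensors (the batch size and lengths below are arbitrary):

import torch

batch_size, tgt_maxlen = 2, 4

# padding mask (batch, 1, tgt_maxlen): True marks real tokens
tgt_mask = torch.tensor([[[True, True, True, False]],
                         [[True, True, False, False]]])

# subsequent mask: True on and below the diagonal, as returned by TransformerDecoder.subsequent_mask
subseq_mask = ~torch.triu(
    torch.ones(batch_size, tgt_maxlen, tgt_maxlen, dtype=torch.bool), diagonal=1
)

# combined self-attention mask used by the decoder layers
self_att_mask = torch.logical_and(tgt_mask.repeat(1, tgt_maxlen, 1), subseq_mask)
print(self_att_mask[0].int())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 0]], dtype=torch.int32)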

forward(tgt, src, tgt_mask, src_mask, return_att=False, return_hidden=False)

Transformer decoder forward pass.

Parameters:

    tgt (torch.Tensor, required):
        (batch, tgt_maxlen, d_model) Embedded targets.
    src (torch.Tensor, required):
        (batch, src_maxlen, d_model) Source representations.
    tgt_mask (torch.Tensor, required):
        (batch, 1, tgt_maxlen) To mask out target paddings. Note that a subsequent mask is applied here.
    src_mask (torch.Tensor, required):
        (batch, 1, src_maxlen) To mask out source paddings.
    return_att (bool, default False)
    return_hidden (bool, default False)

Returns:

    The output of the Transformer decoder.
    The outputs of each Transformer decoder layer will be returned as a List.
    The attention matrix (self and enc-dec) of each Transformer decoder layer will also be returned as a List.

Source code in speechain/module/transformer/decoder.py
def forward(
    self,
    tgt: torch.Tensor,
    src: torch.Tensor,
    tgt_mask: torch.Tensor,
    src_mask: torch.Tensor,
    return_att: bool = False,
    return_hidden: bool = False,
):
    """Transformer decoder forward pass.

    Args:
        tgt: (batch, tgt_maxlen, d_model)
            embedded targets
        src: (batch, src_maxlen, d_model)
            source representations
        tgt_mask: (batch, 1, tgt_maxlen)
            to mask out target paddings
            Note that a subsequent mask is applied here.
        src_mask: (batch, 1, src_maxlen)
            to mask out source paddings
        return_att:
        return_hidden:

    Returns:
        The output of the Transformer decoder.
        The outputs of each Transformer decoder layer will be returned as a List.
        The attention matrix (self and enc-dec) of each Transformer decoder layer will also be returned as a List.
    """
    assert tgt_mask is not None, "tgt_mask is required for Transformer!"

    # pass the positional encoding layer
    tgt = self.posenc(tgt)

    # combine the padding mask with the look-ahead (subsequent) mask for the self-attention layers
    batch_size, _, tgt_maxlen = tgt_mask.size()
    tgt_mask = torch.logical_and(
        tgt_mask.repeat(1, tgt_maxlen, 1),
        self.subsequent_mask(batch_size, tgt_maxlen).to(tgt_mask.device),
    )

    # pass the transformer layers
    self_attmat, encdec_attmat, hidden = [], [], []
    for layer in self.trfm_layers:
        tgt, _self_attmat, _encdec_attmat = layer(
            tgt=tgt, tgt_mask=tgt_mask, src=src, src_mask=src_mask
        )
        self_attmat.append(_self_attmat)
        encdec_attmat.append(_encdec_attmat)
        hidden.append(tgt.clone())

    # pass the layernorm layer if necessary
    if self.layernorm_first:
        tgt = self.layernorm(tgt)

    return tgt, self_attmat, encdec_attmat, hidden

module_init(posenc_type='mix', posenc_maxlen=5000, posenc_dropout=0.1, posenc_scale=False, posenc_init_alpha=1.0, emb_layernorm=False, emb_scale=True, d_model=512, num_heads=4, num_layers=8, scale_dp_by_head=False, fdfwd_dim=2048, fdfwd_activation='ReLU', fdfwd_dropout=0.1, att_dropout=0.1, res_dropout=0.1, layernorm_first=True)

Parameters:

    posenc_type (str, default 'mix'):
        The type of positional encoding to use in the Transformer blocks.
    posenc_maxlen (int, default 5000):
        Maximal length when calculating the positional encoding. Usually, the default value of this argument is enough.
    posenc_dropout (float, default 0.1):
        The dropout rate for the Dropout layer after adding the positional encoding to the input.
    posenc_scale (bool, default False):
        Controls whether the positional encodings are scaled up by a trainable scalar before being added to the embedded features.
        Reference: 'Neural Speech Synthesis with Transformer Network', https://ojs.aaai.org/index.php/AAAI/article/view/4642/4520
    posenc_init_alpha (float, default 1.0):
        The initial value of the alpha used for positional encoding scaling. Only effective when posenc_scale is True.
    emb_layernorm (bool, default False):
        Controls whether the embedding vectors are normalized by LayerNorm before the positional encoding is added.
    emb_scale (bool, default True):
        Controls whether the embedding vectors are scaled up by sqrt(d_model) before the positional encoding is added.
    d_model (int, default 512):
        The dimension of the hidden feature vector in each Transformer layer.
    num_heads (int, default 4):
        The number of attention heads in each Transformer layer.
    num_layers (int, default 8):
        The number of Transformer layers.
    scale_dp_by_head (bool, default False):
        Controls whether the attention scores are scaled by 1 / sqrt(d_model / num_heads) instead of 1 / sqrt(d_model).
    fdfwd_dim (int, default 2048):
        The out_features of the first feedforward layer and the in_features of the second feedforward layer in each Transformer layer.
    fdfwd_activation (str, default 'ReLU'):
        The name of the activation function of the feedforward layers. Should be the name of a class in 'torch.nn'.
    fdfwd_dropout (float, default 0.1):
        The dropout rate for the Dropout layer after the first feedforward layer in each Transformer layer.
    att_dropout (float, default 0.1):
        The dropout rate for the Dropout layer after calculating the attention weights in each Transformer layer.
    res_dropout (float, default 0.1):
        The dropout rate for the Dropout layer before adding the output of each Transformer layer to its input.
    layernorm_first (bool, default True):
        Controls whether the LayerNorm layer appears at the beginning or at the end of each Transformer layer.
        True means the LayerNorm layer appears at the beginning; False means it appears at the end.
Source code in speechain/module/transformer/decoder.py
def module_init(
    self,
    posenc_type: str = "mix",
    posenc_maxlen: int = 5000,
    posenc_dropout: float = 0.1,
    posenc_scale: bool = False,
    posenc_init_alpha: float = 1.0,
    emb_layernorm: bool = False,
    emb_scale: bool = True,
    d_model: int = 512,
    num_heads: int = 4,
    num_layers: int = 8,
    scale_dp_by_head: bool = False,
    fdfwd_dim: int = 2048,
    fdfwd_activation: str = "ReLU",
    fdfwd_dropout: float = 0.1,
    att_dropout: float = 0.1,
    res_dropout: float = 0.1,
    layernorm_first: bool = True,
):
    """

    Args:
        posenc_type: str
            Specify the positional encoding type you would like to use in your Transformer blocks.
        posenc_maxlen: int
            Maximal length when calculating the positional encoding.
            Usually, the default value of this argument is enough for the research.
        posenc_dropout: float
            The dropout rate for the Dropout layer after adding the positional encoding to the input
        posenc_scale: bool
            Controls whether the positional encodings are scaled up by a trainable scalar before adding into the
            embedded features or not.
            Reference:
                'Neural Speech Synthesis with Transformer Network'
                https://ojs.aaai.org/index.php/AAAI/article/view/4642/4520
        posenc_init_alpha: float = 1.0
            The initial value of the alpha used for positional encoding scaling.
            Only effective when posenc_scale is True.
        emb_layernorm: bool
            Controls whether the embedding vectors are normalized by LayerNorm before the positional encoding is
            added or not.
        emb_scale: bool
            Controls whether the embedding vectors are scaled up by sqrt(d_model) before the positional encoding is
            added or not.
        d_model: int
            The dimension of the hidden feature vector in each Transformer layer
        num_heads: int
            The number of attention heads in each Transformer layer
        num_layers: int
            The number of Transformer layers
        att_dropout: float
            The dropout rate for the Dropout layer after calculating the weights in each Transformer layer
        fdfwd_dim: int
            The value of the out_features of the first linear feedforward layer and the in_features of the second
            linear feedforward layer in each Transformer layer.
        fdfwd_activation: str
            The name of the activation function of feedforward layers. Should be the name of functions in 'torch.nn'.
        fdfwd_dropout: float
            The dropout rate for the Dropout layer after the first linear feedforward layer in each Transformer layer
        res_dropout: float
            The dropout rate for the Dropout layer before adding the output of each Transformer layer into its input
        layernorm_first: bool
            Controls whether the LayerNorm layer appears at the beginning or at the end of each Transformer layer.
            True means the LayerNorm layer appears at the beginning; False means it appears at the end.

    """

    # input_size and output_size initialization
    if self.input_size is not None:
        d_model = self.input_size
    self.output_size = d_model

    # para recording
    self.d_model = d_model
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.layernorm_first = layernorm_first

    # initialize the positional encoding layer
    self.posenc = PositionalEncoding(
        posenc_type=posenc_type,
        d_model=d_model,
        emb_scale=emb_scale,
        emb_layernorm=emb_layernorm,
        posenc_scale=posenc_scale,
        init_alpha=posenc_init_alpha,
        max_len=posenc_maxlen,
        dropout=posenc_dropout,
    )

    # create num_layers decoder layers and put them in a list
    self.trfm_layers = torch.nn.ModuleList(
        [
            TransformerDecoderLayer(
                d_model=d_model,
                num_heads=num_heads,
                scale_dp_by_head=scale_dp_by_head,
                att_dropout=att_dropout,
                fdfwd_dim=fdfwd_dim,
                fdfwd_activation=fdfwd_activation,
                fdfwd_dropout=fdfwd_dropout,
                res_dropout=res_dropout,
            )
            for _ in range(num_layers)
        ]
    )

    # initialize layernorm layer if necessary
    if self.layernorm_first:
        self.layernorm = nn.LayerNorm(d_model, eps=1e-6)
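
The posenc_scale and posenc_init_alpha options above follow the alpha-scaled positional encoding of 'Neural Speech Synthesis with Transformer Network'. The snippet below is a minimal, self-contained sketch of that idea, not the toolkit's actual PositionalEncoding class; the name ScaledSinusoidalPE and its argument list are illustrative, and an even d_model is assumed.

# Illustrative sketch only -- not speechain's PositionalEncoding implementation.
import math
import torch

class ScaledSinusoidalPE(torch.nn.Module):
    def __init__(self, d_model: int = 512, max_len: int = 5000,
                 posenc_scale: bool = True, init_alpha: float = 1.0,
                 dropout: float = 0.1):
        super().__init__()
        # precompute the sinusoidal table, stored as a (1, max_len, d_model) buffer
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float)
                        * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))
        # posenc_scale=True: a trainable scalar (initialized to init_alpha) scales the
        # encodings before they are added to the embedded features
        self.alpha = (torch.nn.Parameter(torch.tensor(float(init_alpha)))
                      if posenc_scale else 1.0)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, emb: torch.Tensor) -> torch.Tensor:
        # emb: (batch, seq_len, d_model) embedded features
        return self.dropout(emb + self.alpha * self.pe[:, : emb.size(1)])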

subsequent_mask(batch_size, maxlen) staticmethod

Mask out subsequent positions (to prevent attending to future positions). Transformer helper function.

Parameters:

batch_size
    The batch size, i.e. the size of the first dimension of the returned mask. required
maxlen int
    The size of the mask (2nd and 3rd dimensions). required

Returns:

    A Bool tensor of shape (batch_size, maxlen, maxlen) in which entry (b, i, j) is True iff j <= i, so each position can attend only to itself and earlier positions.

Source code in speechain/module/transformer/decoder.py
@staticmethod
def subsequent_mask(batch_size, maxlen: int) -> torch.Tensor:
    """Mask out subsequent positions (to prevent attending to future positions)
    Transformer helper function.

    Args:
        batch_size:
        maxlen: int
            size of mask (2nd and 3rd dim)

    Returns:
    """
    return ~torch.triu(
        torch.ones(batch_size, maxlen, maxlen, dtype=torch.bool), diagonal=1
    )
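
Replicating the body of subsequent_mask for batch_size=1 and maxlen=4 shows the causal pattern it produces: row i is True only up to column i, so each position can attend to itself and earlier positions but not to later ones.

import torch

mask = ~torch.triu(torch.ones(1, 4, 4, dtype=torch.bool), diagonal=1)
print(mask[0])
# tensor([[ True, False, False, False],
#         [ True,  True, False, False],
#         [ True,  True,  True, False],
#         [ True,  True,  True,  True]])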

TransformerDecoderLayer

Bases: Module

A single Transformer decoder layer has:
· a self multi-head attention sublayer
· a LayerNorm layer exclusively for the self-attention sublayer
· an encoder-decoder multi-head attention sublayer
· a LayerNorm layer exclusively for the encoder-decoder attention sublayer
· a position-wise feed-forward sublayer
· a LayerNorm layer exclusively for the feed-forward sublayer
· a residual dropout layer

Source code in speechain/module/transformer/decoder.py
class TransformerDecoderLayer(Module):
    """
    A single Transformer decoder layer has:
    · a self multi-head attention sublayer
    · a LayerNorm layer exclusively for the self-attention sublayer
    · an encoder-decoder multi-head attention sublayer
    · a LayerNorm layer exclusively for the encoder-decoder attention sublayer
    · a position-wise feed-forward sublayer
    · a LayerNorm layer exclusively for the feed-forward sublayer
    · a residual dropout layer

    """

    def module_init(
        self,
        d_model: int = 512,
        num_heads: int = 8,
        scale_dp_by_head: bool = False,
        att_dropout: float = 0.1,
        fdfwd_dim: int = 0,
        fdfwd_activation: str = "ReLU",
        fdfwd_dropout: float = 0.1,
        res_dropout: float = 0.1,
        layernorm_first: bool = True,
    ):
        """Represents a single Transformer decoder layer. It attends to the source
        representation and the previous decoder states.

        Args:
            d_model: int
                The dimension of the hidden feature vector in each Transformer layer
            num_heads: int
                The number of attention heads in each Transformer layer
            att_dropout: float
                The dropout rate for the Dropout layer after calculating the weights in each Transformer layer
            fdfwd_dim: int
                The value of the out_features of the first linear feedforward layer and the in_features of the second
                linear feedforward layer in each Transformer layer.
            fdfwd_activation: str
                The name of the activation function of feedforward layers. Should be the name of functions in 'torch.nn'.
            fdfwd_dropout: float
                The dropout rate for the Dropout layer after the first linear feedforward layer in each Transformer layer
            res_dropout: float
                The dropout rate for the Dropout layer before adding the output of each Transformer layer into its input
            layernorm_first: bool
                Whether LayerNorm is performed before feeding the input into each sublayer.
                if layernorm_first is True:
                    output = input + Sublayer(LayerNorm(input))
                elif layernorm_first is False:
                    output = LayerNorm(input + Sublayer(input))
        """
        # initialize the self attention layer
        self.self_att = MultiHeadedAttention(
            num_heads=num_heads,
            d_model=d_model,
            dropout=att_dropout,
            scale_dp_by_head=scale_dp_by_head,
        )

        # initialize the encoder-decoder attention layer
        self.encdec_att = MultiHeadedAttention(
            num_heads=num_heads, d_model=d_model, dropout=att_dropout
        )

        # initialize feedforward layer
        self.feed_forward = PositionwiseFeedForward(
            d_model=d_model,
            fdfwd_dim=fdfwd_dim,
            fdfwd_activation=fdfwd_activation,
            dropout=fdfwd_dropout,
        )

        # initialize layernorm layers
        self.layernorm_first = layernorm_first
        self.self_att_ln = nn.LayerNorm(d_model, eps=1e-6)
        self.encdec_att_ln = nn.LayerNorm(d_model, eps=1e-6)
        self.fdfwd_ln = nn.LayerNorm(d_model, eps=1e-6)

        # initialize residual dropout layer
        self.dropout = nn.Dropout(res_dropout)

    def forward(
        self,
        tgt: torch.Tensor,
        src: torch.Tensor,
        tgt_mask: torch.Tensor,
        src_mask: torch.Tensor,
    ):
        """Forward pass of a single Transformer decoder layer.

        Args:
            tgt: (batch, tgt_maxlen, d_model)
                target inputs
            src: (batch, src_maxlen, d_model)
                source representations
            tgt_mask: (batch, tgt_maxlen, tgt_maxlen)
                target mask (so as to not condition on future steps)
            src_mask: (batch, 1, src_maxlen)
                source mask

        Returns:
            The output of this Transformer decoder layer and the attention matrices (self and enc-dec)
        """

        # --- 1. Self Attention Layer part --- #
        # go through the LayerNorm layer before the self attention layer or not
        tgt_norm = self.self_att_ln(tgt) if self.layernorm_first else tgt

        # go through the self attention layer and perform the residual connection
        self_att_hidden, self_attmat = self.self_att(
            tgt_norm, tgt_norm, tgt_norm, mask=tgt_mask
        )
        self_att_output = self.dropout(self_att_hidden) + tgt

        # go through the LayerNorm layer after the self attention layer or not
        self_att_output = (
            self.self_att_ln(self_att_output)
            if not self.layernorm_first
            else self_att_output
        )

        # --- 2. Enc-Dec Attention Layer part --- #
        # go through the LayerNorm layer before the enc-dec attention layer or not
        self_att_output_norm = (
            self.encdec_att_ln(self_att_output)
            if self.layernorm_first
            else self_att_output
        )

        # go through the enc-dec attention layer and perform the residual connection
        encdec_att_hidden, encdec_attmat = self.encdec_att(
            src, src, self_att_output_norm, mask=src_mask
        )
        encdec_att_output = self.dropout(encdec_att_hidden) + self_att_output

        # go through the LayerNorm layer after the enc-dec attention layer or not
        encdec_att_output = (
            self.encdec_att_ln(encdec_att_output)
            if not self.layernorm_first
            else encdec_att_output
        )

        # --- 3. Positional FeedForward Layer part --- #
        # go through the LayerNorm layer before the feedforward layer or not
        encdec_att_output_norm = (
            self.fdfwd_ln(encdec_att_output)
            if self.layernorm_first
            else encdec_att_output
        )

        # go through the feedforward layer and perform the residual connection
        fdfwd_hidden = self.feed_forward(encdec_att_output_norm)
        fdfwd_output = self.dropout(fdfwd_hidden) + encdec_att_output

        # go through the LayerNorm layer after the feedforward layer or not
        fdfwd_output = (
            self.fdfwd_ln(fdfwd_output) if not self.layernorm_first else fdfwd_output
        )

        return fdfwd_output, self_attmat, encdec_attmat

forward(tgt, src, tgt_mask, src_mask)

Forward pass of a single Transformer decoder layer.

Parameters:

tgt Tensor
    (batch, tgt_maxlen, d_model) target inputs. required
src Tensor
    (batch, src_maxlen, d_model) source representations. required
tgt_mask Tensor
    (batch, tgt_maxlen, tgt_maxlen) target mask (so as to not condition on future steps). required
src_mask Tensor
    (batch, 1, src_maxlen) source mask. required

Returns:

    The output of this Transformer decoder layer and the attention matrices (self and enc-dec).

Source code in speechain/module/transformer/decoder.py
def forward(
    self,
    tgt: torch.Tensor,
    src: torch.Tensor,
    tgt_mask: torch.Tensor,
    src_mask: torch.Tensor,
):
    """Forward pass of a single Transformer decoder layer.

    Args:
        tgt: (batch, tgt_maxlen, d_model)
            target inputs
        src: (batch, src_maxlen, d_model)
            source representations
        tgt_mask: (batch, tgt_maxlen, tgt_maxlen)
            target mask (so as to not condition on future steps)
        src_mask: (batch, 1, src_maxlen)
            source mask

    Returns:
        The output of this Transformer decoder layer and the attention matrices (self and enc-dec)
    """

    # --- 1. Self Attention Layer part --- #
    # go through the LayerNorm layer before the self attention layer or not
    tgt_norm = self.self_att_ln(tgt) if self.layernorm_first else tgt

    # go through the self attention layer and perform the residual connection
    self_att_hidden, self_attmat = self.self_att(
        tgt_norm, tgt_norm, tgt_norm, mask=tgt_mask
    )
    self_att_output = self.dropout(self_att_hidden) + tgt

    # go through the LayerNorm layer after the self attention layer or not
    self_att_output = (
        self.self_att_ln(self_att_output)
        if not self.layernorm_first
        else self_att_output
    )

    # --- 2. Enc-Dec Attention Layer part --- #
    # go through the LayerNorm layer before the enc-dec attention layer or not
    self_att_output_norm = (
        self.encdec_att_ln(self_att_output)
        if self.layernorm_first
        else self_att_output
    )

    # go through the enc-dec attention layer and perform the residual connection
    encdec_att_hidden, encdec_attmat = self.encdec_att(
        src, src, self_att_output_norm, mask=src_mask
    )
    encdec_att_output = self.dropout(encdec_att_hidden) + self_att_output

    # go through the LayerNorm layer after the enc-dec attention layer or not
    encdec_att_output = (
        self.encdec_att_ln(encdec_att_output)
        if not self.layernorm_first
        else encdec_att_output
    )

    # --- 3. Positional FeedForward Layer part --- #
    # go through the LayerNorm layer before the feedforward layer or not
    encdec_att_output_norm = (
        self.fdfwd_ln(encdec_att_output)
        if self.layernorm_first
        else encdec_att_output
    )

    # go through the feedforward layer and perform the residual connection
    fdfwd_hidden = self.feed_forward(encdec_att_output_norm)
    fdfwd_output = self.dropout(fdfwd_hidden) + encdec_att_output

    # go through the LayerNorm layer after the feedforward layer or not
    fdfwd_output = (
        self.fdfwd_ln(fdfwd_output) if not self.layernorm_first else fdfwd_output
    )

    return fdfwd_output, self_attmat, encdec_attmat
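
The mask shapes documented above can be built as in the following sketch. The tensors and the src_len values are made up for illustration, and the commented-out call assumes a decoder layer that has already been constructed through the toolkit's Module machinery.

import torch

batch, tgt_maxlen, src_maxlen, d_model = 2, 5, 7, 512

tgt = torch.randn(batch, tgt_maxlen, d_model)   # target inputs
src = torch.randn(batch, src_maxlen, d_model)   # source representations

# causal target mask, (batch, tgt_maxlen, tgt_maxlen), as in subsequent_mask()
tgt_mask = ~torch.triu(
    torch.ones(batch, tgt_maxlen, tgt_maxlen, dtype=torch.bool), diagonal=1
)

# padding-based source mask, (batch, 1, src_maxlen): True at valid source positions
src_len = torch.tensor([7, 4])                  # hypothetical valid lengths
src_mask = (torch.arange(src_maxlen)[None, :] < src_len[:, None]).unsqueeze(1)

# with a constructed layer:
# output, self_attmat, encdec_attmat = decoder_layer(tgt, src, tgt_mask, src_mask)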

module_init(d_model=512, num_heads=8, scale_dp_by_head=False, att_dropout=0.1, fdfwd_dim=0, fdfwd_activation='ReLU', fdfwd_dropout=0.1, res_dropout=0.1, layernorm_first=True)

Represents a single Transformer decoder layer. It attends to the source representation and the previous decoder states.

Parameters:

d_model int
    The dimension of the hidden feature vector in each Transformer layer.
    Default: 512

num_heads int
    The number of attention heads in each Transformer layer.
    Default: 8

att_dropout float
    The dropout rate for the Dropout layer applied after calculating the attention weights in each Transformer layer.
    Default: 0.1

fdfwd_dim int
    The out_features of the first linear feedforward layer and the in_features of the second linear feedforward layer in each Transformer layer.
    Default: 0

fdfwd_activation str
    The name of the activation function of the feedforward layers. Should be the name of an activation in 'torch.nn'.
    Default: 'ReLU'

fdfwd_dropout float
    The dropout rate for the Dropout layer after the first linear feedforward layer in each Transformer layer.
    Default: 0.1

res_dropout float
    The dropout rate for the Dropout layer applied before adding the output of each Transformer layer to its input.
    Default: 0.1

layernorm_first bool
    Whether LayerNorm is performed before feeding the input into each sublayer.
    If layernorm_first is True:  output = input + Sublayer(LayerNorm(input))
    If layernorm_first is False: output = LayerNorm(input + Sublayer(input))
    Default: True
Source code in speechain/module/transformer/decoder.py
def module_init(
    self,
    d_model: int = 512,
    num_heads: int = 8,
    scale_dp_by_head: bool = False,
    att_dropout: float = 0.1,
    fdfwd_dim: int = 0,
    fdfwd_activation: str = "ReLU",
    fdfwd_dropout: float = 0.1,
    res_dropout: float = 0.1,
    layernorm_first: bool = True,
):
    """Represents a single Transformer decoder layer. It attends to the source
    representation and the previous decoder states.

    Args:
        d_model: int
            The dimension of the hidden feature vector in each Transformer layer
        num_heads: int
            The number of attention heads in each Transformer layer
        att_dropout: float
            The dropout rate for the Dropout layer after calculating the weights in each Transformer layer
        fdfwd_dim: int
            The value of the out_features of the first linear feedforward layer and the in_features of the second
            linear feedforward layer in each Transformer layer.
        fdfwd_activation: str
            The name of the activation function of feedforward layers. Should be the name of functions in 'torch.nn'.
        fdfwd_dropout: float
            The dropout rate for the Dropout layer after the first linear feedforward layer in each Transformer layer
        res_dropout: float
            The dropout rate for the Dropout layer before adding the output of each Transformer layer into its input
        layernorm_first: bool
            Whether LayerNorm is performed before feeding the input into each sublayer.
            if layernorm_first is True:
                output = input + Sublayer(LayerNorm(input))
            elif layernorm_first is False:
                output = LayerNorm(input + Sublayer(input))
    """
    # initialize the self attention layer
    self.self_att = MultiHeadedAttention(
        num_heads=num_heads,
        d_model=d_model,
        dropout=att_dropout,
        scale_dp_by_head=scale_dp_by_head,
    )

    # initialize the encoder-decoder attention layer
    self.encdec_att = MultiHeadedAttention(
        num_heads=num_heads, d_model=d_model, dropout=att_dropout
    )

    # initialize feedforward layer
    self.feed_forward = PositionwiseFeedForward(
        d_model=d_model,
        fdfwd_dim=fdfwd_dim,
        fdfwd_activation=fdfwd_activation,
        dropout=fdfwd_dropout,
    )

    # initialize layernorm layers
    self.layernorm_first = layernorm_first
    self.self_att_ln = nn.LayerNorm(d_model, eps=1e-6)
    self.encdec_att_ln = nn.LayerNorm(d_model, eps=1e-6)
    self.fdfwd_ln = nn.LayerNorm(d_model, eps=1e-6)

    # initialize residual dropout layer
    self.dropout = nn.Dropout(res_dropout)
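
As a compact illustration of the two residual wirings selected by layernorm_first (matching the forward() code above, where the residual dropout is applied to each sublayer's output before the addition), the helper below is a sketch rather than toolkit code.

import torch
from torch import nn

def residual_block(x, sublayer, layernorm, dropout, layernorm_first=True):
    if layernorm_first:
        # pre-LN:  output = input + Dropout(Sublayer(LayerNorm(input)))
        return x + dropout(sublayer(layernorm(x)))
    # post-LN: output = LayerNorm(input + Dropout(Sublayer(input)))
    return layernorm(x + dropout(sublayer(x)))

# e.g. the position-wise feedforward sublayer with the documented defaults
x = torch.randn(2, 5, 512)
ff = nn.Sequential(nn.Linear(512, 2048), nn.ReLU(), nn.Linear(2048, 512))
out = residual_block(x, ff, nn.LayerNorm(512, eps=1e-6), nn.Dropout(0.1))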