Skip to content

speech2mel

Author: Heli Qi Affiliation: NAIST Date: 2022.07

Speech2MelSpec

Bases: Module

The acoustic frontend where the input is raw speech waveforms and the output is log-mel spectrogram.

The waveform is first converted into linear spectrogram by STFT. Then, the linear spectrogram is converted into log-mel spectrogram by mel-fbank filters. Finally, the delta features of log-mel spectrogram are calculated if specified.

Source code in speechain/module/frontend/speech2mel.py
class Speech2MelSpec(Module):
    """The acoustic frontend where the input is raw speech waveforms and the output is
    log-mel spectrogram.

    The waveform is first converted into linear spectrogram by STFT. Then, the linear
    spectrogram is converted into log-mel spectrogram by mel-fbank filters. Finally, the
    delta features of log-mel spectrogram are calculated if specified.
    """

    def module_init(
        self,
        n_mels: int,
        hop_length: int or float,
        win_length: int or float,
        n_fft: int = None,
        sr: int = 16000,
        preemphasis: float = None,
        pre_stft_norm: str = None,
        window: str = "hann",
        center: bool = True,
        normalized: bool = False,
        onesided: bool = True,
        mag_spec: bool = False,
        return_energy: bool = False,
        fmin: float = 0.0,
        fmax: float = None,
        clamp: float = 1e-10,
        logging: bool = True,
        log_base: float = 10.0,
        mel_scale: str = "slaney",
        mel_norm: bool = True,
        delta_order: int = None,
        delta_N: int = 2,
    ):
        r"""Build the waveform-to-linear, linear-to-mel and optional delta sub-modules.

        Args:
            n_mels: int
                The number of filters in the mel-fbank
            n_fft: int
                The number of Fourier points used for STFT.
                Defaults to the window length (in sampling points) when not given.
            hop_length: int or float
                the distance between neighboring sliding window frames for STFT.
                int means the absolute number of sampling points,
                float means the duration of the speech segment (in seconds).
            win_length: int or float
                the size of window frame for STFT.
                int means the absolute number of sampling points,
                float means the duration of the speech segment (in seconds).
            sr: int
                The sampling rate of the input speech waveforms.
            preemphasis: float
                The preemphasis coefficient before STFT.
            pre_stft_norm: str
                The normalization method for the speech waveforms before STFT.
            window: str
                The window type for STFT.
            center: bool
                 whether to pad input on both sides so that the t-th frame is centered at time `t x hop_length`.
            normalized: bool
                controls whether to return the normalized STFT results
            onesided: bool
                controls whether to return half of results to avoid redundancy for real inputs.
            mag_spec: bool
                controls whether to calculate the linear magnitude spectrogram during STFT.
                True feeds the linear magnitude (energy) spectrogram into mel-fbank.
                False feeds the linear power spectrogram into mel-fbank.
            return_energy: bool
                Whether to calculate the frame-wise energy for the linear magnitude (energy) spectrogram
            fmin: float
                The minimal frequency for the mel-fbank
            fmax: float
                The maximal frequency for the mel-fbank
            clamp: float
                The minimal number for the log-mel spectrogram. Used for stability.
            logging: bool
                Controls whether the mel spectrograms are logged
            log_base: float
                The log base for the log-mel spectrogram. None means the natural log base e.
            mel_scale: str
                The type of mel-scale of the mel-fbank.
            mel_norm: bool
                Whether to perform area normalization on the mel-fbank filters.
            delta_order: int
                The delta order you want to add to the original log-mel spectrogram.
                1 means original log-mel spectrogram + $\delta$ Log-mel spectrogram
                2 means original log-mel spectrogram + $\delta$ Log-mel spectrogram + $\delta\delta$ log-mel spectrogram
            delta_N: int
                The number of neighboring points used for calculating the delta features.

        """
        # NOTE: the docstring above is a raw string on purpose: the LaTeX markup contains
        # backslashes that would otherwise be parsed as (invalid) escape sequences.

        # if hop_length and win_length are given in the unit of seconds, turn them into the corresponding time steps
        # (int() truncates towards zero, so fractional sample counts are rounded down)
        hop_length = (
            int(hop_length * sr) if isinstance(hop_length, float) else hop_length
        )
        win_length = (
            int(win_length * sr) if isinstance(win_length, float) else win_length
        )

        # if n_fft is not given, it will be initialized to the window length
        if n_fft is None:
            n_fft = win_length

        # para recording: each delta order contributes one more group of n_mels features
        self.output_size = n_mels if delta_order is None else n_mels * (delta_order + 1)

        # Speech -> Linear Spectrogram (linear spectrograms are not logged for getting the mel spectrograms)
        self.return_energy = return_energy
        self.speech2linear = Speech2LinearSpec(
            n_fft=n_fft,
            sr=sr,
            hop_length=hop_length,
            win_length=win_length,
            preemphasis=preemphasis,
            pre_stft_norm=pre_stft_norm,
            window=window,
            center=center,
            normalized=normalized,
            onesided=onesided,
            mag_spec=mag_spec,
            return_energy=return_energy,
            logging=False,
        )
        # Linear Spectrogram -> (Log-)Mel Spectrogram
        self.linear2mel = LinearSpec2MelSpec(
            sr=sr,
            n_fft=n_fft,
            n_mels=n_mels,
            fmin=fmin,
            fmax=fmax,
            clamp=clamp,
            logging=logging,
            log_base=log_base,
            mel_scale=mel_scale,
            mel_norm=mel_norm,
            mag_spec=mag_spec,
        )
        # (Optional) (Log-)Mel Spectrogram -> (Log-)Mel Spectrogram + Deltas
        # self.delta and self.delta_N only exist when delta_order is given
        self.delta_order = delta_order
        if delta_order is not None:
            self.delta_N = delta_N
            self.delta = DeltaFeature(delta_order=delta_order, delta_N=delta_N)

    def forward(self, speech: torch.Tensor, speech_len: torch.Tensor):
        """Convert speech waveforms into (log-)mel spectrograms, optionally with deltas.

        Args:
            speech: (batch, speech_maxlen, 1) or (batch, speech_maxlen)
                The input speech data.
            speech_len: (batch,)
                The lengths of input speech data

        Returns:
            The log-mel spectrograms with their lengths, i.e. `(feat, feat_len)`.
            If `return_energy` was set at initialization, the energy and its lengths
            produced by the waveform-to-linear module are appended:
            `(feat, feat_len, energy, energy_len)`.

        """

        # Speech -> Linear Spectrogram
        if self.return_energy:
            feat, feat_len, energy, energy_len = self.speech2linear(speech, speech_len)
        else:
            feat, feat_len = self.speech2linear(speech, speech_len)
            energy, energy_len = None, None

        # Linear Spectrogram -> Log-Mel Spectrogram
        feat, feat_len = self.linear2mel(feat, feat_len)

        # Log-Mel Spectrogram -> Log-Mel Spectrogram + Deltas
        if self.delta_order is not None:
            feat, feat_len = self.delta(feat, feat_len)

        if self.return_energy:
            return feat, feat_len, energy, energy_len
        else:
            return feat, feat_len

    def recover(self, feat: torch.Tensor, feat_len: torch.Tensor):
        """Invert the frontend: map (log-)mel spectrograms back through both sub-modules.

        The features are first passed to `self.linear2mel.recover` and the result is then
        passed to `self.speech2linear.recover` (waveform reconstruction; the original
        inline comment names the "GL algorithm", presumably Griffin-Lim).

        Args:
            feat:
                The (log-)mel spectrograms to be recovered. They must not contain deltas.
            feat_len:
                The lengths of `feat`.

        Returns:
            The pair returned by `self.speech2linear.recover`, i.e. the recovered
            signals and their lengths.

        Raises:
            AssertionError: if this frontend was initialized with `delta_order`,
                since delta features cannot be inverted here.

        """
        # No delta recovery. An explicit raise is used instead of a bare `assert`
        # so that the check is still enforced when Python runs with -O.
        if self.delta_order is not None:
            raise AssertionError(
                "recover() is not supported when delta features are enabled "
                f"(delta_order={self.delta_order})."
            )

        # Log-Mel Spectrogram -> Linear Spectrogram
        feat = self.linear2mel.recover(feat, feat_len)

        # Linear Spectrogram -> Waveforms (GL algorithm)
        feat, feat_len = self.speech2linear.recover(feat, feat_len)

        return feat, feat_len

    def get_sample_rate(self):
        """Return the `sr` attribute held by the waveform-to-linear sub-module."""
        return self.speech2linear.sr

    def __repr__(self):
        """Return a multi-line description listing every sub-module of this frontend."""
        string = (
            f"{self.__class__.__name__}(\n"
            + str(self.speech2linear)
            + "\n"
            + str(self.linear2mel)
        )

        # the delta module only exists when delta_order was given
        if self.delta_order is not None:
            string += "\n" + str(self.delta)

        return string + "\n)"

forward(speech, speech_len)

Parameters:

Name Type Description Default
speech Tensor

(batch, speech_maxlen, 1) or (batch, speech_maxlen) The input speech data.

required
speech_len Tensor

(batch,) The lengths of input speech data

required

Returns:

Type Description

The log-mel spectrograms with their lengths.

Source code in speechain/module/frontend/speech2mel.py
def forward(self, speech: torch.Tensor, speech_len: torch.Tensor):
    """Run the full frontend pipeline on a batch of waveforms.

    Args:
        speech: (batch, speech_maxlen, 1) or (batch, speech_maxlen)
            The input speech data.
        speech_len: (batch,)
            The lengths of input speech data

    Returns:
        `(feat, feat_len)`: the log-mel spectrograms with their lengths.
        When `self.return_energy` is set, the energy and its lengths coming from the
        waveform-to-linear module are appended: `(feat, feat_len, energy, energy_len)`.

    """
    # Stage 1: waveform -> linear spectrogram (plus energy when requested)
    linear_out = self.speech2linear(speech, speech_len)
    if self.return_energy:
        spec, spec_len, energy, energy_len = linear_out
    else:
        spec, spec_len = linear_out
        energy = energy_len = None

    # Stage 2: linear spectrogram -> log-mel spectrogram
    mel, mel_len = self.linear2mel(spec, spec_len)

    # Stage 3 (optional): append the delta features
    if self.delta_order is not None:
        mel, mel_len = self.delta(mel, mel_len)

    # The energy outputs are only part of the result when they were requested
    if not self.return_energy:
        return mel, mel_len
    return mel, mel_len, energy, energy_len

module_init(n_mels, hop_length, win_length, n_fft=None, sr=16000, preemphasis=None, pre_stft_norm=None, window='hann', center=True, normalized=False, onesided=True, mag_spec=False, return_energy=False, fmin=0.0, fmax=None, clamp=1e-10, logging=True, log_base=10.0, mel_scale='slaney', mel_norm=True, delta_order=None, delta_N=2)

Parameters:

Name Type Description Default
n_mels int

int The number of filters in the mel-fbank

required
n_fft int

int The number of Fourier points used for STFT

None
hop_length int or float

int or float the distance between neighboring sliding window frames for STFT. int means the absolute number of sampling points, float means the duration of the speech segment (in seconds).

required
win_length int or float

int or float the size of window frame for STFT. int means the absolute number of sampling points, float means the duration of the speech segment (in seconds).

required
sr int

int The sampling rate of the input speech waveforms.

16000
preemphasis float

float The preemphasis coefficient before STFT.

None
pre_stft_norm str

str The normalization method for the speech waveforms before STFT.

None
window str

str The window type for STFT.

'hann'
center bool

bool whether to pad input on both sides so that the t-th frame is centered at time t x hop_length.

True
normalized bool

bool controls whether to return the normalized STFT results

False
onesided bool

bool controls whether to return half of results to avoid redundancy for real inputs.

True
mag_spec bool

bool controls whether to calculate the linear magnitude spectrogram during STFT. True feeds the linear magnitude (energy) spectrogram into mel-fbank. False feeds the linear power spectrogram into mel-fbank.

False
return_energy bool

bool Whether to calculate the frame-wise energy for the linear magnitude (energy) spectrogram

False
fmin float

float The minimal frequency for the mel-fbank

0.0
fmax float

float The maximal frequency for the mel-fbank

None
clamp float

float The minimal number for the log-mel spectrogram. Used for stability.

1e-10
logging bool

bool Controls whether the mel spectrograms are logged

True
log_base float

float The log base for the log-mel spectrogram. None means the natural log base e.

10.0
mel_scale str

str The type of mel-scale of the mel-fbank.

'slaney'
mel_norm bool

bool Whether to perform area normalization on the mel-fbank filters.

True
delta_order int

int The delta order you want to add to the original log-mel spectrogram. 1 means original log-mel spectrogram + \(\delta\) Log-mel spectrogram 2 means original log-mel spectrogram + \(\delta\) Log-mel spectrogram + \(\delta\delta\) log-mel spectrogram

None
delta_N int

int The number of neighboring points used for calculating the delta features.

2
Source code in speechain/module/frontend/speech2mel.py
def module_init(
    self,
    n_mels: int,
    hop_length: int or float,
    win_length: int or float,
    n_fft: int = None,
    sr: int = 16000,
    preemphasis: float = None,
    pre_stft_norm: str = None,
    window: str = "hann",
    center: bool = True,
    normalized: bool = False,
    onesided: bool = True,
    mag_spec: bool = False,
    return_energy: bool = False,
    fmin: float = 0.0,
    fmax: float = None,
    clamp: float = 1e-10,
    logging: bool = True,
    log_base: float = 10.0,
    mel_scale: str = "slaney",
    mel_norm: bool = True,
    delta_order: int = None,
    delta_N: int = 2,
):
    r"""Build the waveform-to-linear, linear-to-mel and optional delta sub-modules.

    Args:
        n_mels: int
            The number of filters in the mel-fbank
        n_fft: int
            The number of Fourier points used for STFT.
            Defaults to the window length (in sampling points) when not given.
        hop_length: int or float
            the distance between neighboring sliding window frames for STFT.
            int means the absolute number of sampling points,
            float means the duration of the speech segment (in seconds).
        win_length: int or float
            the size of window frame for STFT.
            int means the absolute number of sampling points,
            float means the duration of the speech segment (in seconds).
        sr: int
            The sampling rate of the input speech waveforms.
        preemphasis: float
            The preemphasis coefficient before STFT.
        pre_stft_norm: str
            The normalization method for the speech waveforms before STFT.
        window: str
            The window type for STFT.
        center: bool
             whether to pad input on both sides so that the t-th frame is centered at time `t x hop_length`.
        normalized: bool
            controls whether to return the normalized STFT results
        onesided: bool
            controls whether to return half of results to avoid redundancy for real inputs.
        mag_spec: bool
            controls whether to calculate the linear magnitude spectrogram during STFT.
            True feeds the linear magnitude (energy) spectrogram into mel-fbank.
            False feeds the linear power spectrogram into mel-fbank.
        return_energy: bool
            Whether to calculate the frame-wise energy for the linear magnitude (energy) spectrogram
        fmin: float
            The minimal frequency for the mel-fbank
        fmax: float
            The maximal frequency for the mel-fbank
        clamp: float
            The minimal number for the log-mel spectrogram. Used for stability.
        logging: bool
            Controls whether the mel spectrograms are logged
        log_base: float
            The log base for the log-mel spectrogram. None means the natural log base e.
        mel_scale: str
            The type of mel-scale of the mel-fbank.
        mel_norm: bool
            Whether to perform area normalization on the mel-fbank filters.
        delta_order: int
            The delta order you want to add to the original log-mel spectrogram.
            1 means original log-mel spectrogram + $\delta$ Log-mel spectrogram
            2 means original log-mel spectrogram + $\delta$ Log-mel spectrogram + $\delta\delta$ log-mel spectrogram
        delta_N: int
            The number of neighboring points used for calculating the delta features.

    """
    # NOTE: the docstring above is a raw string on purpose: the LaTeX markup contains
    # backslashes that would otherwise be parsed as (invalid) escape sequences.

    # if hop_length and win_length are given in the unit of seconds, turn them into the corresponding time steps
    # (int() truncates towards zero, so fractional sample counts are rounded down)
    hop_length = (
        int(hop_length * sr) if isinstance(hop_length, float) else hop_length
    )
    win_length = (
        int(win_length * sr) if isinstance(win_length, float) else win_length
    )

    # if n_fft is not given, it will be initialized to the window length
    if n_fft is None:
        n_fft = win_length

    # para recording: each delta order contributes one more group of n_mels features
    self.output_size = n_mels if delta_order is None else n_mels * (delta_order + 1)

    # Speech -> Linear Spectrogram (linear spectrograms are not logged for getting the mel spectrograms)
    self.return_energy = return_energy
    self.speech2linear = Speech2LinearSpec(
        n_fft=n_fft,
        sr=sr,
        hop_length=hop_length,
        win_length=win_length,
        preemphasis=preemphasis,
        pre_stft_norm=pre_stft_norm,
        window=window,
        center=center,
        normalized=normalized,
        onesided=onesided,
        mag_spec=mag_spec,
        return_energy=return_energy,
        logging=False,
    )
    # Linear Spectrogram -> (Log-)Mel Spectrogram
    self.linear2mel = LinearSpec2MelSpec(
        sr=sr,
        n_fft=n_fft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        clamp=clamp,
        logging=logging,
        log_base=log_base,
        mel_scale=mel_scale,
        mel_norm=mel_norm,
        mag_spec=mag_spec,
    )
    # (Optional) (Log-)Mel Spectrogram -> (Log-)Mel Spectrogram + Deltas
    # self.delta and self.delta_N only exist when delta_order is given
    self.delta_order = delta_order
    if delta_order is not None:
        self.delta_N = delta_N
        self.delta = DeltaFeature(delta_order=delta_order, delta_N=delta_N)

recover(feat, feat_len)

Parameters:

Name Type Description Default
feat Tensor
required
feat_len Tensor
required

Returns:

Source code in speechain/module/frontend/speech2mel.py
def recover(self, feat: torch.Tensor, feat_len: torch.Tensor):
    """Invert the frontend: map (log-)mel spectrograms back through both sub-modules.

    The features are first passed to `self.linear2mel.recover` and the result is then
    passed to `self.speech2linear.recover` (waveform reconstruction; the original
    inline comment names the "GL algorithm", presumably Griffin-Lim).

    Args:
        feat:
            The (log-)mel spectrograms to be recovered. They must not contain deltas.
        feat_len:
            The lengths of `feat`.

    Returns:
        The pair returned by `self.speech2linear.recover`, i.e. the recovered
        signals and their lengths.

    Raises:
        AssertionError: if this frontend was initialized with `delta_order`,
            since delta features cannot be inverted here.

    """
    # No delta recovery. An explicit raise is used instead of a bare `assert`
    # so that the check is still enforced when Python runs with -O.
    if self.delta_order is not None:
        raise AssertionError(
            "recover() is not supported when delta features are enabled "
            f"(delta_order={self.delta_order})."
        )

    # Log-Mel Spectrogram -> Linear Spectrogram
    feat = self.linear2mel.recover(feat, feat_len)

    # Linear Spectrogram -> Waveforms (GL algorithm)
    feat, feat_len = self.speech2linear.recover(feat, feat_len)

    return feat, feat_len