Skip to content

char

Author: Heli Qi | Affiliation: NAIST | Date: 2022.07

CharTokenizer

Bases: Tokenizer

Tokenizer implementation that converts the input sentence string into a list of graphemes (characters).

Source code in speechain/tokenizer/char.py
class CharTokenizer(Tokenizer):
    """Tokenizer implementation that converts the input sentence string into a list of
    graphemes (characters)."""

    def text2tensor(
        self,
        text: str,
        no_sos: bool = False,
        no_eos: bool = False,
        return_tensor: bool = True,
    ):
        """Convert a sentence string into a sequence of character-level token indices.

        Each character of ``text`` is looked up in ``self.token2idx``; characters
        that are not in that mapping are replaced by ``self.unk_idx``. The same
        index, ``self.sos_eos_idx``, serves as both the start-of-sentence and the
        end-of-sentence marker.

        Args:
            text: The sentence to tokenize. It is iterated character by character.
            no_sos: If True, do not prepend ``self.sos_eos_idx`` to the sequence.
            no_eos: If True, do not append ``self.sos_eos_idx`` to the sequence.
            return_tensor: If True, wrap the indices in a ``torch.LongTensor``;
                otherwise return them as a plain Python list.

        Returns:
            A ``torch.LongTensor`` holding the token indices when ``return_tensor``
            is True, otherwise the list of indices.

        """
        # bind the lookup table and fallback index once instead of per character
        token2idx = self.token2idx
        unk_idx = self.unk_idx

        # initialize the token sequence, optionally starting with sos
        tokens = [] if no_sos else [self.sos_eos_idx]
        # attach the main body of the text; a single dict.get() per character
        # covers both the known-character and the unknown-character case
        tokens.extend(token2idx.get(char, unk_idx) for char in text)
        # whether to attach eos at the end of the tokens
        if not no_eos:
            tokens.append(self.sos_eos_idx)

        # turn the token list into a long-type tensor if requested
        if return_tensor:
            return torch.LongTensor(tokens)
        return tokens

text2tensor(text, no_sos=False, no_eos=False, return_tensor=True)

Parameters:

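| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | The sentence to tokenize; it is processed character by character. | required |
| no_sos | bool | If True, the sos index is not attached at the beginning of the tokens. | False |
| no_eos | bool | If True, the eos index is not attached at the end of the tokens. | False |
| return_tensor | bool | If True, the tokens are returned as a `torch.LongTensor`; otherwise as a list. | True |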

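Returns: a `torch.LongTensor` of token indices when `return_tensor` is True; otherwise the list of token indices.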

Source code in speechain/tokenizer/char.py
def text2tensor(
    self,
    text: str,
    no_sos: bool = False,
    no_eos: bool = False,
    return_tensor: bool = True,
):
    """Convert a sentence string into a sequence of character-level token indices.

    Each character of ``text`` is looked up in ``self.token2idx``; characters
    that are not in that mapping are replaced by ``self.unk_idx``. The same
    index, ``self.sos_eos_idx``, serves as both the start-of-sentence and the
    end-of-sentence marker.

    Args:
        text: The sentence to tokenize. It is iterated character by character.
        no_sos: If True, do not prepend ``self.sos_eos_idx`` to the sequence.
        no_eos: If True, do not append ``self.sos_eos_idx`` to the sequence.
        return_tensor: If True, wrap the indices in a ``torch.LongTensor``;
            otherwise return them as a plain Python list.

    Returns:
        A ``torch.LongTensor`` holding the token indices when ``return_tensor``
        is True, otherwise the list of indices.

    """
    # bind the lookup table and fallback index once instead of per character
    token2idx = self.token2idx
    unk_idx = self.unk_idx

    # initialize the token sequence, optionally starting with sos
    tokens = [] if no_sos else [self.sos_eos_idx]
    # attach the main body of the text; a single dict.get() per character
    # covers both the known-character and the unknown-character case
    tokens.extend(token2idx.get(char, unk_idx) for char in text)
    # whether to attach eos at the end of the tokens
    if not no_eos:
        tokens.append(self.sos_eos_idx)

    # turn the token list into a long-type tensor if requested
    if return_tensor:
        return torch.LongTensor(tokens)
    return tokens