Skip to content

dump_util

Author: Heli Qi Affiliation: NAIST Date: 2022.11

en_text_process(input_text, txt_format)

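Processes a raw text string from a TTS dataset into the specified text format. Currently available text formats: `punc` — letters: lowercase; punctuation kept: single quotes, commas, periods, exclamation marks and question marks (hyphens, dashes and colons are converted to commas; semicolons and slashes to periods). `no-punc` — letters: lowercase; punctuation kept: single quotes only.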

Parameters:

Name Type Description Default
input_text str

str Unprocessed raw sentence from the TTS datasets

required
txt_format str

str The text format you want the processed sentence to have

required

Returns:

Type Description
str

Processed sentence string by your specified text format.

Source code in speechain/utilbox/dump_util.py
def en_text_process(input_text: str, txt_format: str) -> str:
    """
    Normalize a raw English sentence from a TTS dataset to the specified text format.

    Currently, available text formats:
        punc:
            Letter: lowercase
            Punctuation: single quotes, commas, periods, exclamation marks, question marks
        no-punc:
            Letter: lowercase
            Punctuation: single quotes

    Hyphens, dashes and colons are converted into commas, semicolons and slashes into
    periods. Every other non-letter character (digits included) is replaced by a blank.

    Args:
        input_text: str
            Unprocessed raw sentence from the TTS datasets
        txt_format: str
            The text format you want the processed sentence to have

    Returns:
        Processed sentence string by your specified text format.
        An empty string is returned if the input contains no letters.

    Raises:
        ValueError: if txt_format is neither 'punc' nor 'no-punc'.

    """
    # fail fast on an unsupported format instead of after all the processing
    if txt_format not in ("punc", "no-punc"):
        raise ValueError(
            f"txt_format must be one of 'punc' or 'no-punc'. But got {txt_format}!"
        )

    def is_punc(input_char: str) -> bool:
        # anything that is neither a letter nor a blank is treated as punctuation
        return not (input_char.isalpha() or input_char == " ")

    # 1st stage: turn capital letters into their lower cases
    input_text = input_text.lower()

    # 2nd stage: convert non-English letters into English counterparts and
    # all kinds of quotes into half-angle single quotes (one pass, the mappings are independent)
    char_map = str.maketrans(
        {
            "è": "e",
            "é": "e",
            "ê": "e",
            "â": "a",
            "à": "a",
            "ü": "u",
            "ñ": "n",
            "ô": "o",
            "æ": "ae",
            "œ": "oe",
            "’": "'",
            "‘": "'",
            "“": "'",
            "”": "'",
            '"': "'",
        }
    )
    input_text = input_text.translate(char_map)

    # 3rd stage: ordered multi-character rewrites (the order matters, do not shuffle)
    for old, new in (
        ("''", "'"),  # collapse doubled quotes
        (":'", ","),  # colons followed by a quote become commas
        (":", ","),
        (";", "."),
        ("--", "-"),  # double-hyphens, em dashes and macrons become hyphens ...
        ("—", "-"),
        ("¯", "-"),
        ("-", ","),  # ... and every hyphen becomes a comma
        ("/", "."),
    ):
        input_text = input_text.replace(old, new)

    # 4th stage: replace everything other than letters and ',', '.', '\'', '!', '?' by a space
    kept_marks = {",", ".", "'", "!", "?"}
    input_text = "".join(
        char if char.isalpha() or char in kept_marks else " " for char in input_text
    )

    # 5th stage: deal with single quotations by different cases
    _input_text_tmp = []
    last_idx = len(input_text) - 1
    for idx, char in enumerate(input_text):
        # save all the non-quotation characters
        if char != "'":
            _input_text_tmp.append(char)
            continue
        # remove the quotations at the beginning or end
        if idx == 0 or idx == last_idx:
            continue
        prev_char, next_char = input_text[idx - 1], input_text[idx + 1]
        # save the intra-word quotations
        if prev_char.isalpha() and next_char.isalpha():
            _input_text_tmp.append(char)
        # a quotation with a letter on the left and a blank on the right becomes a comma
        elif prev_char.isalpha() and next_char == " ":
            _input_text_tmp.append(",")
        # a quotation surrounded by two punctuation marks becomes a blank
        elif is_punc(prev_char) and is_punc(next_char):
            _input_text_tmp.append(" ")
        # in other cases, the quotation is removed
    input_text = "".join(_input_text_tmp)

    # 6th stage: collapse a run of punctuation marks (with optional blanks in between)
    # into its last mark; processed in the order '!', '?', '.', ','
    for pattern, mark in (
        (r"([.,!?]\s*)+!", "!"),
        (r"([.,!?]\s*)+\?", "?"),
        (r"([.,!?]\s*)+\.", "."),
        (r"([.,!?]\s*)+,", ","),
    ):
        input_text = re.sub(pattern, mark, input_text)

    # 7th stage: remove the blanks and punctuation marks at the beginning and the blanks
    # at the end. A sentence without any letter ends up empty here instead of raising
    # an IndexError.
    first_letter = next(
        (idx for idx, char in enumerate(input_text) if char.isalpha()),
        len(input_text),
    )
    input_text = input_text[first_letter:].rstrip(" ")

    # 8th stage: remove useless blanks
    # (the text now starts with a letter and never ends with a blank, so the
    # look-behind and look-ahead below are always in range)
    _input_text_tmp = []
    last_idx = len(input_text) - 1
    for idx, char in enumerate(input_text):
        if char == " ":
            # remove consecutive blanks and replace them by a single blank
            if input_text[idx + 1] == " ":
                continue
            # remove the blanks surrounded by letters on the left and punctuations on the right
            if _input_text_tmp[-1].isalpha() and is_punc(input_text[idx + 1]):
                continue
        elif is_punc(char) and char != "'" and idx < last_idx:
            # add a space between punctuation marks on the left and letters on the right
            if input_text[idx + 1].isalpha():
                _input_text_tmp.append(f"{char} ")
                continue
            # only retain the last one of consecutive punctuation marks
            if is_punc(input_text[idx + 1]):
                continue
        _input_text_tmp.append(char)
    input_text = "".join(_input_text_tmp)

    # remain all the punctuation marks
    if txt_format == "punc":
        return input_text

    # 'no-punc': remove all the punctuation marks other than single quotes
    return "".join(
        char for char in input_text if char.isalpha() or char in ("'", " ")
    )