def en_text_process(input_text: str, txt_format: str) -> str:
    """
    Normalize a raw English sentence from a TTS dataset into the requested text format.

    Available text formats:
        punc:
            Letter: lowercase
            Punctuation: intra-word single quotes, commas, periods,
                exclamation marks and question marks
        no-punc:
            Letter: lowercase
            Punctuation: intra-word single quotes

    Normalization applied to both formats, in order:
        1. Lowercase the text.
        2. Map a fixed set of accented letters and ligatures to plain letters.
        3. Turn every kind of quote into a single quote "'".
        4. Turn colons, hyphens and dashes into commas; semicolons and slashes
           into periods.
        5. Replace every remaining non-letter character (digits included) other
           than , . ' ! ? by a blank.
        6. Keep only the single quotes that sit between two letters.
        7. Collapse runs of punctuation marks to the last mark of the run.
        8. Strip leading non-letters and trailing blanks, then tidy the spacing
           around punctuation marks.

    Args:
        input_text: str
            Unprocessed raw sentence from the TTS datasets
        txt_format: str
            The text format you want the processed sentence to have,
            either 'punc' or 'no-punc'

    Returns:
        Processed sentence string in the specified text format. An empty string
        is returned when the input contains no letters at all.

    Raises:
        ValueError: if txt_format is neither 'punc' nor 'no-punc'.
    """
    # fail fast on an unknown format instead of normalizing the text first
    if txt_format not in ("punc", "no-punc"):
        raise ValueError(
            f"txt_format must be one of 'punc' or 'no-punc'. But got {txt_format}!"
        )

    text = _en_normalize_symbols(input_text.lower())
    text = _en_resolve_single_quotes(text)
    text = _en_collapse_punctuation(text)
    text = _en_tidy_spacing(text)

    # retain all the remaining punctuation marks
    if txt_format == "punc":
        return text
    # remove all the punctuation marks other than single quotes
    return "".join([char for char in text if char.isalpha() or char in ("'", " ")])


# Ordered (old, new) substitutions. They are applied one after another, so the
# order matters: quotes must be unified before "''" and ":'" are handled, and
# "--" must be merged before single hyphens are turned into commas.
_EN_REPLACEMENTS = (
    # non-English letters -> English counterparts
    ("è", "e"),
    ("é", "e"),
    ("ê", "e"),
    ("â", "a"),
    ("à", "a"),
    ("ü", "u"),
    ("ñ", "n"),
    ("ô", "o"),
    ("æ", "ae"),
    ("œ", "oe"),
    # all kinds of quotes -> half-angle single quote
    ("’", "'"),
    ("‘", "'"),
    ("“", "'"),
    ("”", "'"),
    ('"', "'"),
    ("''", "'"),
    # colons (a colon followed by a quote swallows the quote) and semicolons
    (":'", ","),
    (":", ","),
    (";", "."),
    # double hyphens, dashes, hyphens and slashes
    ("--", "-"),
    ("—", "-"),
    ("¯", "-"),
    ("-", ","),
    ("/", "."),
)

# The only non-letter, non-blank characters that survive symbol normalization.
_EN_KEPT_PUNCTUATION = frozenset(",.'!?")


def _en_is_punc(char: str) -> bool:
    """Return True for anything that is neither a letter nor a blank."""
    return not (char.isalpha() or char == " ")


def _en_normalize_symbols(text: str) -> str:
    """Apply the substitution table, then blank out every unsupported character."""
    for old, new in _EN_REPLACEMENTS:
        text = text.replace(old, new)
    return "".join(
        char if char.isalpha() or char in _EN_KEPT_PUNCTUATION else " "
        for char in text
    )


def _en_resolve_single_quotes(text: str) -> str:
    """Keep intra-word single quotes; drop or replace all the other ones."""
    last_idx = len(text) - 1
    kept = []
    for idx, char in enumerate(text):
        # save all the non-quotation characters
        if char != "'":
            kept.append(char)
            continue
        # remove the quotations at the beginning or end
        if idx == 0 or idx == last_idx:
            continue
        # neighbours are looked up in the unmodified text on purpose
        prev_char, next_char = text[idx - 1], text[idx + 1]
        if prev_char.isalpha() and next_char.isalpha():
            # intra-word quotation (e.g. "don't")
            kept.append(char)
        elif prev_char.isalpha() and next_char == " ":
            # a closing quotation right after a word acts as a pause
            kept.append(",")
        elif _en_is_punc(prev_char) and _en_is_punc(next_char):
            # squeezed between two punctuation marks -> blank
            kept.append(" ")
        # in all the other cases the quotation is simply removed
    return "".join(kept)


def _en_collapse_punctuation(text: str) -> str:
    """Collapse each run of punctuation marks (blanks allowed) to its last mark."""
    # Imported locally so the helper does not depend on a module-level import.
    import re

    # The order is significant: runs ending in '!' are merged first, then '?',
    # then '.', then ','.
    for mark in ("!", "?", ".", ","):
        text = re.sub(r"([.,!?]\s*)+" + re.escape(mark), mark, text)
    return text


def _en_tidy_spacing(text: str) -> str:
    """Strip the edges of the sentence and normalize blanks around punctuation."""
    # remove the blanks and punctuation marks at the beginning; a text without
    # any letter becomes empty instead of raising an IndexError
    first_letter = next(
        (idx for idx, char in enumerate(text) if char.isalpha()), len(text)
    )
    # remove the blanks at the end
    text = text[first_letter:].rstrip(" ")
    if not text:
        return ""

    # From here on the text starts with a letter and does not end with a blank,
    # so `text[idx + 1]` exists for every blank and `tidy` is never empty when
    # a blank is inspected.
    last_idx = len(text) - 1
    tidy = []
    for idx, char in enumerate(text):
        if char == " ":
            next_char = text[idx + 1]
            # only retain the last one of consecutive blanks
            if next_char == " ":
                continue
            # remove the blank between a letter and a following punctuation mark
            if tidy[-1].isalpha() and _en_is_punc(next_char):
                continue
        elif _en_is_punc(char) and char != "'" and idx < last_idx:
            next_char = text[idx + 1]
            # add a blank between a punctuation mark and a following letter
            if next_char.isalpha():
                tidy.append(f"{char} ")
                continue
            # only retain the last one of consecutive punctuation marks
            if _en_is_punc(next_char):
                continue
        tidy.append(char)
    return "".join(tidy)