# tts/tts.py

# rm test.wav; python tts.py; play test.wav

# Sources:
# — https://huggingface.co/facebook/fastspeech2-en-ljspeech
# — https://github.com/AI-Guru/arxiv-reader

from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
import scipy
import numpy as np
import IPython.display as ipd

# Marker placed after every sentence fragment; rendered as silence.
_PAUSE_TOKEN = "<PAUSE>"

# Length of the silence rendered for each pause marker.
_PAUSE_SECONDS = 1.0

# Sample rate used only when no sentence was synthesized, so the model never
# reported its own rate.
_FALLBACK_RATE = 44100

# Punctuation rewritten before splitting: most of it becomes a sentence
# break (". "); hyphens are spaced out and slashes become commas.
_PUNCTUATION_TABLE = str.maketrans({
    "-": " - ",
    "/": ", ",
    "—": ". ",
    ":": ". ",
    ";": ". ",
    "(": ". ",
    ")": ". ",
})


def _split_sentences(lines):
    """Turn raw text lines into a flat list of fragments and pause markers.

    Each line is normalized with ``_PUNCTUATION_TABLE`` and split on ". ".
    Every resulting fragment (stripped of surrounding whitespace) is followed
    by one ``_PAUSE_TOKEN``.

    Empty fragments are kept, each with its own pause marker, so blank lines
    and adjacent punctuation produce longer silences. Callers are expected to
    skip the empty strings themselves.

    Args:
        lines: Iterable of text lines (for example an open text file).

    Returns:
        A list alternating between fragment strings and ``_PAUSE_TOKEN``.
    """
    sentences = []
    for line in lines:
        normalized = line.translate(_PUNCTUATION_TABLE)
        for fragment in normalized.split(". "):
            sentences.append(fragment.strip())
            sentences.append(_PAUSE_TOKEN)
    return sentences


def main():
    """
    Read ``input.txt``, synthesize it sentence by sentence and write ``test.wav``.

    The "facebook/fastspeech2-en-ljspeech" model is loaded with the "hifigan"
    vocoder override. Each sentence fragment of the input is synthesized
    separately and the results are joined with one-second silences.
    """
    # Imported here because a bare `import scipy` does not guarantee that the
    # `scipy.io.wavfile` submodule is loaded on every SciPy version.
    from scipy.io import wavfile

    models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
        "facebook/fastspeech2-en-ljspeech",
        arg_overrides={"vocoder": "hifigan", "fp16": False}
    )

    TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
    generator = task.build_generator(models, cfg)

    # The input may contain non-ASCII punctuation such as "—", so do not rely
    # on the platform's default encoding.
    with open("input.txt", "r", encoding="utf-8") as f:
        sentences = _split_sentences(f)

    # Speech is stored as a float32 array, a pause as None. Silence is only
    # materialized once the model's real sample rate is known; otherwise
    # pauses emitted before the first synthesized sentence would be sized
    # with a guessed rate.
    rate = None
    segments = []
    for text in sentences:
        if text == "":
            continue

        if text == _PAUSE_TOKEN:
            segments.append(None)
            continue

        sample = TTSHubInterface.get_model_input(task, text)
        wav, rate = TTSHubInterface.get_prediction(task, models[0], generator, sample)

        # detach()/cpu() are no-ops for a plain CPU tensor.
        # NOTE(review): assumes `wav` is a 1-D waveform tensor; confirm.
        segments.append(
            np.asarray(wav.detach().cpu().numpy(), dtype=np.float32)
        )

    if rate is None:
        rate = _FALLBACK_RATE

    silence = np.zeros(int(rate * _PAUSE_SECONDS), dtype=np.float32)
    chunks = [silence if segment is None else segment for segment in segments]

    # np.concatenate rejects an empty list, which happens for an empty input.
    if chunks:
        audio = np.concatenate(chunks)
    else:
        audio = np.zeros(0, dtype=np.float32)

    wavfile.write("test.wav", rate, audio)

# Run the synthesis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()