# rm test.wav; python tts.py; play test.wav

# Sources:
# — https://huggingface.co/facebook/fastspeech2-en-ljspeech
# — https://github.com/AI-Guru/arxiv-reader

import IPython.display as ipd
import numpy as np
import scipy
import scipy.io.wavfile
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface


# Load the "facebook/fastspeech2-en-ljspeech" checkpoint through fairseq's
# Hugging Face hub helper, overriding the vocoder ("hifigan") and disabling
# fp16. The helper returns the model ensemble, its config and the task.
_hub_model_id = "facebook/fastspeech2-en-ljspeech"
_hub_overrides = {"vocoder": "hifigan", "fp16": False}
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    _hub_model_id, arg_overrides=_hub_overrides
)

# Merge the task's data config into cfg before the generator is built from it.
_data_cfg = task.data_cfg
TTSHubInterface.update_cfg_with_data_cfg(cfg, _data_cfg)
generator = task.build_generator(models, cfg)

# State shared with the synthesis step further down the script.
full_wave_file = []
# Placeholder sample rate; the synthesis loop reassigns `rate` from the
# model's prediction as soon as a sentence has been synthesized.
rate = 44100
sentences = []

# Punctuation is rewritten so that a plain split on ". " yields short,
# speakable fragments: dashes are spaced out, slashes become commas, and
# em dashes, colons, semicolons and parentheses become sentence breaks.
# No replacement text contains a character that is itself a key, so one
# translate() pass matches applying the replacements one after another.
_PUNCTUATION_MAP = str.maketrans(
    {
        "-": " - ",
        "/": ", ",
        "—": ". ",
        ":": ". ",
        ";": ". ",
        "(": ". ",
        ")": ". ",
    }
)

# Read input file. The encoding is explicit so that non-ASCII punctuation
# (such as the em dash handled above) decodes the same on every platform
# instead of depending on the locale default.
with open("input.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Convert to sentences. Every fragment, including empty ones produced by
# blank lines or adjacent punctuation, is followed by a "<PAUSE>" marker;
# empty fragments are skipped later, but their pause is still emitted.
for line in lines:
    for fragment in line.translate(_PUNCTUATION_MAP).split(". "):
        sentences.append(fragment.strip())
        sentences.append("<PAUSE>")

# Synthesize the text.
#
# Audio is collected as a list of arrays and concatenated once at the end,
# rather than extending a Python list one sample at a time. Pauses are
# recorded as None and only turned into silence after the loop, so each one
# lasts one second at the sample rate reported by the model. Sizing them
# inside the loop would use the placeholder `rate` for any pause that comes
# before the first synthesized sentence.
segments = []
for text in sentences:
    if text == "":
        continue

    if text == "<PAUSE>":
        segments.append(None)
        continue

    sample = TTSHubInterface.get_model_input(task, text)
    wav, rate = TTSHubInterface.get_prediction(task, models[0], generator, sample)

    # NOTE(review): assumes the waveform is a 1-D CPU tensor — confirm.
    segments.append(np.asarray(wav.numpy(), dtype=np.float32))

# One second of silence at the final sample rate. If nothing was synthesized,
# `rate` still holds the placeholder value set earlier in the script.
silence = np.zeros(rate, dtype=np.float32)

if segments:
    full_wave_file = np.concatenate(
        [silence if segment is None else segment for segment in segments]
    ).astype(np.float32, copy=False)
else:
    # np.concatenate rejects an empty list; write an empty track instead.
    full_wave_file = np.zeros(0, dtype=np.float32)

scipy.io.wavfile.write("test.wav", rate, full_wave_file)