# Usage:
#   rm test.wav; python tts.py; play test.wav
#
# Sources:
#   - https://huggingface.co/facebook/fastspeech2-en-ljspeech
#   - https://github.com/AI-Guru/arxiv-reader
"""Read ``input.txt``, synthesize it sentence by sentence and write ``test.wav``.

The text handling (``split_sentences``) and the audio assembly
(``synthesize``) are kept free of any model dependency; fairseq is imported
only inside ``main`` so those helpers can be imported without loading a model.
"""

from typing import Callable, Iterable, List, Tuple

import numpy as np
import scipy.io.wavfile  # also binds ``scipy``; guarantees the submodule is loaded

try:
    # Not used by the script itself; kept available for interactive use,
    # but its absence must not prevent the script from running.
    import IPython.display as ipd
except ImportError:
    ipd = None

INPUT_PATH = "input.txt"
OUTPUT_PATH = "test.wav"
MODEL_NAME = "facebook/fastspeech2-en-ljspeech"

# Sample rate written to the output only when nothing at all was synthesized.
DEFAULT_RATE = 44100

# Marker placed in the sentence list at the end of every input line.
PAUSE = ""

# Punctuation rewritten before splitting. Every "sentence break" character is
# turned into ". " so that one split on ". " handles all of them. No
# replacement text contains a character that is itself a key, so a single
# translate pass gives the same result as chained str.replace calls.
_PUNCTUATION = str.maketrans(
    {
        "-": " - ",
        "/": ", ",
        "—": ". ",
        ":": ". ",
        ";": ". ",
        "(": ". ",
        ")": ". ",
    }
)


def split_sentences(lines: Iterable[str]) -> List[str]:
    """Split raw text lines into fragments to synthesize one at a time.

    Args:
        lines: Lines of input text (trailing newlines are fine).

    Returns:
        The non-empty, whitespace-stripped fragments of every line, in order,
        with one ``PAUSE`` marker appended after each line (blank lines
        therefore contribute only a marker).
    """
    sentences: List[str] = []
    for line in lines:
        fragments = (part.strip() for part in line.translate(_PUNCTUATION).split(". "))
        sentences.extend(part for part in fragments if part)
        sentences.append(PAUSE)
    return sentences


def synthesize(
    sentences: Iterable[str],
    synthesize_one: Callable[[str], Tuple[object, int]],
    pause_seconds: float = 0.0,
    default_rate: int = DEFAULT_RATE,
) -> Tuple[np.ndarray, int]:
    """Synthesize every sentence and join the audio into one waveform.

    Args:
        sentences: Fragments as produced by ``split_sentences``; ``PAUSE``
            entries mark line ends.
        synthesize_one: Callable mapping a text fragment to
            ``(samples, sample_rate)``; ``samples`` may be anything
            ``numpy.asarray`` accepts.
        pause_seconds: Silence inserted for each ``PAUSE`` marker. The default
            of 0 skips the markers entirely.
        default_rate: Sample rate returned when no fragment was synthesized.

    Returns:
        ``(audio, rate)`` where ``audio`` is a float32 array and ``rate`` is
        the rate reported by the last ``synthesize_one`` call.
    """
    chunks: List[np.ndarray] = []
    rate = default_rate
    rate_known = False
    for text in sentences:
        if text == PAUSE:
            # Silence is sized from the rate the synthesizer reported, so a
            # pause before any speech (rate still unknown) is dropped.
            if rate_known and pause_seconds > 0:
                chunks.append(
                    np.zeros(int(round(rate * pause_seconds)), dtype=np.float32)
                )
            continue
        wav, rate = synthesize_one(text)
        rate_known = True
        chunks.append(np.asarray(wav, dtype=np.float32))

    # Collect whole arrays and join once; extending a Python list sample by
    # sample is very slow for audio-sized data.
    audio = np.concatenate(chunks) if chunks else np.zeros(0, dtype=np.float32)
    return audio, rate


def main(
    input_path: str = INPUT_PATH,
    output_path: str = OUTPUT_PATH,
    pause_seconds: float = 0.0,
) -> None:
    """Convert the text in ``input_path`` to speech and write ``output_path``."""
    # Read the input first so a missing file fails before the model is loaded.
    # UTF-8 is explicit because the splitter matches the non-ASCII "—".
    with open(input_path, "r", encoding="utf-8") as f:
        sentences = split_sentences(f)

    # Imported here so the helpers above stay importable without fairseq.
    from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
    from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

    models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
        MODEL_NAME,
        arg_overrides={"vocoder": "hifigan", "fp16": False},
    )
    TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
    generator = task.build_generator(models, cfg)

    def synthesize_one(text: str):
        sample = TTSHubInterface.get_model_input(task, text)
        wav, rate = TTSHubInterface.get_prediction(task, models[0], generator, sample)
        # presumably a torch tensor (the original called .numpy() on it);
        # detach/cpu keep the conversion valid regardless of device or grad.
        return wav.detach().cpu().numpy(), rate

    audio, rate = synthesize(sentences, synthesize_one, pause_seconds=pause_seconds)
    scipy.io.wavfile.write(output_path, rate, audio)


if __name__ == "__main__":
    main()