Initial commit

2022-08-30 09:56:27 +02:00
parent 7e1aaf0406
commit 3bf5181687
1 changed files with 59 additions and 0 deletions
@@ -0,0 +1,59 @@
 # rm test.wav; python TTS.py; play test.wav
 # Sources:
 # — https://github.com/AI-Guru/arxiv-reader
 # — https://huggingface.co/facebook/fastspeech2-en-ljspeech
 import re

 import IPython.display as ipd
 import numpy as np
 import scipy
 import scipy.io.wavfile
 from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
 from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
 # Load the "facebook/fastspeech2-en-ljspeech" checkpoint through the Hugging
 # Face hub helper; the overrides request the "hifigan" vocoder and set fp16
 # to False.
 _overrides = dict(vocoder="hifigan", fp16=False)
 models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
     "facebook/fastspeech2-en-ljspeech", arg_overrides=_overrides
 )
 # Fold the task's data config into cfg before the generator is built from it.
 TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
 generator = task.build_generator(models, cfg)
 # State shared with the synthesis stage further down.
 full_wave_file = []
 # Placeholder sample rate; the synthesis loop rebinds it to the value the
 # model returns.
 rate = 44100
 sentences = []
 # Read input file. The encoding is explicit because the text may contain the
 # non-ASCII em dash handled below, which a platform-default encoding is not
 # guaranteed to decode correctly (assumes input.txt is UTF-8).
 with open("input.txt", "r", encoding="utf-8") as f:
     lines = f.readlines()
 # Single-pass punctuation rewrite: pad hyphens with spaces, and turn em dashes
 # and semicolons into ". " so they split exactly like sentence ends.
 _punctuation = str.maketrans({"-": " - ", "—": ". ", ";": ". "})
 # Convert to sentences: each ". "-separated piece is followed by a pause marker.
 for line in lines:
     for piece in line.translate(_punctuation).split(". "):
         # Empty pieces are kept on purpose; the synthesis loop skips them but
         # still honours the pause marker that follows.
         sentences.append(piece.strip())
         sentences.append("<PAUSE>")
 # Synthesize the text.
 # Collect one array per spoken sentence and None per pause. Pauses are turned
 # into silence only after the loop, because the real sample rate is known only
 # once the model has produced audio; sizing them with the placeholder rate
 # would give the wrong duration to any pause before the first spoken sentence.
 chunks = []
 for text in sentences:
     if text == "":
         continue
     if text == "<PAUSE>":
         chunks.append(None)
         continue
     sample = TTSHubInterface.get_model_input(task, text)
     wav, rate = TTSHubInterface.get_prediction(task, models[0], generator, sample)
     chunks.append(wav.numpy())
 # One second of silence at the output sample rate (this is still the initial
 # `rate` when nothing was synthesized).
 silence = np.zeros(rate, dtype=np.float32)
 # Join whole arrays instead of extending a Python list sample by sample, which
 # would box every audio sample into its own Python object.
 full_wave_file = np.concatenate(
     [silence if chunk is None else chunk for chunk in chunks] or [silence[:0]]
 ).astype(np.float32, copy=False)
 scipy.io.wavfile.write("test.wav", rate, full_wave_file)