Initial commit

This commit is contained in:
Heiko J Schick
2022-08-30 09:56:27 +02:00
parent 7e1aaf0406
commit 3bf5181687
+59
View File
@@ -0,0 +1,59 @@
# rm test.wav; python TTS.py; play test.wav
# Sources:
# — https://github.com/AI-Guru/arxiv-reader
# — https://huggingface.co/facebook/fastspeech2-en-ljspeech
import re

import IPython.display as ipd
import numpy as np
import scipy
import scipy.io.wavfile
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
# Load the "facebook/fastspeech2-en-ljspeech" checkpoint, overriding its
# arguments to select the "hifigan" vocoder and to disable fp16.
hub_overrides = {"vocoder": "hifigan", "fp16": False}
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    "facebook/fastspeech2-en-ljspeech", arg_overrides=hub_overrides
)

# Merge the task's data configuration into cfg before building the generator
# that is later passed to TTSHubInterface.get_prediction.
TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
generator = task.build_generator(models, cfg)
# State shared with the synthesis step further down the script.
full_wave_file = []
# Fallback sample rate in Hz; the synthesis loop rebinds `rate` to the value
# returned by the model's prediction.
rate = 44100
sentences = []


def _split_into_sentences(line: str) -> list:
    """Return the whitespace-stripped sentence fragments of one input line.

    Hyphens are padded with spaces, semicolons are turned into sentence
    ends, and the result is split on ". ".  Empty fragments are kept; the
    synthesis loop skips them.
    """
    line = line.replace("-", " - ")
    # NOTE(review): a second replacement used to sit here with an EMPTY
    # search string (`line.replace("", ". ")`).  str.replace with an empty
    # pattern inserts the replacement between every character, which turned
    # each character into its own sentence.  The intended character is
    # unknown (presumably lost) -- restore it here once confirmed.
    line = line.replace(";", ". ")
    return [fragment.strip() for fragment in line.split(". ")]


# Read input file
with open("input.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Convert to sentences
for line in lines:
    sentences.extend(_split_into_sentences(line))
    # A pause marker closes every input line.
    # NOTE(review): per-line placement is assumed -- confirm it should not be
    # per sentence or once at the end.
    sentences.append("<PAUSE>")
# Synthesize text: turn every sentence into audio and write the result to
# test.wav.
#
# Audio is collected as a list of arrays and joined once at the end instead of
# extending a Python list sample by sample.  Pauses are stored as None
# placeholders and only turned into silence after the loop, so that every
# pause lasts one second at the sample rate actually reported by the model
# (the initial `rate` is just a fallback for the case that nothing is
# synthesized).
audio_chunks = []
for text in sentences:
    if not text:
        # Blank fragments left over from sentence splitting.
        continue
    if text == "<PAUSE>":
        audio_chunks.append(None)
        continue
    sample = TTSHubInterface.get_model_input(task, text)
    wav, rate = TTSHubInterface.get_prediction(task, models[0], generator, sample)
    # assumes wav is a 1-D CPU tensor of samples -- TODO confirm
    audio_chunks.append(wav.numpy().astype(np.float32, copy=False))

# `rate` samples of zeros == one second of silence at the output rate.
silence = np.zeros(rate, dtype=np.float32)
pieces = [silence if chunk is None else chunk for chunk in audio_chunks]
if pieces:
    full_wave_file = np.concatenate(pieces).astype(np.float32, copy=False)
else:
    # np.concatenate rejects an empty list; still write a valid, empty file.
    full_wave_file = np.zeros(0, dtype=np.float32)

scipy.io.wavfile.write("test.wav", rate, full_wave_file)