Initial commit

2022-08-30 09:56:27 +02:00
parent 7e1aaf0406
commit 3bf5181687
1 changed files with 59 additions and 0 deletions
@@ -0,0 +1,59 @@
+# rm test.wav; python TTS.py; play test.wav
+
+# Sources:
+# — https://github.com/AI-Guru/arxiv-reader
+# — https://huggingface.co/facebook/fastspeech2-en-ljspeech
+
+from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
+from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
+import scipy
+import numpy as np
+import IPython.display as ipd
+import re
+
+
+models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
+    "facebook/fastspeech2-en-ljspeech",
+    arg_overrides={"vocoder": "hifigan", "fp16": False}
+)
+
+TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
+generator = task.build_generator(models, cfg)
+
+full_wave_file = []
+rate = 44100
+sentences = []
+
+# Read input file
+with open(f"input.txt", "r") as f:
+    lines = f.readlines()
+
+# Convert to sentences
+for line in lines:
+    line = line.replace("-", " - ")
+    line = line.replace("—", ". ")
+    line = line.replace(";", ". ")
+    for x in line.split(". "):
+        # print(x)
+        sentences.append(x.strip())
+        # print(sentences)
+        sentences.append("<PAUSE>")
+        # print(sentences)
+
+# Synthesis text
+for text in sentences:
+    if text == "":
+        continue
+
+    if text == "<PAUSE>":
+        full_wave_file.extend(np.zeros(rate))
+        continue
+
+    sample = TTSHubInterface.get_model_input(task, text)
+    wav, rate = TTSHubInterface.get_prediction(task, models[0], generator, sample)
+    
+    wav = wav.numpy()
+    full_wave_file.extend(wav)
+
+full_wave_file = np.array(full_wave_file, dtype=np.float32)
+scipy.io.wavfile.write("test.wav", rate, full_wave_file)