Repository: R2D2FISH/glados-tts Branch: main Commit: 1f49d17cd409 Files: 24 Total size: 21.2 KB Directory structure: gitextract_gq5vqgst/ ├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── engine.py ├── glados.py ├── models/ │ ├── emb/ │ │ ├── glados_p1.pt │ │ └── glados_p2.pt │ ├── en_us_cmudict_ipa_forward.pt │ ├── glados-new.pt │ ├── glados.pt │ ├── vocoder-cpu-hq.pt │ ├── vocoder-cpu-lq.pt │ └── vocoder-gpu.pt ├── requirements.txt └── utils/ ├── __init__.py ├── text/ │ ├── LICENSE │ ├── __init__.py │ ├── cleaners.py │ ├── numbers.py │ ├── recipes.py │ ├── symbols.py │ └── tokenizer.py └── tools.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ *.pt filter=lfs diff=lfs merge=lfs -text ================================================ FILE: .gitignore ================================================ **/__pycache__/** *.wav audio/* ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2022 Timmy Knight Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # GLaDOS Text-to-speech (TTS) Voice Generator Neural network based TTS Engine. If you want to just play around with the TTS, this works as stand-alone. ```console python3 glados-tts/glados.py ``` the TTS Engine can also be used remotely on a machine more powerful then the Pi to process in house TTS: (executed from glados-tts directory ```console python3 engine-remote.py ``` Default port is 8124 Be sure to update settings.env variable in your main Glados-voice-assistant directory: ``` TTS_ENGINE_API = http://192.168.1.3:8124/synthesize/ ``` ## Training (New Model) The Tacotron and ForwardTacotron models were trained as multispeaker models on two datasets separated into three speakers. LJSpeech (13,100 lines), and then on the heavily modified version of the Ellen McClain dataset, separated into Portal 1 and 2 voices (with punctuation and corrections added manually). The lines from the end of Portal 1 after the cores get knocked off were counted as Portal 2 lines. ## Training (Old Model) The initial, regular Tacotron model was trained first on LJSpeech, and then on a heavily modified version of the Ellen McClain dataset (all non-Portal 2 voice lines removed, punctuation added). * The Forward Tacotron model was only trained on about 600 voice lines. * The HiFiGAN model was generated through transfer learning from the sample. * All models have been optimized and quantized. ## Installation Instruction If you want to install the TTS Engine on your machine, please follow the steps below. 1. Download the model files from [`Google Drive`](https://drive.google.com/file/d/1TRJtctjETgVVD5p7frSVPmgw8z8FFtjD/view?usp=sharing) and unzip into the repo folder 2. Install the required Python packages, e.g., by running `pip install -r requirements.txt` ================================================ FILE: engine.py ================================================ import sys import os sys.path.insert(0, os.getcwd()+'/glados_tts') import torch from utils.tools import prepare_text from scipy.io.wavfile import write import time from glados import tts_runner print("\033[1;94mINFO:\033[;97m Initializing TTS Engine...") glados = tts_runner(False, True) def glados_tts(text, key=False, alpha=1.0): if(key): output_file = ('audio/GLaDOS-tts-temp-output-'+key+'.wav') else: output_file = ('audio/GLaDOS-tts-temp-output.wav') glados.run_tts(text, alpha).export(output_file, format = "wav") return True # If the script is run directly, assume remote engine if __name__ == "__main__": # Remote Engine Veritables PORT = 8124 CACHE = True from flask import Flask, request, send_file import urllib.parse import shutil print("\033[1;94mINFO:\033[;97m Initializing TTS Server...") app = Flask(__name__) @app.route('/synthesize/', defaults={'text': ''}) @app.route('/synthesize/') def synthesize(text): if(text == ''): return 'No input' line = urllib.parse.unquote(request.url[request.url.find('synthesize/')+11:]) filename = "GLaDOS-tts-"+line.replace(" ", "-") filename = filename.replace("!", "") filename = filename.replace("°c", "degrees celcius") filename = filename.replace(",", "")+".wav" file = os.getcwd()+'/audio/'+filename # Check for Local Cache if(os.path.isfile(file)): # Update access time. This will allow for routine cleanups os.utime(file, None) print("\033[1;94mINFO:\033[;97m The audio sample sent from cache.") return send_file(file) # Generate New Sample key = str(time.time())[7:] if(glados_tts(line, key)): tempfile = os.getcwd()+'/audio/GLaDOS-tts-temp-output-'+key+'.wav' # If the line isn't too long, store in cache if(len(line) < 200 and CACHE): shutil.move(tempfile, file) else: return send_file(tempfile) os.remove(tempfile) return send_file(file) else: return 'TTS Engine Failed' cli = sys.modules['flask.cli'] cli.show_server_banner = lambda *x: None app.run(host="0.0.0.0", port=PORT) ================================================ FILE: glados.py ================================================ import torch import numpy as np from utils.tools import prepare_text from scipy.io.wavfile import write import time import tempfile import subprocess from pydub import AudioSegment from pydub.playback import play from nltk import download from nltk.tokenize import sent_tokenize from sys import modules as mod try: import winsound except ImportError: from subprocess import call print("Initializing TTS Engine...") kwargs = { 'stdout':subprocess.PIPE, 'stderr':subprocess.PIPE, 'stdin':subprocess.PIPE } class tts_runner: def __init__(self, use_p1: bool=False, log: bool=False): self.log = log if use_p1: self.emb = torch.load('models/emb/glados_p1.pt') else: self.emb = torch.load('models/emb/glados_p2.pt') # Select the device if torch.cuda.is_available(): self.device = 'cuda' elif torch.is_vulkan_available(): self.device = 'vulkan' else: self.device = 'cpu' # Load models self.glados = torch.jit.load('models/glados-new.pt') self.vocoder = torch.jit.load('models/vocoder-gpu.pt', map_location=self.device) for i in range(2): init = self.glados.generate_jit(prepare_text(str(i)), self.emb, 1.0) init_mel = init['mel_post'].to(self.device) init_vo = self.vocoder(init_mel) def run_tts(self, text, alpha: float=1.0) -> AudioSegment: x = prepare_text(text) with torch.no_grad(): # Generate generic TTS-output old_time = time.time() tts_output = self.glados.generate_jit(x, self.emb, alpha) if self.log: print("Forward Tacotron took " + str((time.time() - old_time) * 1000) + "ms") # Use HiFiGAN as vocoder to make output sound like GLaDOS old_time = time.time() mel = tts_output['mel_post'].to(self.device) audio = self.vocoder(mel) if self.log: print("HiFiGAN took " + str((time.time() - old_time) * 1000) + "ms") # Normalize audio to fit in wav-file audio = audio.squeeze() audio = audio * 32768.0 audio = audio.cpu().numpy().astype('int16') output_file = tempfile.TemporaryFile() write(output_file, 22050, audio) sound = AudioSegment.from_wav(output_file) output_file.close() return sound def speak_one_line(self, audio, name: str): audio.export(name, format = "wav") if 'winsound' in mod: winsound.PlaySound(name, winsound.SND_FILENAME | winsound.SND_ASYNC) else: try: subprocess.Popen(["play", name], **kwargs) except FileNotFoundError: try: subprocess.Popen(["aplay", name], **kwargs) except FileNotFoundError: subprocess.Popen(["pw-play", name], **kwargs) def speak(self, text, alpha: float=1.0, save: bool=False, delay: float=0.1): download('punkt',quiet=self.log) sentences = sent_tokenize(text) audio = self.run_tts(sentences[0]) pause = AudioSegment.silent(duration=delay) old_line = AudioSegment.silent(duration=1.0) + audio self.speak_one_line(old_line, "old_line.wav") old_time = time.time() old_dur = old_line.duration_seconds new_dur = old_dur if len(sentences) > 1: for idx in range(1, len(sentences)): if idx % 2 == 1: new_line = self.run_tts(sentences[idx]) audio = audio + pause + new_line new_dur = new_line.duration_seconds else: old_line = self.run_tts(sentences[idx]) audio = audio + pause + old_line new_dur = old_line.duration_seconds time_left = old_dur - time.time() + old_time if time_left <= 0 and self.log: print("Processing is slower than realtime!") else: time.sleep(time_left + delay) if idx % 2 == 1: self.speak_one_line(new_line, "new_line.wav") else: self.speak_one_line(old_line, "old_line.wav") old_time = time.time() old_dur = new_dur else: time.sleep(old_dur + 0.1) audio.export("output.wav", format = "wav") time_left = old_dur - time.time() + old_time if time_left >= 0: time.sleep(time_left + delay) if __name__ == "__main__": glados = tts_runner(False, True) while True: text = input("Input: ") if len(text) > 0: glados.speak(text, True) ================================================ FILE: models/emb/glados_p1.pt ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:de04091401bd98a69c45dadcaddda515232edf341ef39b9892e12bbad0e5be00 size 1777 ================================================ FILE: models/emb/glados_p2.pt ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:ec519ed9ae6612dd6606d59f856d0c471b034752df770365fa0c0e50e1f05fb1 size 1777 ================================================ FILE: models/en_us_cmudict_ipa_forward.pt ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:cadce3d77597b55e772799cb46994ab29a460f1a62a87207b52f3cdb29894e02 size 65637046 ================================================ FILE: models/glados-new.pt ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:e75a18e03517bd204130dfa7bd37e200be342219af1e1b537aa818401c041ed4 size 116160936 ================================================ FILE: models/glados.pt ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:e857f8eec966d76219c9bb49bd271b91ab9f115c7a033cb32082743ec9e0b97c size 75644711 ================================================ FILE: models/vocoder-cpu-hq.pt ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:68145c03288fe13f634d24d2f47238c39462cb10acb0b2ceaaa64317d74e392e size 55758413 ================================================ FILE: models/vocoder-cpu-lq.pt ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:97b0c527cfd3485359afda80b6ca1b3cf22343b4e289deccbe98bcb8e438bd71 size 5867188 ================================================ FILE: models/vocoder-gpu.pt ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:16aedb72be562822126bf155b43612ceb2836dd5e8ed5b241617aa322f30c4ce size 55753209 ================================================ FILE: requirements.txt ================================================ deep_phonemizer~=0.0.19 Flask~=3.0.0 inflect~=6.0.4 nltk~=3.8.1 numpy~=1.23.5 pandas~=2.0.0 pydub~=0.25.1 scipy~=1.10.1 torch~=2.0.0 tqdm~=4.65.0 Unidecode~=1.3.6 ================================================ FILE: utils/__init__.py ================================================ ================================================ FILE: utils/text/LICENSE ================================================ Copyright (c) 2017 Keith Ito Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: utils/text/__init__.py ================================================ ================================================ FILE: utils/text/cleaners.py ================================================ import re from typing import Dict, Any from unidecode import unidecode from utils.text.numbers import normalize_numbers from utils.text.symbols import phonemes_set from dp.phonemizer import Phonemizer import torch # Regular expression matching whitespace: _whitespace_re = re.compile(r'\s+') # List of (regular expression, replacement) pairs for abbreviations: _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ ('mrs', 'misess'), ('mr', 'mister'), ('dr', 'doctor'), ('st', 'saint'), ('co', 'company'), ('jr', 'junior'), ('maj', 'major'), ('gen', 'general'), ('drs', 'doctors'), ('rev', 'reverend'), ('lt', 'lieutenant'), ('hon', 'honorable'), ('sgt', 'sergeant'), ('capt', 'captain'), ('esq', 'esquire'), ('ltd', 'limited'), ('col', 'colonel'), ('ft', 'fort') ]] def expand_abbreviations(text): for regex, replacement in _abbreviations: text = re.sub(regex, replacement, text) return text def collapse_whitespace(text): return re.sub(_whitespace_re, ' ', text) def no_cleaners(text): return text def english_cleaners(text): text = unidecode(text) text = normalize_numbers(text) text = expand_abbreviations(text) return text class Cleaner: def __init__(self, cleaner_name: str, use_phonemes: bool, lang: str) -> None: if cleaner_name == 'english_cleaners': self.clean_func = english_cleaners elif cleaner_name == 'no_cleaners': self.clean_func = no_cleaners else: raise ValueError(f'Cleaner not supported: {cleaner_name}! ' f'Currently supported: [\'english_cleaners\', \'no_cleaners\']') self.use_phonemes = use_phonemes self.lang = lang if use_phonemes: self.phonemize = Phonemizer.from_checkpoint('models/en_us_cmudict_ipa_forward.pt') def __call__(self, text: str) -> str: text = self.clean_func(text) if self.use_phonemes: text = self.phonemize(text, lang='en_us') text = ''.join([p for p in text if p in phonemes_set]) text = collapse_whitespace(text) text = text.strip() return text @classmethod def from_config(cls, config: Dict[str, Any]) -> 'Cleaner': return Cleaner( cleaner_name=config['preprocessing']['cleaner_name'], use_phonemes=config['preprocessing']['use_phonemes'], lang=config['preprocessing']['language'] ) ================================================ FILE: utils/text/numbers.py ================================================ """ from https://github.com/keithito/tacotron """ import inflect import re _inflect = inflect.engine() _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') _number_re = re.compile(r'[0-9]+') def _remove_commas(m): return m.group(1).replace(',', '') def _expand_decimal_point(m): return m.group(1).replace('.', ' point ') def _expand_dollars(m): match = m.group(1) parts = match.split('.') if len(parts) > 2: return match + ' dollars' # Unexpected format dollars = int(parts[0]) if parts[0] else 0 cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 if dollars and cents: dollar_unit = 'dollar' if dollars == 1 else 'dollars' cent_unit = 'cent' if cents == 1 else 'cents' return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) elif dollars: dollar_unit = 'dollar' if dollars == 1 else 'dollars' return '%s %s' % (dollars, dollar_unit) elif cents: cent_unit = 'cent' if cents == 1 else 'cents' return '%s %s' % (cents, cent_unit) else: return 'zero dollars' def _expand_ordinal(m): return _inflect.number_to_words(m.group(0)) def _expand_number(m): num = int(m.group(0)) if num > 1000 and num < 3000: if num == 2000: return 'two thousand' elif num > 2000 and num < 2010: return 'two thousand ' + _inflect.number_to_words(num % 100) elif num % 100 == 0: return _inflect.number_to_words(num // 100) + ' hundred' else: return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') else: return _inflect.number_to_words(num, andword='') def normalize_numbers(text): text = re.sub(_comma_number_re, _remove_commas, text) text = re.sub(_pounds_re, r'\1 pounds', text) text = re.sub(_dollars_re, _expand_dollars, text) text = re.sub(_decimal_number_re, _expand_decimal_point, text) text = re.sub(_ordinal_re, _expand_ordinal, text) text = re.sub(_number_re, _expand_number, text) return text ================================================ FILE: utils/text/recipes.py ================================================ from multiprocessing import Pool from pathlib import Path from typing import Tuple import pandas as pd import tqdm from utils.files import get_files DEFAULT_SPEAKER_NAME = 'default_speaker' def read_metadata(path: Path, metafile: str, format: str, n_workers=1) -> Tuple[dict, dict]: if format == 'ljspeech': return read_ljspeech_format(path/metafile, multispeaker=False) elif format == 'ljspeech_multi': return read_ljspeech_format(path/metafile, multispeaker=True) elif format == 'vctk': return read_vctk_format(path, n_workers=n_workers) elif format == 'pandas': return read_pandas_format(path/metafile) else: raise ValueError(f'Metafile has unexpected ending: {path.stem}, expected [.csv, .tsv]"') def read_ljspeech_format(path: Path, multispeaker: bool = False) -> Tuple[dict, dict]: if not path.is_file(): raise ValueError(f'Could not find metafile: {path}, ' f'please make sure that you set the correct path and metafile name!') text_dict = {} speaker_dict = {} with open(str(path), encoding='utf-8') as f: for line in f: split = line.split('|') speaker_name = split[-2] if multispeaker and len(split) > 2 else DEFAULT_SPEAKER_NAME file_id, text = split[0], split[-1] text_dict[file_id] = text.replace('\n', '') speaker_dict[file_id] = speaker_name return text_dict, speaker_dict def read_vctk_format(path: Path, n_workers: int, extension='.txt') -> Tuple[dict, dict]: files = get_files(path, extension=extension) text_dict = {} speaker_dict = {} pool = Pool(processes=n_workers) for i, (file, text) in tqdm.tqdm(enumerate(pool.imap_unordered(read_line, files), 1), total=len(files)): text_id = file.name.replace(extension, '') speaker_id = file.parent.stem text_dict[text_id] = text.replace('\n', '') speaker_dict[text_id] = speaker_id return text_dict, speaker_dict def read_pandas_format(path: Path) -> Tuple[dict, dict]: if not path.is_file(): raise ValueError(f'Could not find metafile: {path}, ' f'please make sure that you set the correct path and metafile name!') df = pd.read_csv(str(path), sep='\t', encoding='utf-8') text_dict = {} speaker_dict = {} for index, row in df.iterrows(): id = row['file_id'] text_dict[id] = row['text'] speaker_dict[id] = row['speaker_id'] return text_dict, speaker_dict def read_line(file: Path) -> Tuple[Path, str]: with open(str(file), encoding='utf-8') as f: line = f.readlines()[0] return file, line ================================================ FILE: utils/text/symbols.py ================================================ """ from https://github.com/keithito/tacotron """ ''' Defines the set of symbols used in text input to the model. The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' _pad = '_' _punctuation = '!\'(),.:;? ' _special = '-' # Phonemes _vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ' _non_pulmonic_consonants = 'ʘɓǀɗǃʄǂɠǁʛ' _pulmonic_consonants = 'pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ' _suprasegmentals = 'ˈˌːˑ' _other_symbols = 'ʍwɥʜʢʡɕʑɺɧ' _diacrilics = 'ɚ˞ɫ' _extra_phons = ['g', 'ɝ', '̃', '̍', '̥', '̩', '̯', '͡'] # some extra symbols that I found in from wiktionary ipa annotations phonemes = list( _pad + _punctuation + _special + _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics) + _extra_phons phonemes_set = set(phonemes) silent_phonemes_indices = [i for i, p in enumerate(phonemes) if p in _pad + _punctuation] ================================================ FILE: utils/text/tokenizer.py ================================================ from typing import List from utils.text.symbols import phonemes class Tokenizer: def __init__(self) -> None: self.symbol_to_id = {s: i for i, s in enumerate(phonemes)} self.id_to_symbol = {i: s for i, s in enumerate(phonemes)} def __call__(self, text: str) -> List[int]: return [self.symbol_to_id[t] for t in text if t in self.symbol_to_id] def decode(self, sequence: List[int]) -> str: text = [self.id_to_symbol[s] for s in sequence if s in self.id_to_symbol] return ''.join(text) ================================================ FILE: utils/tools.py ================================================ import torch from utils.text.cleaners import Cleaner from utils.text.tokenizer import Tokenizer def prepare_text(text: str)->str: if not ((text[-1] == '.') or (text[-1] == '?') or (text[-1] == '!')): text = text + '.' cleaner = Cleaner('english_cleaners', True, 'en-us') tokenizer = Tokenizer() return torch.as_tensor(tokenizer(cleaner(text)), dtype=torch.long, device='cpu').unsqueeze(0)