[
  {
    "path": ".gitattributes",
    "content": "*.pt filter=lfs diff=lfs merge=lfs -text\n"
  },
  {
    "path": ".gitignore",
    "content": "**/__pycache__/**\n*.wav\naudio/*"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2022 Timmy Knight\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# GLaDOS Text-to-speech (TTS) Voice Generator\nNeural network based TTS Engine.\n\nIf you want to just play around with the TTS, this works as stand-alone.\n```console\npython3 glados-tts/glados.py\n```\n\nThe TTS Engine can also be run remotely on a machine more powerful than the Pi to handle TTS processing in-house (execute from the glados-tts directory):\n```console\npython3 engine.py\n```\n\nThe default port is 8124.\nBe sure to update the `TTS_ENGINE_API` variable in settings.env in your main Glados-voice-assistant directory:\n```\nTTS_ENGINE_API\t\t\t= http://192.168.1.3:8124/synthesize/\n```\n\n\n## Training (New Model)\nThe Tacotron and ForwardTacotron models were trained as multispeaker models on two datasets separated into three speakers: first on LJSpeech (13,100 lines), and then on the heavily modified version of the Ellen McClain dataset, separated into Portal 1 and Portal 2 voices (with punctuation and corrections added manually). The lines from the end of Portal 1 after the cores get knocked off were counted as Portal 2 lines.\n\n\n## Training (Old Model)\nThe initial, regular Tacotron model was trained first on LJSpeech, and then on a heavily modified version of the Ellen McClain dataset (all non-Portal 2 voice lines removed, punctuation added).\n\n* The Forward Tacotron model was only trained on about 600 voice lines.\n* The HiFiGAN model was generated through transfer learning from the sample.\n* All models have been optimized and quantized.\n\n\n\n## Installation Instructions\nIf you want to install the TTS Engine on your machine, please follow the steps\nbelow.\n\n1. Download the model files from [`Google Drive`](https://drive.google.com/file/d/1TRJtctjETgVVD5p7frSVPmgw8z8FFtjD/view?usp=sharing) and unzip them into the repo folder\n2. Install the required Python packages, e.g., by running `pip install -r\n   requirements.txt`\n"
  },
  {
    "path": "engine.py",
    "content": "import sys\nimport os\nsys.path.insert(0, os.getcwd()+'/glados_tts')\n\nimport torch\nfrom utils.tools import prepare_text\nfrom scipy.io.wavfile import write\nimport time\n\nfrom glados import tts_runner\n\t\t\nprint(\"\\033[1;94mINFO:\\033[;97m Initializing TTS Engine...\")\n\nglados = tts_runner(False, True)\n\ndef glados_tts(text, key=False, alpha=1.0):\n\n\tif(key):\n\t\toutput_file = ('audio/GLaDOS-tts-temp-output-'+key+'.wav')\n\telse:\n\t\toutput_file = ('audio/GLaDOS-tts-temp-output.wav')\n\n\tglados.run_tts(text, alpha).export(output_file, format = \"wav\")\n\treturn True\n\n\n# If the script is run directly, assume remote engine\nif __name__ == \"__main__\":\n\t\n\t# Remote Engine Veritables\n\tPORT = 8124\n\tCACHE = True\n\n\tfrom flask import Flask, request, send_file\n\timport urllib.parse\n\timport shutil\n\t\n\tprint(\"\\033[1;94mINFO:\\033[;97m Initializing TTS Server...\")\n\t\n\tapp = Flask(__name__)\n\n\t@app.route('/synthesize/', defaults={'text': ''})\n\t@app.route('/synthesize/<path:text>')\n\tdef synthesize(text):\n\t\tif(text == ''): return 'No input'\n\t\t\n\t\tline = urllib.parse.unquote(request.url[request.url.find('synthesize/')+11:])\n\t\tfilename = \"GLaDOS-tts-\"+line.replace(\" \", \"-\")\n\t\tfilename = filename.replace(\"!\", \"\")\n\t\tfilename = filename.replace(\"°c\", \"degrees celcius\")\n\t\tfilename = filename.replace(\",\", \"\")+\".wav\"\n\t\tfile = os.getcwd()+'/audio/'+filename\n\t\t\n\t\t# Check for Local Cache\n\t\tif(os.path.isfile(file)):\n\t\t\n\t\t\t# Update access time. 
This will allow for routine cleanups\n\t\t\tos.utime(file, None)\n\t\t\tprint(\"\\033[1;94mINFO:\\033[;97m The audio sample sent from cache.\")\n\t\t\treturn send_file(file)\n\t\t\t\n\t\t# Generate New Sample\n\t\tkey = str(time.time())[7:]\n\t\tif(glados_tts(line, key)):\n\t\t\ttempfile = os.getcwd()+'/audio/GLaDOS-tts-temp-output-'+key+'.wav'\n\t\t\t\t\t\t\n\t\t\t# If the line isn't too long, store in cache\n\t\t\tif(len(line) < 200 and CACHE):\n\t\t\t\tshutil.move(tempfile, file)\n\t\t\telse:\n\t\t\t\treturn send_file(tempfile)\n\t\t\t\tos.remove(tempfile)\n\t\t\t\t\n\t\t\treturn send_file(file)\n\t\t\t\t\n\t\telse:\n\t\t\treturn 'TTS Engine Failed'\n\t\t\t\n\tcli = sys.modules['flask.cli']\n\tcli.show_server_banner = lambda *x: None\n\tapp.run(host=\"0.0.0.0\", port=PORT)\n"
  },
  {
    "path": "glados.py",
    "content": "import torch\nimport numpy as np\nfrom utils.tools import prepare_text\nfrom scipy.io.wavfile import write\nimport time\nimport tempfile\nimport subprocess\nfrom pydub import AudioSegment\nfrom pydub.playback import play\nfrom nltk import download\nfrom nltk.tokenize import sent_tokenize\nfrom sys import modules as mod\ntry:\n    import winsound\nexcept ImportError:\n    from subprocess import call\nprint(\"Initializing TTS Engine...\")\n\nkwargs = {\n    'stdout':subprocess.PIPE,\n    'stderr':subprocess.PIPE,\n    'stdin':subprocess.PIPE\n}\n\nclass tts_runner:\n    def __init__(self, use_p1: bool=False, log: bool=False):\n        self.log = log\n        if use_p1:\n            self.emb = torch.load('models/emb/glados_p1.pt')\n        else:\n            self.emb = torch.load('models/emb/glados_p2.pt')\n        # Select the device\n        if torch.cuda.is_available():\n            self.device = 'cuda'\n        elif torch.is_vulkan_available():\n            self.device = 'vulkan'\n        else:\n            self.device = 'cpu'\n\n        # Load models\n        self.glados = torch.jit.load('models/glados-new.pt')\n        self.vocoder = torch.jit.load('models/vocoder-gpu.pt', map_location=self.device)\n        for i in range(2):\n            init = self.glados.generate_jit(prepare_text(str(i)), self.emb, 1.0)\n            init_mel = init['mel_post'].to(self.device)\n            init_vo = self.vocoder(init_mel)\n\n    def run_tts(self, text, alpha: float=1.0) -> AudioSegment:\n        x = prepare_text(text)\n\n        with torch.no_grad():\n\n            # Generate generic TTS-output\n            old_time = time.time()\n            tts_output = self.glados.generate_jit(x, self.emb, alpha)\n            if self.log:\n                print(\"Forward Tacotron took \" + str((time.time() - old_time) * 1000) + \"ms\")\n\n            # Use HiFiGAN as vocoder to make output sound like GLaDOS\n            old_time = time.time()\n            mel = 
tts_output['mel_post'].to(self.device)\n            audio = self.vocoder(mel)\n            if self.log:\n                print(\"HiFiGAN took \" + str((time.time() - old_time) * 1000) + \"ms\")\n\n            # Normalize audio to fit in wav-file\n            audio = audio.squeeze()\n            audio = audio * 32768.0\n            audio = audio.cpu().numpy().astype('int16')\n            output_file = tempfile.TemporaryFile()\n            write(output_file, 22050, audio)\n            sound = AudioSegment.from_wav(output_file)\n            output_file.close()\n            return sound\n\n    def speak_one_line(self, audio, name: str):\n        audio.export(name, format = \"wav\")\n        if 'winsound' in mod:\n            winsound.PlaySound(name, winsound.SND_FILENAME | winsound.SND_ASYNC)\n        else:\n            try:\n                subprocess.Popen([\"play\", name], **kwargs)\n            except FileNotFoundError:\n                try:\n                    subprocess.Popen([\"aplay\", name], **kwargs)\n                except FileNotFoundError:\n                    subprocess.Popen([\"pw-play\", name], **kwargs)\n\n\n    def speak(self, text, alpha: float=1.0, save: bool=False, delay: float=0.1):\n        download('punkt',quiet=self.log)\n        sentences = sent_tokenize(text)\n        audio = self.run_tts(sentences[0])\n        pause = AudioSegment.silent(duration=delay)\n        old_line = AudioSegment.silent(duration=1.0) + audio\n        self.speak_one_line(old_line, \"old_line.wav\")\n        old_time = time.time()\n        old_dur = old_line.duration_seconds\n        new_dur = old_dur\n        if len(sentences) > 1:\n            for idx in range(1, len(sentences)):\n                if idx % 2 == 1:\n                    new_line = self.run_tts(sentences[idx])\n                    audio = audio + pause + new_line\n                    new_dur = new_line.duration_seconds\n                else:\n                    old_line = self.run_tts(sentences[idx])\n   
                 audio = audio + pause + old_line\n                    new_dur = old_line.duration_seconds\n                time_left = old_dur - time.time() + old_time\n                if time_left <= 0 and self.log:\n                    print(\"Processing is slower than realtime!\")\n                else:\n                    time.sleep(time_left + delay)\n                if idx % 2 == 1:\n                    self.speak_one_line(new_line, \"new_line.wav\")\n                else:\n                    self.speak_one_line(old_line, \"old_line.wav\")\n                old_time = time.time()\n                old_dur = new_dur\n        else:\n            time.sleep(old_dur + 0.1)\n\n        audio.export(\"output.wav\", format = \"wav\")\n        time_left = old_dur - time.time() + old_time\n        if time_left >= 0:\n            time.sleep(time_left + delay)\n\nif __name__ == \"__main__\":\n    glados = tts_runner(False, True)\n    while True:\n        text = input(\"Input: \")\n        if len(text) > 0:\n            glados.speak(text, True)\n"
  },
  {
    "path": "models/emb/glados_p1.pt",
    "content": "version https://git-lfs.github.com/spec/v1\noid sha256:de04091401bd98a69c45dadcaddda515232edf341ef39b9892e12bbad0e5be00\nsize 1777\n"
  },
  {
    "path": "models/emb/glados_p2.pt",
    "content": "version https://git-lfs.github.com/spec/v1\noid sha256:ec519ed9ae6612dd6606d59f856d0c471b034752df770365fa0c0e50e1f05fb1\nsize 1777\n"
  },
  {
    "path": "models/en_us_cmudict_ipa_forward.pt",
    "content": "version https://git-lfs.github.com/spec/v1\noid sha256:cadce3d77597b55e772799cb46994ab29a460f1a62a87207b52f3cdb29894e02\nsize 65637046\n"
  },
  {
    "path": "models/glados-new.pt",
    "content": "version https://git-lfs.github.com/spec/v1\noid sha256:e75a18e03517bd204130dfa7bd37e200be342219af1e1b537aa818401c041ed4\nsize 116160936\n"
  },
  {
    "path": "models/glados.pt",
    "content": "version https://git-lfs.github.com/spec/v1\noid sha256:e857f8eec966d76219c9bb49bd271b91ab9f115c7a033cb32082743ec9e0b97c\nsize 75644711\n"
  },
  {
    "path": "models/vocoder-cpu-hq.pt",
    "content": "version https://git-lfs.github.com/spec/v1\noid sha256:68145c03288fe13f634d24d2f47238c39462cb10acb0b2ceaaa64317d74e392e\nsize 55758413\n"
  },
  {
    "path": "models/vocoder-cpu-lq.pt",
    "content": "version https://git-lfs.github.com/spec/v1\noid sha256:97b0c527cfd3485359afda80b6ca1b3cf22343b4e289deccbe98bcb8e438bd71\nsize 5867188\n"
  },
  {
    "path": "models/vocoder-gpu.pt",
    "content": "version https://git-lfs.github.com/spec/v1\noid sha256:16aedb72be562822126bf155b43612ceb2836dd5e8ed5b241617aa322f30c4ce\nsize 55753209\n"
  },
  {
    "path": "requirements.txt",
    "content": "deep_phonemizer~=0.0.19\nFlask~=3.0.0\ninflect~=6.0.4\nnltk~=3.8.1\nnumpy~=1.23.5\npandas~=2.0.0\npydub~=0.25.1\nscipy~=1.10.1\ntorch~=2.0.0\ntqdm~=4.65.0\nUnidecode~=1.3.6\n"
  },
  {
    "path": "utils/__init__.py",
    "content": ""
  },
  {
    "path": "utils/text/LICENSE",
    "content": "Copyright (c) 2017 Keith Ito\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in\nall copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\nTHE SOFTWARE.\n"
  },
  {
    "path": "utils/text/__init__.py",
    "content": ""
  },
  {
    "path": "utils/text/cleaners.py",
    "content": "import re\nfrom typing import Dict, Any\n\nfrom unidecode import unidecode\n\nfrom utils.text.numbers import normalize_numbers\nfrom utils.text.symbols import phonemes_set\n\nfrom dp.phonemizer import Phonemizer\nimport torch\n\n# Regular expression matching whitespace:\n_whitespace_re = re.compile(r'\\s+')\n\n# List of (regular expression, replacement) pairs for abbreviations:\n_abbreviations = [(re.compile('\\\\b%s\\\\.' % x[0], re.IGNORECASE), x[1]) for x in [\n    ('mrs', 'misess'),\n    ('mr', 'mister'),\n    ('dr', 'doctor'),\n    ('st', 'saint'),\n    ('co', 'company'),\n    ('jr', 'junior'),\n    ('maj', 'major'),\n    ('gen', 'general'),\n    ('drs', 'doctors'),\n    ('rev', 'reverend'),\n    ('lt', 'lieutenant'),\n    ('hon', 'honorable'),\n    ('sgt', 'sergeant'),\n    ('capt', 'captain'),\n    ('esq', 'esquire'),\n    ('ltd', 'limited'),\n    ('col', 'colonel'),\n    ('ft', 'fort')\n]]\n\n\ndef expand_abbreviations(text):\n    for regex, replacement in _abbreviations:\n        text = re.sub(regex, replacement, text)\n    return text\n\n\ndef collapse_whitespace(text):\n    return re.sub(_whitespace_re, ' ', text)\n\n\ndef no_cleaners(text):\n    return text\n\n\ndef english_cleaners(text):\n    text = unidecode(text)\n    text = normalize_numbers(text)\n    text = expand_abbreviations(text)\n    return text\n\n\nclass Cleaner:\n\n    def __init__(self,\n                 cleaner_name: str,\n                 use_phonemes: bool,\n                 lang: str) -> None:\n        if cleaner_name == 'english_cleaners':\n            self.clean_func = english_cleaners\n        elif cleaner_name == 'no_cleaners':\n            self.clean_func = no_cleaners\n        else:\n            raise ValueError(f'Cleaner not supported: {cleaner_name}! 
'\n                             f'Currently supported: [\\'english_cleaners\\', \\'no_cleaners\\']')\n        self.use_phonemes = use_phonemes\n        self.lang = lang\n        if use_phonemes:\n            self.phonemize = Phonemizer.from_checkpoint('models/en_us_cmudict_ipa_forward.pt')\n\n    def __call__(self, text: str) -> str:\n        text = self.clean_func(text)\n        if self.use_phonemes:\n            text = self.phonemize(text, lang='en_us')\n            text = ''.join([p for p in text if p in phonemes_set])\n        text = collapse_whitespace(text)\n        text = text.strip()\n        return text\n\n    @classmethod\n    def from_config(cls, config: Dict[str, Any]) -> 'Cleaner':\n        return Cleaner(\n            cleaner_name=config['preprocessing']['cleaner_name'],\n            use_phonemes=config['preprocessing']['use_phonemes'],\n            lang=config['preprocessing']['language']\n        )\n"
  },
  {
    "path": "utils/text/numbers.py",
    "content": "\"\"\" from https://github.com/keithito/tacotron \"\"\"\n\nimport inflect\nimport re\n\n\n_inflect = inflect.engine()\n_comma_number_re = re.compile(r'([0-9][0-9\\,]+[0-9])')\n_decimal_number_re = re.compile(r'([0-9]+\\.[0-9]+)')\n_pounds_re = re.compile(r'£([0-9\\,]*[0-9]+)')\n_dollars_re = re.compile(r'\\$([0-9\\.\\,]*[0-9]+)')\n_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')\n_number_re = re.compile(r'[0-9]+')\n\n\ndef _remove_commas(m):\n  return m.group(1).replace(',', '')\n\n\ndef _expand_decimal_point(m):\n  return m.group(1).replace('.', ' point ')\n\n\ndef _expand_dollars(m):\n  match = m.group(1)\n  parts = match.split('.')\n  if len(parts) > 2:\n    return match + ' dollars'  # Unexpected format\n  dollars = int(parts[0]) if parts[0] else 0\n  cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0\n  if dollars and cents:\n    dollar_unit = 'dollar' if dollars == 1 else 'dollars'\n    cent_unit = 'cent' if cents == 1 else 'cents'\n    return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)\n  elif dollars:\n    dollar_unit = 'dollar' if dollars == 1 else 'dollars'\n    return '%s %s' % (dollars, dollar_unit)\n  elif cents:\n    cent_unit = 'cent' if cents == 1 else 'cents'\n    return '%s %s' % (cents, cent_unit)\n  else:\n    return 'zero dollars'\n\n\ndef _expand_ordinal(m):\n  return _inflect.number_to_words(m.group(0))\n\n\ndef _expand_number(m):\n  num = int(m.group(0))\n  if num > 1000 and num < 3000:\n    if num == 2000:\n      return 'two thousand'\n    elif num > 2000 and num < 2010:\n      return 'two thousand ' + _inflect.number_to_words(num % 100)\n    elif num % 100 == 0:\n      return _inflect.number_to_words(num // 100) + ' hundred'\n    else:\n      return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')\n  else:\n    return _inflect.number_to_words(num, andword='')\n\n\ndef normalize_numbers(text):\n  text = re.sub(_comma_number_re, _remove_commas, text)\n  text = 
re.sub(_pounds_re, r'\\1 pounds', text)\n  text = re.sub(_dollars_re, _expand_dollars, text)\n  text = re.sub(_decimal_number_re, _expand_decimal_point, text)\n  text = re.sub(_ordinal_re, _expand_ordinal, text)\n  text = re.sub(_number_re, _expand_number, text)\n  return text\n"
  },
  {
    "path": "utils/text/recipes.py",
    "content": "from multiprocessing import Pool\nfrom pathlib import Path\nfrom typing import Tuple\n\nimport pandas as pd\nimport tqdm\n\nfrom utils.files import get_files\n\nDEFAULT_SPEAKER_NAME = 'default_speaker'\n\n\ndef read_metadata(path: Path,\n                  metafile: str,\n                  format: str,\n                  n_workers=1) -> Tuple[dict, dict]:\n    if format == 'ljspeech':\n        return read_ljspeech_format(path/metafile, multispeaker=False)\n    elif format == 'ljspeech_multi':\n        return read_ljspeech_format(path/metafile, multispeaker=True)\n    elif format == 'vctk':\n        return read_vctk_format(path, n_workers=n_workers)\n    elif format == 'pandas':\n        return read_pandas_format(path/metafile)\n    else:\n        raise ValueError(f'Metafile has unexpected ending: {path.stem}, expected [.csv, .tsv]\"')\n\n\ndef read_ljspeech_format(path: Path, multispeaker: bool = False) -> Tuple[dict, dict]:\n    if not path.is_file():\n        raise ValueError(f'Could not find metafile: {path}, '\n                         f'please make sure that you set the correct path and metafile name!')\n    text_dict = {}\n    speaker_dict = {}\n    with open(str(path), encoding='utf-8') as f:\n        for line in f:\n            split = line.split('|')\n            speaker_name = split[-2] if multispeaker and len(split) > 2 else DEFAULT_SPEAKER_NAME\n            file_id, text = split[0], split[-1]\n            text_dict[file_id] = text.replace('\\n', '')\n            speaker_dict[file_id] = speaker_name\n    return text_dict, speaker_dict\n\n\ndef read_vctk_format(path: Path,\n                     n_workers: int,\n                     extension='.txt') -> Tuple[dict, dict]:\n    files = get_files(path, extension=extension)\n    text_dict = {}\n    speaker_dict = {}\n    pool = Pool(processes=n_workers)\n    for i, (file, text) in tqdm.tqdm(enumerate(pool.imap_unordered(read_line, files), 1), total=len(files)):\n        text_id = 
file.name.replace(extension, '')\n        speaker_id = file.parent.stem\n        text_dict[text_id] = text.replace('\\n', '')\n        speaker_dict[text_id] = speaker_id\n    return text_dict, speaker_dict\n\n\ndef read_pandas_format(path: Path) -> Tuple[dict, dict]:\n    if not path.is_file():\n        raise ValueError(f'Could not find metafile: {path}, '\n                         f'please make sure that you set the correct path and metafile name!')\n    df = pd.read_csv(str(path), sep='\\t', encoding='utf-8')\n    text_dict = {}\n    speaker_dict = {}\n    for index, row in df.iterrows():\n        id = row['file_id']\n        text_dict[id] = row['text']\n        speaker_dict[id] = row['speaker_id']\n    return text_dict, speaker_dict\n\n\ndef read_line(file: Path) -> Tuple[Path, str]:\n    with open(str(file), encoding='utf-8') as f:\n        line = f.readlines()[0]\n    return file, line\n"
  },
  {
    "path": "utils/text/symbols.py",
    "content": "\"\"\" from https://github.com/keithito/tacotron \"\"\"\n\n'''\nDefines the set of symbols used in text input to the model.\n\nThe default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''\n\n_pad = '_'\n_punctuation = '!\\'(),.:;? '\n_special = '-'\n\n# Phonemes\n_vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ'\n_non_pulmonic_consonants = 'ʘɓǀɗǃʄǂɠǁʛ'\n_pulmonic_consonants = 'pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ'\n_suprasegmentals = 'ˈˌːˑ'\n_other_symbols = 'ʍwɥʜʢʡɕʑɺɧ'\n_diacrilics = 'ɚ˞ɫ'\n_extra_phons = ['g', 'ɝ', '̃', '̍', '̥', '̩', '̯', '͡']  # some extra symbols that I found in from wiktionary ipa annotations\n\nphonemes = list(\n   _pad + _punctuation + _special + _vowels + _non_pulmonic_consonants\n   + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics) + _extra_phons\n\nphonemes_set = set(phonemes)\nsilent_phonemes_indices = [i for i, p in enumerate(phonemes) if p in _pad + _punctuation]"
  },
  {
    "path": "utils/text/tokenizer.py",
    "content": "from typing import List\n\nfrom utils.text.symbols import phonemes\n\n\nclass Tokenizer:\n\n    def __init__(self) -> None:\n        self.symbol_to_id = {s: i for i, s in enumerate(phonemes)}\n        self.id_to_symbol = {i: s for i, s in enumerate(phonemes)}\n\n    def __call__(self, text: str) -> List[int]:\n        return [self.symbol_to_id[t] for t in text if t in self.symbol_to_id]\n\n    def decode(self, sequence: List[int]) -> str:\n        text = [self.id_to_symbol[s] for s in sequence if s in self.id_to_symbol]\n        return ''.join(text)"
  },
  {
    "path": "utils/tools.py",
    "content": "import torch\n\nfrom utils.text.cleaners import Cleaner\nfrom utils.text.tokenizer import Tokenizer\n\ndef prepare_text(text: str)->str:\n    if not ((text[-1] == '.') or (text[-1] == '?') or (text[-1] == '!')):\n        text = text + '.'\n    cleaner = Cleaner('english_cleaners', True, 'en-us')\n    tokenizer = Tokenizer()\n    return torch.as_tensor(tokenizer(cleaner(text)), dtype=torch.long, device='cpu').unsqueeze(0)\n"
  }
]