[
  {
    "path": ".gitignore",
    "content": "egs/lj/exp\negs/lj/LJSpeech1.1\n*.log\n*.pyc\n__pycache__/\n*/__pycache__/\n*.zip\n*/*.zip\nsource/\nmake.bat\nMakefile\n*.swp\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. 
For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. 
Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. (Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright 2022 Tencent\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "README.md",
    "content": "# Bilateral Denoising Diffusion Models (BDDMs)\n\n[![GitHub Stars](https://img.shields.io/github/stars/tencent-ailab/bddm?style=social)](https://github.com/tencent-ailab/bddm)\n![visitors](https://visitor-badge.glitch.me/badge?page_id=tencent-ailab/bddm)\n[![arXiv](https://img.shields.io/badge/arXiv-Paper-green.svg)](https://arxiv.org/abs/2203.13508)\n[![demo](https://img.shields.io/badge/demo-Samples-orange.svg)](https://bilateral-denoising-diffusion-model.github.io)\n\nThis is the official PyTorch implementation of the following paper:\n\n> **BDDM: BILATERAL DENOISING DIFFUSION MODELS FOR FAST AND HIGH-QUALITY SPEECH SYNTHESIS** \\\n> Max W. Y. Lam, Jun Wang, Dan Su, Dong Yu\n\n> **Abstract**: *Diffusion probabilistic models (DPMs) and their extensions have emerged as competitive generative models yet confront challenges of efficient sampling. We propose a new bilateral denoising diffusion model (BDDM) that parameterizes both the forward and reverse processes with a schedule network and a score network, which can train with a novel bilateral modeling objective. We show that the new surrogate objective can achieve a lower bound of the log marginal likelihood tighter than a conventional surrogate. We also find that BDDM allows inheriting pre-trained score network parameters from any DPMs and consequently enables speedy and stable learning of the schedule network and optimization of a noise schedule for sampling. Our experiments demonstrate that BDDMs can generate high-fidelity audio samples with as few as three sampling steps. Moreover, compared to other state-of-the-art diffusion-based neural vocoders, BDDMs produce comparable or higher quality samples indistinguishable from human speech, notably with only seven sampling steps (143x faster than WaveGrad and 28.6x faster than DiffWave).*\n\n> **Paper**: Published at ICLR 2022 on [OpenReview](https://openreview.net/pdf?id=L7wzpQttNO)\n\n![BDDM](bddm.png)\n\nThis implementation supports model training and audio generation, and also provides the pre-trained models for the benchmark [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) and [VCTK](https://datashare.ed.ac.uk/handle/10283/2651) dataset.\n\nVisit our [demo page](https://bilateral-denoising-diffusion-model.github.io) for audio samples.\n\n### Updates:\n- **May 20, 2021:** Released our follow-up work [FastDiff](https://github.com/Rongjiehuang/FastDiff) on GitHub, where we futher optimized the speed-and-quality trade-off.\n- **May 10, 2021:** Added the experiment configurations and model checkpoints for the VCTK dataset.\n- **May 9, 2021:** Added the searched noise schedules for the LJSpeech and VCTK datasets.\n- **March 20, 2021:** Released the PyTorch implementation of BDDM with pre-trained models for the LJSpeech dataset.\n\n### Recipes:\n\n- (Option 1) To train the BDDM scheduling network yourself, you can download the pre-trained score network from [philsyn/DiffWave-Vocoder](https://github.com/philsyn/DiffWave-Vocoder/blob/master/exp/ch128_T200_betaT0.02/logs/checkpoint/1000000.pkl) (provided at ```egs/lj/DiffWave.pkl```), and follow the training steps below. **(Start from Step I.)**\n- (Option 2) To search for noise schedules using BDDM, we provide a pre-trained BDDM for LJSpeech at ```egs/lj/DiffWave-GALR.pkl``` and for VCTK at ```egs/vctk/DiffWave-GALR.pkl``` . 
**(Start from Step III.)**\n- (Option 3) To directly generate samples using BDDM, we provide the searched schedules for LJSpeech at ```egs/lj/noise_schedules``` and for VCTK at ```egs/vctk/noise_schedules``` (check ```conf.yml``` for the respective configurations). **(Start from Step IV.)**\n\n\n## Getting Started\n\nWe provide an example of how you can generate high-fidelity samples using BDDMs.\n\nTo try BDDM on your own dataset, simply clone this repo to your local machine (equipped with an NVIDIA GPU + CUDA cuDNN) and follow the instructions below.\n\n### Dependencies\n\n- [pytorch](https://github.com/pytorch/pytorch)>=1.7.1\n- [librosa](https://github.com/librosa/librosa)>=0.7.1\n- [pystoi](https://github.com/mpariente/pystoi)==0.3.3\n- [pypesq](https://github.com/youngjamespark/python-pypesq)==0.2.0\n\n### Step I. Data Preparation and Configuration ###\n\nDownload the [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) dataset.\n\nFor training, we first need to set up a file **conf.yml** that configures the data loader, the score and schedule networks, the training procedure, and the noise scheduling and sampling parameters.\n\n**Note:** Appropriately modify the paths in ```\"train_data_dir\"``` and ```\"valid_data_dir\"``` for training, and the path in ```\"gen_data_dir\"``` for sampling. Each of these paths should point to a directory that stores the waveform audios (in **.wav**) or the Mel-spectrogram files (in **.mel**).\n\n### Step II. Training a Schedule Network ###\n\nSupposing that a well-trained score network (theta) is stored at ```$theta_path```, we start by setting ```\"load\": $theta_path``` in **conf.yml**.\n\nAfter modifying the relevant hyperparameters for a schedule network (especially ```\"tau\"```), we can train the schedule network (f_phi in the paper) using:\n\n```bash\n# Training on device 0 (supports multi-GPU training)\nsh train.sh 0 conf.yml\n```\n\n**Note**: In practice, we found that **10K** training steps are enough to obtain a promising schedule network. This normally takes no more than half an hour of training on a single GPU.\n\n### Step III. Searching for Noise Schedules ###\n\nGiven a well-trained BDDM (theta, phi), we can now run the noise scheduling algorithm to find the best schedule (optimizing the trade-off between quality and speed).\n\nFirst, we set ```\"load\"``` in ```conf.yml``` to the path of the trained BDDM.\n\nAfter setting the maximum number of sampling steps in scheduling (```\"N\"```), we run:\n\n```bash\n# Scheduling on device 0 (only supports single-GPU scheduling)\nsh schedule.sh 0 conf.yml\n```\n\n
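For reference, below is a minimal sketch of the ```conf.yml``` fields referenced in Steps I-IV. The values shown are illustrative placeholders only, not the tuned settings; please consult the provided ```egs/lj/conf.yml``` and ```egs/vctk/conf.yml``` for the actual configurations.\n\n```yaml\n# Illustrative sketch only -- see egs/*/conf.yml for the real configurations\ntrain_data_dir: LJSpeech-1.1/train_wavs   # .wav files for training (Step I)\nvalid_data_dir: LJSpeech-1.1/valid_wavs   # .wav files for validation (Step I)\ngen_data_dir: LJSpeech-1.1/test_wavs      # .wav (evaluation) or .mel (generation) files (Step IV)\nload: egs/lj/DiffWave.pkl                 # score network (Step II) or trained BDDM (Step III)\ntau: 200                                  # schedule network hyperparameter (Step II); placeholder value\nN: 7                                      # maximum number of sampling steps (Step III)\n```\n\n### Step IV. 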
Evaluation or Generation ###\n\nFor evaluation, we set ```\"gen_data_dir\"``` in ```conf.yml``` to the path of a directory that stores the test set of audios (in ```.wav```).\n\nFor generation, we set ```\"gen_data_dir\"``` in ```conf.yml``` to the path of a directory that stores the Mel-spectrograms (by default in ```.mel```, generated by [TacotronSTFT](https://github.com/NVIDIA/tacotron2/blob/master/layers.py) or by our dataset loader ```bddm/loader/dataset.py```).\n\nThen, we run:\n\n```bash\n# Generation/evaluation on device 0 (only supports single-GPU generation)\nsh generate.sh 0 conf.yml\n```\n\n## Acknowledgements\nThis implementation uses parts of the code from the following GitHub repos:\\\n[Tacotron2](https://github.com/NVIDIA/tacotron2)\\\n[DiffWave-Vocoder](https://github.com/philsyn/DiffWave-Vocoder)\\\nas described in our code.\n\n## Citations ##\n\n```\n@inproceedings{lam2022bddm,\n  title={BDDM: Bilateral Denoising Diffusion Models for Fast and High-Quality Speech Synthesis},\n  author={Lam, Max WY and Wang, Jun and Su, Dan and Yu, Dong},\n  booktitle={International Conference on Learning Representations},\n  year={2022}\n}\n```\n\n## License ##\n\nCopyright 2022 Tencent\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\nhttp://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License.\n\n## Disclaimer ##\n\nThis is not an officially supported Tencent product.\n"
  },
  {
    "path": "bddm/loader/__init__.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Loader\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021Tencent. All Rights Reserved\n#\n########################################################################\n"
  },
  {
    "path": "bddm/loader/dataset.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Dataset and DataLoader for Neural Vocoding\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021Tencent. All Rights Reserved\n#\n########################################################################\n\n\nimport os\nfrom pathlib import Path\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nimport torch.utils.data as data\nfrom scipy.io.wavfile import read\n\nfrom .stft import TacotronSTFT\n\n\nMAX_WAV_VALUE = 32768\n\n\nclass SpectrogramDataset(data.Dataset):\n\n    def __init__(self, data_dir, n_gpus, is_sampling, sampling_rate,\n            seg_len, fil_len, hop_len, win_len, mel_fmin, mel_fmax):\n        \"\"\"\n        A torch.data.Dataset class that loads the audio files for training\n            or loads the Mel-spectrogram files for sampling.\n\n        Parameters:\n            data_dir (str):      the path to the directory storing .wav/.mel files\n            n_gpus (int):        the number of GPUs for training\n            is_sampling (bool):  whether the dataset is used for sampling or not\n            sampling_rate (int): the sampling rate of audios\n            seg_len (int):       the segment length (number of samples) for training\n            fil_len (int):       the filter length for computing STFT\n            hop_len (int):       the hop length for computing STFT\n            win_len (int):       the window length for computing STFT\n            mel_fmin (int):      the minimum frequency for computing STFT\n            mel_fmax (int):      the maximum frequency for computing STFT\n        \"\"\"\n        self.n_gpus = n_gpus\n        self.is_sampling = is_sampling\n        self.seg_len = seg_len\n        self.hop_len = hop_len\n        self.n_mels = self.seg_len // self.hop_len\n        self.sampling_rate = sampling_rate\n\n        if is_sampling:\n            # Find all Mel-spectrogram files in the given data directory\n            self.mel_files = self.find_all_mels_in_dir(data_dir)\n            if len(self.mel_files) == 0:\n                # Find audios when no pre-computed mel spectrograms can be found\n                self.audio_files = self.find_all_wavs_in_dir(data_dir)\n                # Note that no mel file is loaded for generation\n                self.mel_files = None\n            else:\n                # Note that no audio file is loaded for generation\n                self.audio_files = None\n        else:\n            # Find all audio files in the given data directory\n            self.audio_files = self.find_all_wavs_in_dir(data_dir)\n\n        # Use the standard STFT operation defined in Tacotron 2\n        self.stft = TacotronSTFT(filter_length=fil_len,\n                                 hop_length=hop_len,\n                                 win_length=win_len,\n                                 sampling_rate=sampling_rate,\n                                 mel_fmin=mel_fmin,\n                                 mel_fmax=mel_fmax)\n        self.reset()\n\n    def reset(self):\n        \"\"\"\n        Reset the loader by shuffling the file list\n        \"\"\"\n        if self.is_sampling and self.audio_files is None:\n            np.random.shuffle(self.mel_files)\n        else:\n            np.random.shuffle(self.audio_files)\n            self.n_mels = self.seg_len // self.hop_len\n            self.n_mels = int(self.n_mels * (1 + np.random.rand()))\n            # Make sure the 
number of samples is divisible by n_gpus\n            if len(self.audio_files) % self.n_gpus != 0:\n                remainder = len(self.audio_files) % self.n_gpus\n                self.audio_files = self.audio_files[:-remainder]\n\n    def find_all_wavs_in_dir(self, data_dir):\n        \"\"\"\n        Load all .wav files in data_dir\n\n        Parameters:\n            data_dir (str):   the path to the directory storing .wav files\n        Returns:\n            files_list (list): the list of wav file paths\n        \"\"\"\n        files = [f for f in Path(data_dir).glob('*.wav')]\n        if len(files) == 0:\n            files = [f for f in Path(data_dir).glob('*_wav.npy')]\n        return files\n\n    def find_all_mels_in_dir(self, data_dir):\n        \"\"\"\n        Load all .mel files in data_dir\n\n        Parameters:\n            data_dir (str):   the path to the directory storing .mel files\n        Returns:\n            files_list (list): the list of mel file paths\n        \"\"\"\n        files = [f for f in Path(data_dir).glob('*.mel')]\n        return files\n\n    def crop_audio_and_mel(self, audio, mel_spec):\n        \"\"\"\n        Randomly crop audio and mel_spec into a fixed-length segment\n\n        Parameters:\n            audio (tensor):    the full audio\n            mel_spec (tensor): the full mel-spectrogram computed using TacotronSTFT\n        Returns:\n            audio (tensor):    the cropped audio\n            mel_spec (tensor): the cropped mel-spectrogram\n        \"\"\"\n        n_mels = self.n_mels\n        seg_len = n_mels * self.hop_len\n        if audio.size(-1) >= seg_len:\n            if mel_spec.size(-1) > n_mels:\n                max_mel_start = mel_spec.size(-1) - n_mels\n                mel_start = np.random.randint(0, max_mel_start)\n                mel_spec = mel_spec[..., mel_start:mel_start+n_mels]\n                audio_start = mel_start * self.hop_len\n                audio = audio[..., audio_start:audio_start+seg_len]\n            elif mel_spec.size(-1) == n_mels:\n                audio = audio[..., :seg_len]\n            else:\n                audio = audio[..., :seg_len]\n                mel_spec = F.pad(mel_spec, (0, n_mels - mel_spec.size(-1)), 'constant', 0)\n        else:\n            audio = F.pad(audio, (0, seg_len - audio.size(-1)), 'constant', 0)\n            mel_spec = F.pad(mel_spec, (0, n_mels - mel_spec.size(-1)), 'constant', 0)\n        return audio, mel_spec\n\n    def __getitem__(self, index):\n        \"\"\"\n        Get a pair of data (mel-spectrogram, audio) given an index\n\n        Parameters:\n            index (int):       the index for loading one sample\n        Returns:\n            audio_key (str):   the audio key\n            mel_spec (tensor): the mel-spectrogram computed using TacotronSTFT\n            audio (tensor):    the ground-truth audio\n        \"\"\"\n        if self.is_sampling and self.audio_files is None:\n            # Load Mel-spectrogram for sampling\n            mel_spec = torch.load(self.mel_files[index], map_location='cpu').float()\n            if mel_spec.ndim == 3:\n                mel_spec = mel_spec[0]\n            audio_key = str(self.mel_files[index])\n            # Try to find the paired source audio\n            wav_path = str(self.mel_files[index])[:-4]+'.wav'\n            if os.path.isfile(wav_path):\n                sampling_rate, audio = read(wav_path)\n                audio = torch.from_numpy(audio[None]).float() / MAX_WAV_VALUE\n                assert sampling_rate == 
self.sampling_rate\n                return audio_key, mel_spec, audio\n            else:\n                return audio_key, mel_spec, []\n\n        mel_spec = None\n        # Load the audio file to torch.FloatTensor\n        if str(self.audio_files[index])[-3:] == 'npy':\n            audio = np.load(self.audio_files[index])[0]\n            mel_spec = np.load(str(self.audio_files[index]).replace('wav', 'mel'))[0]\n            # Load into torch.FloatTensor\n            audio = torch.from_numpy(audio).float()\n            mel_spec = torch.from_numpy(mel_spec).float()\n        else:\n            sampling_rate, audio = read(self.audio_files[index])\n            # Make sure the sampling rate is correctly defined\n            assert sampling_rate == self.sampling_rate\n            # Normalize the audio into [-1, 1]\n            audio = audio / MAX_WAV_VALUE\n            # Load into torch.FloatTensor\n            audio = torch.from_numpy(audio).float()\n        # Compute Mel-spectrogram (shape = [T] -> [n_mel_channels, L])\n        if mel_spec is None:\n            audio = audio[None]\n            mel_spec = self.stft.mel_spectrogram(audio)\n            mel_spec = torch.squeeze(mel_spec, 0)\n\n        if not self.is_sampling:\n            audio, mel_spec = self.crop_audio_and_mel(audio, mel_spec)\n\n        if self.is_sampling:\n            # Save ground-truth Mel-spectrogram into .mel file\n            torch.save(mel_spec, str(self.audio_files[index])[:-4]+'.mel')\n            return str(self.audio_files[index]), mel_spec, audio\n\n        return mel_spec, audio\n\n    def __len__(self):\n        \"\"\"\n        Get the number of data\n\n        Returns:\n            data_len (int): the number of .wav/.mel files found in data_dir\n        \"\"\"\n        if self.audio_files is None:\n            return len(self.mel_files)\n        return len(self.audio_files)\n\n\ndef create_train_and_valid_dataloader(config):\n    \"\"\"\n    Create two torch.utils.data.DataLoader for training and validation\n\n    Parameters:\n        config (namespace):     BDDM Configuration\n    Returns:\n        tr_loader (DataLoader): the data loader for training\n        vl_loader (DataLoader): the data loader for validation\n    \"\"\"\n    n_gpus = 1\n    if 'WORLD_SIZE' in os.environ.keys():\n        n_gpus = int(os.environ['WORLD_SIZE'])\n    conf_keys = SpectrogramDataset.__init__.__code__.co_varnames\n    data_config = {k: v for k, v in vars(config).items() if k in conf_keys}\n    data_config[\"data_dir\"] = config.train_data_dir\n    data_config[\"is_sampling\"] = False\n    data_config[\"n_gpus\"] = n_gpus\n    dataset = SpectrogramDataset(**data_config)\n    assert len(dataset) > 0, f\"Error: No .wav files can be found at {config.train_data_dir} !\"\n    sampler = data.distributed.DistributedSampler(dataset) if n_gpus > 1 else None\n    tr_loader = data.DataLoader(dataset,\n                                sampler=sampler,\n                                batch_size=config.batch_size,\n                                num_workers=config.n_worker,\n                                pin_memory=False,\n                                drop_last=True)\n    data_config[\"data_dir\"] = config.valid_data_dir\n    dataset = SpectrogramDataset(**data_config)\n    assert len(dataset) > 0, f\"Error: No .wav files can be found at {config.valid_data_dir} !\"\n    sampler = data.distributed.DistributedSampler(dataset) if n_gpus > 1 else None\n    vl_loader = data.DataLoader(dataset,\n                                sampler=sampler,\n                                batch_size=1,\n                                num_workers=config.n_worker,\n                                pin_memory=False)\n    return tr_loader, vl_loader\n\n\ndef create_generation_dataloader(config):\n    \"\"\"\n    Create a torch.utils.data.DataLoader for generation\n\n    Parameters:\n        config (namespace):      BDDM Configuration\n    Returns:\n        gen_loader (DataLoader): the data loader for generation\n    \"\"\"\n    conf_keys = SpectrogramDataset.__init__.__code__.co_varnames\n    data_config = {k: v for k, v in vars(config).items() if k in conf_keys}\n    data_config[\"data_dir\"] = config.gen_data_dir\n    data_config[\"is_sampling\"] = True\n    data_config[\"n_gpus\"] = 1\n    dataset = SpectrogramDataset(**data_config)\n    gen_loader = data.DataLoader(dataset,\n                                 batch_size=1,  # variable audio length\n                                 num_workers=config.n_worker,\n                                 pin_memory=False)\n    return gen_loader\n\n\nif __name__ == \"__main__\":\n    # Smoke test: build the training/validation loaders and fetch one batch\n    # (the directory paths below are placeholders)\n    from argparse import Namespace\n    data_config = {\n        \"train_data_dir\": \"LJSpeech-1.1/train_wavs\",\n        \"valid_data_dir\": \"LJSpeech-1.1/valid_wavs\",\n        \"sampling_rate\": 22050,\n        \"batch_size\": 2,\n        \"n_worker\": 0,\n        \"seg_len\": 22050,\n        \"fil_len\": 1024,\n        \"hop_len\": 256,\n        \"win_len\": 1024,\n        \"mel_fmin\": 0.0,\n        \"mel_fmax\": 8000.0,\n        \"extra_key\": \"extra_val\"  # ignored: not an argument of SpectrogramDataset\n    }\n    data_config = Namespace(**data_config)\n    tr_loader, vl_loader = create_train_and_valid_dataloader(data_config)\n    for batch in tr_loader:\n        print(len(batch))\n        print(batch[0].shape, batch[1].shape)\n        break\n
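\n\n# Note (an addition for illustration, not part of the original file): in\n# sampling mode, __getitem__ caches a .mel file next to each .wav it reads, so\n# iterating the generation loader once over a directory of .wav files leaves\n# behind the Mel-spectrogram inputs that \"gen_data_dir\" expects, e.g.:\n#\n#     for _ in create_generation_dataloader(config):\n#         pass  # saves <name>.mel beside every <name>.wav\n"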
  },
  {
    "path": "bddm/loader/stft.py",
    "content": "\"\"\"\nBSD 3-Clause License\n\nCopyright (c) 2017, Prem Seetharaman\nAll rights reserved.\n\n* Redistribution and use in source and binary forms, with or without\n  modification, are permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above copyright notice,\n  this list of conditions and the following disclaimer.\n\n* Redistributions in binary form must reproduce the above copyright notice, this\n  list of conditions and the following disclaimer in the\n  documentation and/or other materials provided with the distribution.\n\n* Neither the name of the copyright holder nor the names of its\n  contributors may be used to endorse or promote products derived from this\n  software without specific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\nANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\nWARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR\nANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\n(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\nLOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON\nANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\nSOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\"\"\"\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.autograd import Variable\nfrom scipy.signal import get_window\nfrom librosa.filters import mel as mel_func\nfrom librosa.util import pad_center\n\n\nclass STFT(nn.Module):\n    \"\"\"adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft\"\"\"\n    def __init__(self, filter_length=800, hop_length=200, win_length=800,\n                 window='hann'):\n        super(STFT, self).__init__()\n        self.filter_length = filter_length\n        self.hop_length = hop_length\n        self.win_length = win_length\n        self.window = window\n        self.forward_transform = None\n        scale = self.filter_length / self.hop_length\n        fourier_basis = np.fft.fft(np.eye(self.filter_length))\n\n        cutoff = int((self.filter_length / 2 + 1))\n        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),\n                                   np.imag(fourier_basis[:cutoff, :])])\n\n        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])\n        inverse_basis = torch.FloatTensor(\n            np.linalg.pinv(scale * fourier_basis).T[:, None, :])\n\n        if window is not None:\n            assert(filter_length >= win_length)\n            # get window and zero center pad it to filter_length\n            fft_window = get_window(window, win_length, fftbins=True)\n            fft_window = pad_center(fft_window, filter_length)\n            fft_window = torch.from_numpy(fft_window).float()\n\n            # window the bases\n            forward_basis *= fft_window\n            inverse_basis *= fft_window\n\n        self.register_buffer('forward_basis', forward_basis.float())\n        self.register_buffer('inverse_basis', inverse_basis.float())\n\n    def transform(self, input_data):\n        num_batches = input_data.size(0)\n        num_samples = input_data.size(1)\n\n        self.num_samples = 
num_samples\n\n        # similar to librosa, reflect-pad the input\n        input_data = input_data.view(num_batches, 1, num_samples)\n        input_data = F.pad(\n            input_data.unsqueeze(1),\n            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),\n            mode='reflect')\n        input_data = input_data.squeeze(1)\n\n        forward_transform = F.conv1d(\n            input_data,\n            Variable(self.forward_basis, requires_grad=False),\n            stride=self.hop_length,\n            padding=0)\n\n        cutoff = int((self.filter_length / 2) + 1)\n        real_part = forward_transform[:, :cutoff, :]\n        imag_part = forward_transform[:, cutoff:, :]\n\n        magnitude = torch.sqrt(real_part**2 + imag_part**2)\n        phase = torch.autograd.Variable(\n            torch.atan2(imag_part.data, real_part.data))\n\n        return magnitude, phase\n\n    def inverse(self, magnitude, phase):\n        # Inverse STFT via transposed convolution and overlap-add; restored from\n        # the pseeth/pytorch-stft source this class was adapted from, since\n        # forward() below calls it but the method was missing here\n        from librosa.filters import window_sumsquare\n        from librosa.util import tiny\n\n        recombine_magnitude_phase = torch.cat(\n            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1)\n\n        inverse_transform = F.conv_transpose1d(\n            recombine_magnitude_phase,\n            Variable(self.inverse_basis, requires_grad=False),\n            stride=self.hop_length,\n            padding=0)\n\n        if self.window is not None:\n            window_sum = window_sumsquare(\n                self.window, magnitude.size(-1), hop_length=self.hop_length,\n                win_length=self.win_length, n_fft=self.filter_length,\n                dtype=np.float32)\n            # Normalize the overlap-add at the non-zero window positions\n            approx_nonzero_indices = torch.from_numpy(\n                np.where(window_sum > tiny(window_sum))[0])\n            window_sum = torch.from_numpy(window_sum).to(inverse_transform.device)\n            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]\n\n            # Scale by hop ratio\n            inverse_transform *= float(self.filter_length) / self.hop_length\n\n        # Trim the reflection padding added in transform()\n        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2):]\n        inverse_transform = inverse_transform[:, :, :-int(self.filter_length / 2):]\n        return inverse_transform\n\n    def forward(self, input_data):\n        self.magnitude, self.phase = self.transform(input_data)\n        reconstruction = self.inverse(self.magnitude, self.phase)\n        return reconstruction\n\n\nclass TacotronSTFT(nn.Module):\n    \"\"\"\n    Adapted from https://github.com/NVIDIA/tacotron2/blob/master/layers.py\n    \"\"\"\n    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,\n                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,\n                 mel_fmax=8000.0):\n        super(TacotronSTFT, self).__init__()\n        self.n_mel_channels = n_mel_channels\n        self.sampling_rate = sampling_rate\n        self.stft_fn = STFT(filter_length, hop_length, win_length)\n        mel_basis = mel_func(\n            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)\n        mel_basis = torch.from_numpy(mel_basis).float()\n        self.register_buffer('mel_basis', mel_basis)\n\n    def spectral_normalize(self, x):\n        return torch.log(torch.clamp(x, min=1e-5))\n\n    def spectral_de_normalize(self, x):\n        return torch.exp(x)\n\n    def mel_spectrogram(self, y):\n        \"\"\"\n        Compute mel-spectrograms from a batch of waves\n        \"\"\"\n        assert(torch.min(y.data) >= -1)\n        assert(torch.max(y.data) <= 1)\n\n        magnitudes, _ = self.stft_fn.transform(y)\n        magnitudes = magnitudes.data\n        mel_output = torch.matmul(self.mel_basis, magnitudes)\n        mel_output = self.spectral_normalize(mel_output)\n        return mel_output\n
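\n\nif __name__ == \"__main__\":\n    # Minimal smoke test (an addition for illustration, not part of the\n    # original file): mel-analyze one second of audio at 22.05 kHz, then run\n    # the STFT round trip through transform() and inverse()\n    taco_stft = TacotronSTFT()\n    wav = torch.rand(1, 22050) * 2 - 1    # mel_spectrogram() asserts range [-1, 1]\n    mel = taco_stft.mel_spectrogram(wav)\n    print(mel.shape)                      # torch.Size([1, 80, 87])\n    recon = taco_stft.stft_fn(wav)        # transform + inverse (overlap-add)\n    print(recon.shape)                    # torch.Size([1, 1, 22016]); edges trimmed\n"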
  },
  {
    "path": "bddm/models/__init__.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Globally Attentive Locally Recurrent (GALR) Networks\n#  (https://arxiv.org/abs/2101.05014)\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021Tencent. All Rights Reserved\n#\n########################################################################\n\n\nfrom .diffwave import DiffWave\nfrom .galr import GALR\n\n\ndef get_score_network(config):\n    if config.score_net == 'DiffWave':\n        conf_keys = DiffWave.__init__.__code__.co_varnames\n        model_config = {k: v for k, v in vars(config).items() if k in conf_keys}\n        return DiffWave(**model_config)\n\n\ndef get_schedule_network(config):\n    if config.schedule_net == 'GALR':\n        conf_keys = GALR.__init__.__code__.co_varnames\n        model_config = {k: v for k, v in vars(config).items() if k in conf_keys}\n        return GALR(**model_config)\n"
  },
  {
    "path": "bddm/models/diffwave.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  DiffWave: A Versatile Diffusion Model for Audio Synthesis\n#  (https://arxiv.org/abs/2009.09761)\n#  Modified from https://github.com/philsyn/DiffWave-Vocoder\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021Tencent. All Rights Reserved\n#\n########################################################################\n\n\nimport math\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\ndef calc_diffusion_step_embedding(diffusion_steps, diffusion_step_embed_dim_in):\n    \"\"\"\n    Embed a diffusion step $t$ into a higher dimensional space\n        E.g. the embedding vector in the 128-dimensional space is\n        [sin(t * 10^(0*4/63)), ... , sin(t * 10^(63*4/63)),\n         cos(t * 10^(0*4/63)), ... , cos(t * 10^(63*4/63))]\n\n    Parameters:\n        diffusion_steps (torch.long tensor, shape=(batchsize, 1)):\n                                    diffusion steps for batch data\n        diffusion_step_embed_dim_in (int, default=128):\n                                    dimensionality of the embedding space for discrete diffusion steps\n    Returns:\n        the embedding vectors (torch.tensor, shape=(batchsize, diffusion_step_embed_dim_in)):\n    \"\"\"\n\n    assert diffusion_step_embed_dim_in % 2 == 0\n\n    half_dim = diffusion_step_embed_dim_in // 2\n    _embed = np.log(10000) / (half_dim - 1)\n    _embed = torch.exp(torch.arange(half_dim) * -_embed).cuda()\n    _embed = diffusion_steps * _embed\n    diffusion_step_embed = torch.cat((torch.sin(_embed),\n                                      torch.cos(_embed)), 1)\n    return diffusion_step_embed\n\n\n\"\"\"\nBelow scripts were borrowed from\nhttps://github.com/philsyn/DiffWave-Vocoder/blob/master/WaveNet.py\n\"\"\"\n\n\ndef swish(x):\n    return x * torch.sigmoid(x)\n\n\n# dilated conv layer with kaiming_normal initialization\n# from https://github.com/ksw0306/FloWaveNet/blob/master/modules.py\nclass Conv(nn.Module):\n    def __init__(self, in_channels, out_channels, kernel_size=3, dilation=1):\n        super().__init__()\n        self.padding = dilation * (kernel_size - 1) // 2\n        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size,\n                              dilation=dilation, padding=self.padding)\n        self.conv = nn.utils.weight_norm(self.conv)\n        nn.init.kaiming_normal_(self.conv.weight)\n\n    def forward(self, x):\n        out = self.conv(x)\n        return out\n\n\n# conv1x1 layer with zero initialization\n# from https://github.com/ksw0306/FloWaveNet/blob/master/modules.py but the scale parameter is removed\nclass ZeroConv1d(nn.Module):\n    def __init__(self, in_channel, out_channel):\n        super().__init__()\n        self.conv = nn.Conv1d(in_channel, out_channel, kernel_size=1, padding=0)\n        self.conv.weight.data.zero_()\n        self.conv.bias.data.zero_()\n\n    def forward(self, x):\n        out = self.conv(x)\n        return out\n\n\n# every residual block (named residual layer in paper)\n# contains one noncausal dilated conv\nclass ResidualBlock(nn.Module):\n    def __init__(self, res_channels, skip_channels, dilation,\n                 diffusion_step_embed_dim_out):\n        super().__init__()\n        self.res_channels = res_channels\n\n        # Use a FC layer for diffusion step embedding\n        self.fc_t = nn.Linear(diffusion_step_embed_dim_out, self.res_channels)\n\n        # 
Dilated conv layer\n        self.dilated_conv_layer = Conv(self.res_channels, 2 * self.res_channels,\n                                       kernel_size=3, dilation=dilation)\n\n        # Add mel spectrogram upsampler and conditioner conv1x1 layer\n        self.upsample_conv2d = nn.ModuleList()\n        for s in [16, 16]:\n            conv_trans2d = nn.ConvTranspose2d(1, 1, (3, 2 * s),\n                                              padding=(1, s // 2),\n                                              stride=(1, s))\n            conv_trans2d = nn.utils.weight_norm(conv_trans2d)\n            nn.init.kaiming_normal_(conv_trans2d.weight)\n            self.upsample_conv2d.append(conv_trans2d)\n\n        # 80 is mel bands\n        self.mel_conv = Conv(80, 2 * self.res_channels, kernel_size=1)\n\n        # Residual conv1x1 layer, connect to next residual layer\n        self.res_conv = nn.Conv1d(res_channels, res_channels, kernel_size=1)\n        self.res_conv = nn.utils.weight_norm(self.res_conv)\n        nn.init.kaiming_normal_(self.res_conv.weight)\n\n        # Skip conv1x1 layer, add to all skip outputs through skip connections\n        self.skip_conv = nn.Conv1d(res_channels, skip_channels, kernel_size=1)\n        self.skip_conv = nn.utils.weight_norm(self.skip_conv)\n        nn.init.kaiming_normal_(self.skip_conv.weight)\n\n    def forward(self, input_data):\n        x, mel_spec, diffusion_step_embed = input_data\n        h = x\n        batch_size, n_channels, seq_len = x.shape\n        assert n_channels == self.res_channels\n\n        # Add in diffusion step embedding\n        part_t = self.fc_t(diffusion_step_embed)\n        part_t = part_t.view([batch_size, self.res_channels, 1])\n        h += part_t\n\n        # Dilated conv layer\n        h = self.dilated_conv_layer(h)\n\n        # Upsample spectrogram to size of audio\n        mel_spec = torch.unsqueeze(mel_spec, dim=1)\n        mel_spec = F.leaky_relu(self.upsample_conv2d[0](mel_spec), 0.4, inplace=False)\n        mel_spec = F.leaky_relu(self.upsample_conv2d[1](mel_spec), 0.4, inplace=False)\n        mel_spec = torch.squeeze(mel_spec, dim=1)\n\n        assert mel_spec.size(2) >= seq_len\n        if mel_spec.size(2) > seq_len:\n            mel_spec = mel_spec[:, :, :seq_len]\n\n        mel_spec = self.mel_conv(mel_spec)\n        h += mel_spec\n\n        # Gated-tanh nonlinearity\n        out = torch.tanh(h[:, :self.res_channels, :]) * torch.sigmoid(h[:, self.res_channels:, :])\n\n        # Residual and skip outputs\n        res = self.res_conv(out)\n        assert x.shape == res.shape\n        skip = self.skip_conv(out)\n\n        # Normalize for training stability\n        return (x + res) * math.sqrt(0.5), skip\n\n\nclass ResidualGroup(nn.Module):\n    def __init__(self, res_channels, skip_channels, num_res_layers, dilation_cycle,\n                 diffusion_step_embed_dim_in,\n                 diffusion_step_embed_dim_mid,\n                 diffusion_step_embed_dim_out):\n        super().__init__()\n        self.num_res_layers = num_res_layers\n        self.diffusion_step_embed_dim_in = diffusion_step_embed_dim_in\n\n        # Use the shared two FC layers for diffusion step embedding\n        self.fc_t1 = nn.Linear(diffusion_step_embed_dim_in, diffusion_step_embed_dim_mid)\n        self.fc_t2 = nn.Linear(diffusion_step_embed_dim_mid, diffusion_step_embed_dim_out)\n\n        # Stack all residual blocks with dilations 1, 2, ... , 512, ... 
, 1, 2, ..., 512\n        self.residual_blocks = nn.ModuleList()\n        for n in range(self.num_res_layers):\n            self.residual_blocks.append(\n                ResidualBlock(res_channels, skip_channels,\n                              dilation=2 ** (n % dilation_cycle),\n                              diffusion_step_embed_dim_out=diffusion_step_embed_dim_out))\n\n    def forward(self, input_data):\n        x, mel_spectrogram, diffusion_steps = input_data\n\n        # Embed diffusion step t\n        diffusion_step_embed = calc_diffusion_step_embedding(\n            diffusion_steps, self.diffusion_step_embed_dim_in)\n        diffusion_step_embed = swish(self.fc_t1(diffusion_step_embed))\n        diffusion_step_embed = swish(self.fc_t2(diffusion_step_embed))\n\n        # Pass all residual layers\n        h = x\n        skip = 0\n        for n in range(self.num_res_layers):\n            # Use the output from the last residual layer\n            h, skip_n = self.residual_blocks[n]((h, mel_spectrogram, diffusion_step_embed))\n            # Accumulate all skip outputs\n            skip += skip_n\n\n        # Normalize for training stability\n        return skip * math.sqrt(1.0 / self.num_res_layers)\n\n\nclass DiffWave(nn.Module):\n    def __init__(self, in_channels, res_channels, skip_channels, out_channels,\n                 num_res_layers, dilation_cycle,\n                 diffusion_step_embed_dim_in,\n                 diffusion_step_embed_dim_mid,\n                 diffusion_step_embed_dim_out):\n        super().__init__()\n\n        # Initial conv1x1 with relu\n        self.init_conv = nn.Sequential(Conv(in_channels, res_channels, kernel_size=1), nn.ReLU(inplace=False))\n        # All residual layers\n        self.residual_layer = ResidualGroup(res_channels,\n                                            skip_channels,\n                                            num_res_layers,\n                                            dilation_cycle,\n                                            diffusion_step_embed_dim_in,\n                                            diffusion_step_embed_dim_mid,\n                                            diffusion_step_embed_dim_out)\n        # Final conv1x1 -> relu -> zeroconv1x1\n        self.final_conv = nn.Sequential(Conv(skip_channels, skip_channels, kernel_size=1),\n                                        nn.ReLU(inplace=False), ZeroConv1d(skip_channels, out_channels))\n\n    def forward(self, input_data):\n        audio, mel_spectrogram, diffusion_steps = input_data\n        x = audio\n        x = self.init_conv(x).clone()\n        x = self.residual_layer((x, mel_spectrogram, diffusion_steps))\n        return self.final_conv(x)\n
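\n\nif __name__ == \"__main__\":\n    # Minimal smoke test (an addition for illustration, not part of the original\n    # file). The dimensions below are assumptions following the defaults used\n    # elsewhere in this repo (80 mel bands, hop length 256, 16x16 upsampling);\n    # a GPU is required since calc_diffusion_step_embedding allocates on CUDA\n    model = DiffWave(in_channels=1, res_channels=64, skip_channels=64,\n                     out_channels=1, num_res_layers=10, dilation_cycle=10,\n                     diffusion_step_embed_dim_in=128,\n                     diffusion_step_embed_dim_mid=512,\n                     diffusion_step_embed_dim_out=512).cuda()\n    audio = torch.randn(2, 1, 4096).cuda()        # [batch, 1, samples]\n    mel = torch.randn(2, 80, 16).cuda()           # 16 frames * 256 hop = 4096 samples\n    steps = torch.randint(0, 200, (2, 1)).cuda()  # diffusion step indices\n    print(model((audio, mel, steps)).shape)       # torch.Size([2, 1, 4096])\n"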
  },
  {
    "path": "bddm/models/galr.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Globally Attentive Locally Recurrent (GALR) Networks\n#  (https://arxiv.org/abs/2101.05014)\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021Tencent. All Rights Reserved\n#\n########################################################################\n\n\nimport math\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.autograd import Variable\n\n\nclass Encoder(nn.Module):\n\n    def __init__(self, enc_dim, win_len):\n        \"\"\"\n        1D Convoluation based Waveform Encoder\n\n        Parameters:\n            enc_dim (int): Dimension of each frame (e.g. choice in paper: ``128``).\n            win_len (int): Window length for processing raw signal samples (e.g. a\n                common choice: ``16``). By default, the windows are half-overlapping.\n        \"\"\"\n        super().__init__()\n        # 1D convolutional layer\n        self.enc_conv = nn.Conv1d(1, enc_dim,\n                                  kernel_size=win_len,\n                                  stride=win_len//2, bias=False)\n\n    def forward(self, signals):\n        \"\"\"\n        Non-linearly encode signals from raw waveform to frame sequences.\n\n        Parameters:\n            signals (tensor): A batch of signals in shape `[B, T]`, where `T` is the\n                maximum length of these `B` signals.\n        Returns:\n            frames (tensor): A batch of encoded feature (a.k.a. frames) sequences in shape\n                `[B, D, L]`, where `B` is the batch size, `D` is the encoded feature\n                dimension (enc_dim), and `L` is the length of this feature sequence.\n        \"\"\"\n        frames = F.relu(self.enc_conv(signals.unsqueeze(1)))\n        return frames\n\n\nclass BiLSTMproj(nn.Module):\n\n    def __init__(self, enc_dim, hid_dim):\n        \"\"\"\n        Locally Recurrent Layer (Sec 2.2.1 in https://arxiv.org/abs/2101.05014).\n            It consists of a bi-directional LSTM followed by a linear projection.\n\n        Parameters:\n            enc_dim (int): Dimension of each frame (e.g. 
choice in paper: ``128``).\n            hid_dim (int): Number of hidden nodes used in the Bi-LSTM.\n        \"\"\"\n        super().__init__()\n        # Bi-LSTM with learnable (h_0, c_0) state\n        self.rnn = nn.LSTM(enc_dim, hid_dim,\n                           1, dropout=0, batch_first=True, bidirectional=True)\n        self.cell_init = nn.Parameter(torch.rand(1, 1, hid_dim))\n        self.hidden_init = nn.Parameter(torch.rand(1, 1, hid_dim))\n\n        # Linear projection layer\n        self.proj = nn.Linear(hid_dim * 2, enc_dim)\n\n    def forward(self, intra_segs):\n        \"\"\"\n        Process through a locally recurrent layer along the intra-segment\n            direction.\n\n        Parameters:\n            intra_segs (tensor): A batch of intra-segments in shape `[B*S, K, D]`, where\n                `B` is the batch size, `S` is the number of segments, `K` is the\n                segment length (seg_len) and `D` is the feature dimension (enc_dim).\n        Returns:\n            lr_output (tensor): A batch of processed segments with the same shape as the input.\n        \"\"\"\n        batch_size_seq_len = intra_segs.size(0)\n        cell = self.cell_init.repeat(2, batch_size_seq_len, 1)\n        hidden = self.hidden_init.repeat(2, batch_size_seq_len, 1)\n        rnn_output, _ = self.rnn(intra_segs, (hidden, cell))\n        lr_output = self.proj(rnn_output)\n        return lr_output\n\n\nclass AttnPositionalEncoding(nn.Module):\n\n    def __init__(self, enc_dim, attn_max_len=5000):\n        \"\"\"\n        Positional Encoding for Multi-Head Attention\n\n        Parameters:\n            enc_dim (int): Dimension of each frame (e.g. choice in paper: ``128``).\n            attn_max_len (int): Maximum length of the sequence to be processed by\n                multi-head attention.\n        \"\"\"\n        super().__init__()\n        pe = torch.zeros(attn_max_len, enc_dim)\n        position = torch.arange(0, attn_max_len, dtype=torch.float).unsqueeze(1)\n        div_term = torch.exp(torch.arange(0, enc_dim, 2).float() * (-math.log(10000.0)/enc_dim))\n        pe[:, 0::2] = torch.sin(position * div_term)\n        pe[:, 1::2] = torch.cos(position * div_term)\n        pe = pe.unsqueeze(0).transpose(0, 1)\n        self.register_buffer('pe', pe)\n\n    def forward(self, x):\n        \"\"\"\n        Compute positional encoding\n\n        Parameters:\n            x (tensor): the sequence to be added with the positional encoding vector\n        Returns:\n            output (tensor): the encoded input\n        \"\"\"\n        output = x + self.pe[:x.size(0), :]\n        return output\n\n\nclass GlobalAttnLayer(nn.Module):\n\n    def __init__(self, enc_dim, n_attn_head, attn_dropout):\n        \"\"\"\n        Globally Attentive Layer (Sec 2.2.2 in\n            https://arxiv.org/abs/2101.05014)\n\n        Parameters:\n            enc_dim (int): Dimension of each frame (e.g. choice in paper: ``128``).\n            n_attn_head (int): Number of heads for the multi-head attention. (e.g.\n                choice in paper: ``8``)\n            attn_dropout (float): Dropout rate for multi-head attention (e.g. 
choice in\n                paper: ``0.1``).\n        \"\"\"\n        super().__init__()\n        self.attn = nn.MultiheadAttention(enc_dim,\n            n_attn_head, dropout=attn_dropout)\n        self.norm = nn.LayerNorm(enc_dim)\n        self.dropout = nn.Dropout(attn_dropout)\n\n    def forward(self, inter_segs):\n        \"\"\"\n        Process through a globally attentive layer along the inter-segment\n            direction.\n\n        Parameters:\n            inter_segs (tensor): A batch of inter-segments in shape `[S, B*K, D]`, where\n                `B` is the batch size, `S` is the number of segments, `K` is the\n                segment length (seg_len) and `D` is the feature dimension (enc_dim).\n        Returns:\n            output (tensor): A batch of processed segments with the same shape as the input.\n        \"\"\"\n        output, _ = self.attn(inter_segs, inter_segs, inter_segs)\n        # NOTE: unlike a standard Transformer block, the residual here reuses the\n        # attention output rather than the layer input; kept as-is to match the\n        # released checkpoints\n        output = self.norm(output + self.dropout(output))\n        return output\n\n\nclass DeepGlobalAttnLayer(nn.Module):\n\n    def __init__(self, enc_dim, n_attn_head, attn_dropout, n_attn_layer=1):\n        \"\"\"\n        A Stack of Globally Attentive Layers (Sec 2.2.2 in\n            https://arxiv.org/abs/2101.05014)\n\n        Parameters:\n            enc_dim (int): Dimension of each frame (e.g. choice in paper: ``128``).\n            n_attn_head (int): Number of heads for the multi-head attention. (e.g.\n                choice in paper: ``8``)\n            attn_dropout (float): Dropout rate for multi-head attention (e.g. choice in\n                paper: ``0.1``).\n            n_attn_layer (int): Number of globally attentive layers stacked. The\n                default setting in the paper is ``1``.\n        \"\"\"\n        super().__init__()\n        self.attn_in_norm = nn.LayerNorm(enc_dim)\n        self.pos_enc = AttnPositionalEncoding(enc_dim)\n        self.attn_layer = nn.ModuleList([\n            GlobalAttnLayer(enc_dim, n_attn_head, attn_dropout) for _ in range(n_attn_layer)])\n\n    def forward(self, inter_segs):\n        \"\"\"\n        Process through a stack of globally attentive layers.\n\n        Parameters:\n            inter_segs (tensor): A batch of inter-segments in shape `[S, B*K, D]`, where\n                `B` is the batch size, `S` is the number of segments, `K` is the\n                segment length (seg_len) and `D` is the feature dimension (enc_dim).\n        Returns:\n            output (tensor): A batch of processed segments with the same shape as the input.\n        \"\"\"\n        output = self.attn_in_norm(inter_segs)\n        output = self.pos_enc(output)\n        for block in self.attn_layer:\n            output = block(output)\n        return output\n\n\nclass GALRBlock(nn.Module):\n\n    def __init__(self, enc_dim, hid_dim, seg_len, low_dim=8, n_attn_head=8, attn_dropout=0.1):\n        \"\"\"\n        Globally Attentive Locally Recurrent (GALR) Block (Sec. 2.2 in\n            https://arxiv.org/abs/2101.05014)\n\n        Parameters:\n            enc_dim (int): Dimension of each frame (e.g. choice in paper: ``128``).\n            hid_dim (int): Number of hidden nodes used in the Bi-LSTM.\n            seg_len (int): Segment length for processing the frame sequence (e.g.\n                ``64`` when win_len is ``16``). By default, the segments are\n                half-overlapping.\n            low_dim (int): Lower dimension for speeding up GALR. (Sec. 2.2.3)\n                (e.g. 
``8`` when seg_len is ``64``).\n            n_attn_head (int): Number of heads for the multi-head attention. (e.g.\n                choice in paper: ``8``)\n            attn_dropout (float): Dropout rate for multi-head attention (e.g. choice in\n                paper: ``0.1``).\n        \"\"\"\n        super().__init__()\n        self.low_dim = low_dim\n        self.local_rnn = BiLSTMproj(enc_dim, hid_dim)\n        self.local_norm = nn.GroupNorm(1, enc_dim)\n        self.global_rnn = DeepGlobalAttnLayer(enc_dim, n_attn_head, attn_dropout)\n        self.global_norm = nn.GroupNorm(1, enc_dim)\n        self.ld_map = nn.Linear(seg_len, low_dim)\n        self.ld_inv_map = nn.Linear(low_dim, seg_len)\n\n    def forward(self, segments):\n        \"\"\"\n        Process through a GALR block.\n\n        Parameters:\n        \tsegments (tensor): A batch of 3D segments in shape `[B, D, K, S]`, where\n                `B` is the batch size, `S` is the number of segments, `K` is the\n                segment length (seg_len) and `D` is the feature dimension (enc_dim).\n        Returns:\n        \tsegments (tensor): A batch of processed segments with the same shape as the input.\n        \"\"\"\n        batch_size, feat_dim, seg_len, n_segs = segments.size()\n        # Change the sequence direction for intra-segment processing\n        local_input = segments.transpose(1, 3).reshape(batch_size * n_segs, seg_len, feat_dim)\n        # Process through a locally recurrent layer\n        local_output = self.local_rnn(local_input)\n        # Reshape to match the dimensionality of the input for residual connection\n        local_output = local_output.view(batch_size, n_segs, seg_len, feat_dim).transpose(1, 3).contiguous()\n        # Add a layer normalization before the residual connection\n        local_output = self.local_norm(local_output)\n        # Add residual connection\n        segments = segments + local_output\n\n        # Change the sequence direction for inter-segment processing\n        global_input = segments.permute(3, 2, 0, 1).contiguous().view(n_segs, seg_len, batch_size, feat_dim)\n        # Perform low-dimensional mapping for speeding up GALR\n        global_input = self.ld_map(global_input.transpose(1, -1))\n        # Reshape for inter-segment processing (sequence, batch size, feature dim)\n        global_input = global_input.transpose(1, -1).contiguous().view(n_segs, -1, feat_dim)\n        # Process through a globally attentive layer\n        global_output = self.global_rnn(global_input)\n        # Reshape for low-dimensional inverse mapping\n        global_output = global_output.view(n_segs, self.low_dim, -1, feat_dim).transpose(1, -1)\n        # Map the low-dimensional features back to the original size\n        global_output = self.ld_inv_map(global_output)\n        # Reshape to match the dimensionality of the input for residual connection\n        global_output = global_output.permute(2, 1, 3, 0).contiguous()\n        # Add a layer normalization before the residual connection\n        global_output = self.global_norm(global_output)\n        # Add residual connection\n        segments = segments + global_output\n\n        return segments\n\n\nclass _GALR(nn.Module):\n\n    def __init__(self, n_block, enc_dim, hid_dim, win_len, seg_len,\n            low_dim=8, n_attn_head=8, attn_dropout=0.1):\n        \"\"\"\n        Globally Attentive Locally Recurrent (GALR) Networks\n            (https://arxiv.org/abs/2101.05014)\n\n        Parameters:\n            n_block (int): Number of GALR blocks 
(e.g. choice in paper: ``6``).\n            enc_dim (int): Dimension of each frame (e.g. choice in paper: ``128``).\n            hid_dim (int): Number of hidden nodes used in the Bi-LSTM.\n            win_len (int): Window length for processing raw signal samples (e.g. a\n                common choice: ``8``). By default, the windows are half-overlapping.\n            seg_len (int): Segment length for processing frame sequence (e.g. a\n                ``64`` when win_len is ``8``). By default, the segments are half-overlapping.\n            low_dim (int): Lower dimension for speeding up GALR. (Sec. 2.2.3)\n                (e.g. ``8`` when seg_len is ``64``).\n            n_attn_head (int): Number of heads for the multi-head attention. (e.g.\n                choice in paper: ``8``)\n            attn_dropout (float): Dropout rate for multi-head attention (e.g. choice in\n                paper: ``0.1``).\n        \"\"\"\n        super().__init__()\n        self.win_len = win_len\n        self.seg_len = seg_len\n        self.encoder = Encoder(enc_dim, win_len)\n        self.bottleneck = nn.Conv1d(enc_dim, enc_dim, 1, bias=False)\n        # GALR blocks\n        self.blocks = nn.ModuleList([\n            GALRBlock(enc_dim, hid_dim, seg_len, low_dim, n_attn_head, attn_dropout)\n                for _ in range(n_block)])\n        # Many-to-one gated layer applied to GALR's last block\n        self.block_gate = nn.Sequential(\n            nn.PReLU(),\n            nn.Conv2d(enc_dim, 1, 1)\n        )\n\n    def forward(self, noisy_signals):\n        \"\"\"\n        Process through a GALR network for noise estimation\n\n        Parameters:\n        \tnoisy_signals (tensor): A batch of 1D signals in shape `[B, T]`, where\n                `B` is the batch size, `T` is the maximum length of the `B` signals.\n        Returns:\n        \test_ratios (tensor): A batch of scalar noise scale ratios in `[B, 1]` shape.\n        \"\"\"\n        noisy_signals_padded, _ = self.pad_zeros(noisy_signals)\n        mix_frames = self.encoder(noisy_signals_padded)\n        mix_frames = self.bottleneck(mix_frames)\n        block_feature, _ = self.split_feature(mix_frames)\n        for block in self.blocks:\n            block_feature = block(block_feature)\n        est_segments = self.block_gate(block_feature)\n        est_ratios = torch.sigmoid(est_segments.mean([2, 3]))\n        est_ratios = est_ratios.mean(1, keepdim=True)\n        return est_ratios\n\n    def pad_zeros(self, signals):\n        \"\"\"\n        Pad a batch of signals with zeros before encoding.\n\n        Parameters:\n        \tsignals (tensor): A batch of 1D signals in shape `[B, T]`, where `B` is\n                the batch size, `T` is the maximum length of the `B` signals.\n        Returns:\n        \tsignals (tensor): A batch of zero-padded signals 
(in shape `[B, T]`)\n            rest (int): the number of redundant zeros added by padding\n        \"\"\"\n        batch_size, sig_len = signals.shape\n        self.hop_size = self.win_len // 2\n        rest = self.win_len - (self.hop_size + sig_len % self.win_len) % self.win_len\n        if rest > 0:\n            pad = torch.zeros(batch_size, rest).type(signals.type())\n            signals = torch.cat([signals, pad], 1)\n        pad_aux = torch.zeros(batch_size, self.hop_size).type(signals.type())\n        signals = torch.cat([pad_aux, signals, pad_aux], 1)\n        return signals, rest\n\n    def pad_segment(self, frames):\n        \"\"\"\n        Pad a batch of frames with zeros before segmentation.\n\n        Parameters:\n        \tframes (tensor): A batch of 2D frames in shape `[B, D, L]`, where `B` is\n                the batch size, `D` is the encoded feature dimension (enc_dim), and\n                `L` is the length of this feature sequence.\n        Returns:\n        \tframes (tensor): A batch of zero-padded frames (in shape `[B, D, L]`)\n            rest (int): the number of redundant zeros added by padding\n        \"\"\"\n        batch_size, feat_dim, seq_len = frames.shape\n        stride = self.seg_len // 2\n        rest = self.seg_len - (stride + seq_len % self.seg_len) % self.seg_len\n        if rest > 0:\n            pad = torch.zeros(batch_size, feat_dim, rest).type(frames.type())\n            frames = torch.cat([frames, pad], 2)\n        pad_aux = torch.zeros(batch_size, feat_dim, stride).type(frames.type())\n        frames = torch.cat([pad_aux, frames, pad_aux], 2)\n        return frames, rest\n\n    def split_feature(self, frames):\n        \"\"\"\n        Perform segmentation by dividing every K (seg_len) consecutive frames\n            into S segments.\n\n        Parameters:\n        \tframes (tensor): A batch of 2D frames in shape `[B, D, L]`, where `B` is\n                the batch size, `D` is the encoded feature dimension (enc_dim), and\n                `L` is the length of this feature sequence.\n        Returns:\n        \tsegments (tensor): A batch of 3D segments (in shape `[B, D, K, S]`)\n            rest (int): the number of redundant zeros added by padding\n        \"\"\"\n        frames, rest = self.pad_segment(frames)\n        batch_size, feat_dim, _ = frames.shape\n        stride = self.seg_len // 2\n        lsegs = frames[:, :, :-stride].contiguous().view(batch_size, feat_dim, -1, self.seg_len)\n        rsegs = frames[:, :, stride:].contiguous().view(batch_size, feat_dim, -1, self.seg_len)\n        segments = torch.cat([lsegs, rsegs], -1).view(batch_size, feat_dim, -1, self.seg_len)\n        segments = segments.transpose(2, 3).contiguous()\n        return segments, rest\n\n\nclass GALR(nn.Module):\n\n    def __init__(self, blocks=2, input_dim=128, hidden_dim=128, window_length=8, segment_size=64):\n        \"\"\"\n        GALR Schedule Network\n\n        Parameters:\n            blocks (int): Number of GALR blocks (e.g. choice in paper: ``6``).\n            input_dim (int): Dimension of each frame (e.g. choice in paper: ``128``).\n            hidden_dim (int): Number of hidden nodes used in the Bi-LSTM.\n            window_length (int): Window length for processing raw signal samples (e.g. a\n                common choice: ``8``). 
By default, the windows are half-overlapping.\n            segment_size (int): Segment length for processing frame sequence (e.g. a\n                ``64`` when win_len is ``8``). By default, the segments are half-overlapping.\n        \"\"\"\n        super().__init__()\n        self.ratio_nn = _GALR(blocks, input_dim, hidden_dim, window_length, segment_size)\n\n    def forward(self, audio, scales):\n        \"\"\"\n        Estimate the next beta scale using the GALR schedule network\n\n        Parameters:\n        \taudio (tensor): A batch of 1D signals in shape `[B, T]`, where\n                `B` is the batch size, `T` is the maximum length of the `B` signals.\n            scales (list): [beta_nxt, delta] for computing the upper bound\n        Returns:\n        \tbeta (tensor): A batch of estimated noise scales in `[B, 1, 1]` shape.\n        \"\"\"\n        beta_nxt, delta = scales\n        # The upper bound mu is the element-wise minimum of the two bound terms\n        bounds = torch.cat([beta_nxt, delta], 1)\n        mu, _ = torch.min(bounds, 1)\n        mu = mu[:, None]\n        # Scale the predicted ratio in (0, 1) by the upper bound\n        ratio = self.ratio_nn(audio)\n        beta = mu * ratio\n        return beta.view(audio.size(0), 1, 1)\n"
  },
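  {
    "path": "docs/examples/galr_schedule_sketch.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Usage sketch for the GALR schedule network (illustrative only).\n#  Assumptions not taken from the sources: this file's path, the toy\n#  values below, and that the classes above are importable as\n#  bddm.models.galr.\n#\n########################################################################\n\nimport torch\n\nfrom bddm.models.galr import GALR\n\n# The schedule network maps a batch of noisy signals plus the two bound\n# terms [beta_nxt, delta^2] to one estimated beta scale per signal.\nmodel = GALR(blocks=2, input_dim=128, hidden_dim=128,\n             window_length=8, segment_size=64)\naudio = torch.randn(2, 16000)        # [B, T] noisy signals\nbeta_nxt = torch.full((2, 1), 0.02)  # beta at the next step, shape [B, 1]\ndelta_sq = torch.full((2, 1), 0.25)  # squared noise level, as the Sampler passes it\nwith torch.no_grad():\n    beta = model(audio, [beta_nxt, delta_sq])\n# Each estimate lies in (0, mu) with mu = min(beta_nxt, delta_sq)\nprint(beta.shape)  # torch.Size([2, 1, 1])\n"
  },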
  {
    "path": "bddm/sampler/__init__.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Sampler\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021 Tencent. All Rights Reserved\n#\n########################################################################\n\nfrom .sampler import Sampler\n"
  },
  {
    "path": "bddm/sampler/sampler.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  BDDM Sampler (Supports Noise Scheduling and Sampling)\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021 Tencent. All Rights Reserved\n#\n########################################################################\n\n\nfrom __future__ import absolute_import\n\nimport os\nimport librosa\n\nimport torch\nimport numpy as np\nfrom scipy.io.wavfile import write as wavwrite\nfrom pystoi import stoi\nfrom pypesq import pesq\n\nfrom bddm.utils.log_utils import log\nfrom bddm.utils.check_utils import check_score_network\nfrom bddm.utils.check_utils import check_schedule_network\nfrom bddm.utils.diffusion_utils import compute_diffusion_params\nfrom bddm.utils.diffusion_utils import map_noise_scale_to_time_step\nfrom bddm.models import get_score_network, get_schedule_network\nfrom bddm.loader.dataset import create_generation_dataloader\nfrom bddm.loader.dataset import create_train_and_valid_dataloader\n\n\nclass Sampler(object):\n\n    metric2index = {\"PESQ\": 0, \"STOI\": 1}\n\n    def __init__(self, config):\n        \"\"\"\n        Sampler Class, implements the Noise Scheduling and Sampling algorithms in BDDMs\n\n        Parameters:\n            config (namespace): BDDM Configuration\n        \"\"\"\n        self.config = config\n        self.exp_dir = config.exp_dir\n        self.clip = config.grad_clip\n        self.load = config.load\n        self.model = get_score_network(config).cuda().eval()\n        self.schedule = None\n        # Initialize diffusion parameters using a pre-specified linear schedule\n        noise_schedule = torch.linspace(config.beta_0, config.beta_T, config.T).cuda()\n        self.diff_params = compute_diffusion_params(noise_schedule)\n        if self.config.command != 'train':\n            # Find schedule net, if not trained then use DDPM or DDIM sampling mode\n            schedule_net_trained, schedule_net_path = check_schedule_network(config)\n            if not schedule_net_trained:\n                _, score_net_path = check_score_network(config)\n                assert score_net_path is not None, 'Error: No score network can be found!'\n                self.config.load = score_net_path\n            else:\n                self.config.load = schedule_net_path\n                self.model.schedule_net = get_schedule_network(config).cuda().eval()\n            # Keep self.load in sync with the discovered checkpoint for reset()\n            self.load = self.config.load\n\n        # Load the pre-searched noise schedule when a schedule file (.ns) is given\n        if self.config.command == 'generate' and self.config.sampling_noise_schedule != '':\n            # Generation mode given pre-searched noise schedule\n            self.schedule = torch.load(self.config.sampling_noise_schedule, map_location='cpu')\n\n        self.reset()\n\n    def reset(self):\n        \"\"\"\n        Reset sampling environment\n        \"\"\"\n        if self.config.command != 'train' and self.load != '':\n            package = torch.load(self.load, map_location=lambda storage, loc: storage.cuda())\n            self.model.load_state_dict(package['model_state_dict'])\n            log('Loaded checkpoint %s' % self.load, self.config)\n\n        if self.config.command == 'generate':\n            # Given Mel-spectrogram directory for speech vocoding\n            self.gen_loader = create_generation_dataloader(self.config)\n        else:\n            # Sample a reference audio sample for noise scheduling\n            _, self.vl_loader = 
create_train_and_valid_dataloader(self.config)\n            self.draw_reference_data_pair()\n\n    def draw_reference_data_pair(self):\n        \"\"\"\n        Draw a new input-output pair for noise scheduling\n        \"\"\"\n        self.ref_spec, self.ref_audio = next(iter(self.vl_loader))\n        self.ref_spec, self.ref_audio = self.ref_spec.cuda(), self.ref_audio.cuda()\n\n    def generate(self):\n        \"\"\"\n        Start the generation process\n        \"\"\"\n        generate_dir = os.path.join(self.exp_dir, 'generated')\n        os.makedirs(generate_dir, exist_ok=True)\n        scores = {metric: [] for metric in self.metric2index}\n        for filepath, mel_spec, audio in self.gen_loader:\n            mel_spec = mel_spec.cuda()\n            generated_audio, n_steps = self.sampling(schedule=self.schedule,\n                                                     condition=mel_spec,\n                                                     ddim=self.config.use_ddim_steps)\n            audio_key = '.'.join(filepath[0].split('/')[-1].split('.')[:-1])\n\n            if len(audio) > 0:\n                # Assess the generated audio with the reference audio\n                self.ref_audio = audio\n                score = self.assess(generated_audio, audio_key=audio_key)\n                for metric in self.metric2index:\n                    scores[metric].append(score[self.metric2index[metric]])\n\n            model_name = 'BDDM' if self.schedule is not None else (\n                'DDIM' if self.config.use_ddim_steps else 'DDPM')\n            generated_file = os.path.join(generate_dir,\n                '%s_by_%s-%d.wav'%(audio_key, model_name, n_steps))\n            wavwrite(generated_file, self.config.sampling_rate,\n                     generated_audio.squeeze().cpu().numpy())\n            log('Generated '+generated_file, self.config)\n        log('Avg Scores: PESQ = %.3f +/- %.3f, STOI = %.5f +/- %.5f'%(\n            np.mean(scores['PESQ']), 1.96 * np.std(scores['PESQ']),\n            np.mean(scores['STOI']), 1.96 * np.std(scores['STOI'])), self.config)\n\n    def noise_scheduling(self, ddim=False):\n        \"\"\"\n        Start the noise scheduling process\n\n        Parameters:\n            ddim (bool): whether to use the DDIM's p_theta for noise scheduling or not\n        Returns:\n            ts_infer (tensor): the step indices estimated by BDDM\n            a_infer (tensor):  the alphas estimated by BDDM\n            b_infer (tensor):  the betas estimated by BDDM\n            s_infer (tensor):  the std. 
deviations estimated by BDDM\n        \"\"\"\n        max_steps = self.diff_params[\"N\"]\n        alpha = self.diff_params[\"alpha\"]\n        alpha_param = self.diff_params[\"alpha_param\"]\n        beta_param = self.diff_params[\"beta_param\"]\n        min_beta = self.diff_params[\"beta\"].min()\n        betas = []\n        x = torch.normal(0, 1, size=self.ref_audio.shape).cuda()\n        with torch.no_grad():\n            b_cur = torch.ones(1, 1, 1).cuda() * beta_param\n            a_cur = torch.ones(1, 1, 1).cuda() * alpha_param\n            for n in range(max_steps - 1, -1, -1):\n                step = map_noise_scale_to_time_step(a_cur.squeeze().item(), alpha)\n                if step >= 0:\n                    betas.append(b_cur.squeeze().item())\n                else:\n                    break\n                ts = (step * torch.ones((1, 1))).cuda()\n                e = self.model((x, self.ref_spec.clone(), ts,))\n                a_nxt = a_cur / (1 - b_cur).sqrt()\n                if ddim:\n                    c1 = a_nxt / a_cur\n                    c2 = -(1 - a_cur**2.).sqrt() * c1\n                    x = c1 * x + c2 * e\n                    c3 = (1 - a_nxt**2.).sqrt()\n                    x = x + c3 * e\n                else:\n                    x = x - b_cur / torch.sqrt(1 - a_cur**2.) * e\n                    x = x / torch.sqrt(1 - b_cur)\n                    if n > 0:\n                        z = torch.normal(0, 1, size=self.ref_audio.shape).cuda()\n                        x = x + torch.sqrt((1 - a_nxt**2.) / (1 - a_cur**2.) * b_cur) * z\n                a_nxt, beta_nxt = a_cur, b_cur\n                a_cur = a_nxt / (1 - beta_nxt).sqrt()\n                if a_cur > 1:\n                    break\n                b_cur = self.model.schedule_net(x.squeeze(1),\n                    (beta_nxt.view(-1, 1), (1 - a_cur**2.).view(-1, 1)))\n                if b_cur.squeeze().item() < min_beta:\n                    break\n        b_infer = torch.FloatTensor(betas[::-1]).cuda()\n        a_infer = 1 - b_infer\n        s_infer = b_infer + 0  # make a copy of b_infer\n        for n in range(1, len(b_infer)):\n            a_infer[n] *= a_infer[n-1]\n            s_infer[n] *= (1 - a_infer[n-1]) / (1 - a_infer[n])\n        a_infer = torch.sqrt(a_infer)\n        s_infer = torch.sqrt(s_infer)\n\n        # Mapping noise scales to time steps\n        ts_infer = []\n        for n in range(len(b_infer)):\n            step = map_noise_scale_to_time_step(a_infer[n], alpha)\n            if step >= 0:\n                ts_infer.append(step)\n        ts_infer = torch.FloatTensor(ts_infer)\n        return ts_infer, a_infer, b_infer, s_infer\n\n    def sampling(self, schedule=None, condition=None,\n                 ddim=0, return_sequence=False, audio_size=None):\n        \"\"\"\n        Perform the sampling algorithm\n\n        Parameters:\n            schedule (list):        the [ts_infer, a_infer, b_infer, s_infer]\n                                    returned by the noise scheduling algorithm\n            condition (tensor):     the condition for computing scores\n            ddim (int):             0: use DDPM; 1: run the DDIM reverse process\n                                    for all T steps; >1: run DDIM with that many steps\n            return_sequence (bool): whether to return all steps' samples or not\n            audio_size (list):      the shape of the audio to be sampled\n        Returns:\n            xs (list):              (if return_sequence) the list of generated audios\n            x (tensor):             the generated audio(s) in shape=audio_size\n            N (int):                the number of 
sampling steps\n        \"\"\"\n\n        n_steps = self.diff_params[\"T\"]\n        if condition is None:\n            condition = self.ref_spec\n\n        if audio_size is None:\n            audio_length = condition.size(-1) * self.config.hop_len\n            audio_size = (1, 1, audio_length)\n\n        if schedule is None:\n            if ddim > 1:\n                # Use DDIM (linear) for sampling ({ddim} steps)\n                ts_infer = torch.linspace(0, n_steps - 1, ddim).long()\n                a_infer = self.diff_params[\"alpha\"].index_select(0, ts_infer.cuda())\n                b_infer = self.diff_params[\"beta\"].index_select(0, ts_infer.cuda())\n                s_infer = self.diff_params[\"sigma\"].index_select(0, ts_infer.cuda())\n            else:\n                # Use DDPM for sampling (complete T steps)\n                # P.S. if ddim = 1, run DDIM reverse process for T steps\n                ts_infer = torch.linspace(0, n_steps - 1, n_steps)\n                a_infer = self.diff_params[\"alpha\"]\n                b_infer = self.diff_params[\"beta\"]\n                s_infer = self.diff_params[\"sigma\"]\n        else:\n            ts_infer, a_infer, b_infer, s_infer = schedule\n\n        sampling_steps = len(ts_infer)\n\n        x = torch.normal(0, 1, size=audio_size).cuda()\n        if return_sequence:\n            xs = []\n        with torch.no_grad():\n            for n in range(sampling_steps - 1, -1, -1):\n                if sampling_steps > 50 and (sampling_steps - n) % 50 == 0:\n                    # Log progress per 50 steps when sampling_steps is large\n                    log('\\tComputed %d / %d steps !'%(\n                        sampling_steps - n, sampling_steps), self.config)\n                ts = (ts_infer[n] * torch.ones((1, 1))).cuda()\n                e = self.model((x, condition, ts,))\n                if ddim:\n                    if n > 0:\n                        a_nxt = a_infer[n - 1]\n                    else:\n                        a_nxt = a_infer[n] / (1 - b_infer[n]).sqrt()\n                    c1 = a_nxt / a_infer[n]\n                    c2 = -(1 - a_infer[n]**2.).sqrt() * c1\n                    c3 = (1 - a_nxt**2.).sqrt()\n                    x = c1 * x + (c2 + c3) * e\n                else:\n                    x = x - b_infer[n] / torch.sqrt(1 - a_infer[n]**2.) 
* e\n                    x = x / torch.sqrt(1 - b_infer[n])\n                    if n > 0:\n                        z = torch.normal(0, 1, size=audio_size).cuda()\n                        x = x + s_infer[n] * z\n                if return_sequence:\n                    xs.append(x)\n        if return_sequence:\n            return xs\n        return x, sampling_steps\n\n    def noise_scheduling_with_params(self, alpha_param, beta_param):\n        \"\"\"\n        Run noise scheduling once for a given (alpha_param, beta_param) pair\n\n        Parameters:\n            alpha_param (float): a hyperparameter defining the alpha_hat value at step N\n            beta_param (float):  a hyperparameter defining the beta_hat value at step N\n        \"\"\"\n        log('TRY alpha_param=%.2f, beta_param=%.2f:'%(\n            alpha_param, beta_param), self.config)\n        # Define the pair key\n        key = '%.2f,%.2f' % (alpha_param, beta_param)\n        # Set alpha_param and beta_param in self.diff_params\n        self.diff_params['alpha_param'] = alpha_param\n        self.diff_params['beta_param'] = beta_param\n        # Use DDPM reverse process for noise scheduling\n        ddpm_schedule = self.noise_scheduling(ddim=False)\n        log(\"\\tSearched a %d-step schedule using DDPM reverse process\" % (\n            len(ddpm_schedule[0])), self.config)\n        generated_audio, _ = self.sampling(schedule=ddpm_schedule)\n        # Compute objective scores\n        ddpm_score = self.assess(generated_audio)\n        # Get the number of sampling steps with this schedule\n        steps = len(ddpm_schedule[0])\n        # Compare the performance with the previous same-step schedule using the metric\n        if steps not in self.steps2score:\n            # Save the first schedule with this number of steps\n            self.steps2score[steps] = [key, ] + ddpm_score\n            self.steps2schedule[steps] = ddpm_schedule\n            log('\\tFound the first %d-step schedule: (PESQ, STOI) = (%.2f, %.3f)'%(\n                steps, ddpm_score[0], ddpm_score[1]), self.config)\n        elif ddpm_score[0] > self.steps2score[steps][1] and ddpm_score[1] > self.steps2score[steps][2]:\n            # Found a better same-step schedule achieving a higher score\n            log('\\tFound a better %d-step schedule: (PESQ, STOI) = (%.2f, %.3f) -> (%.2f, %.3f)'%(\n                steps, self.steps2score[steps][1], self.steps2score[steps][2],\n                ddpm_score[0], ddpm_score[1]), self.config)\n            self.steps2score[steps] = [key, ] + ddpm_score\n            self.steps2schedule[steps] = ddpm_schedule\n        # Use DDIM reverse process for noise scheduling\n        ddim_schedule = self.noise_scheduling(ddim=True)\n        log(\"\\tSearched a %d-step schedule using DDIM reverse process\" % (\n            len(ddim_schedule[0])), self.config)\n        generated_audio, _ = self.sampling(schedule=ddim_schedule)\n        # Compute objective scores\n        ddim_score = self.assess(generated_audio)\n        # Get the number of sampling steps with this schedule\n        steps = len(ddim_schedule[0])\n        # Compare the performance with the previous same-step schedule using the metric\n        if steps not in self.steps2score:\n            # Save the first schedule with this number of steps\n            self.steps2score[steps] = [key, ] + ddim_score\n            self.steps2schedule[steps] = ddim_schedule\n            log('\\tFound the first %d-step schedule: (PESQ, STOI) = (%.2f, %.3f)'%(\n                steps, ddim_score[0], 
ddim_score[1]), self.config)\n        elif ddim_score[0] > self.steps2score[steps][1] and ddim_score[1] > self.steps2score[steps][2]:\n            # Found a better same-step schedule achieving a higher score\n            log('\\tFound a better %d-step schedule: (PESQ, STOI) = (%.2f, %.3f) -> (%.2f, %.3f)'%(\n                steps, self.steps2score[steps][1], self.steps2score[steps][2],\n                ddim_score[0], ddim_score[1]), self.config)\n            self.steps2score[steps] = [key, ] + ddim_score\n            self.steps2schedule[steps] = ddim_schedule\n\n    def noise_scheduling_without_params(self):\n        \"\"\"\n        Search for the best noise scheduling hyperparameters: (alpha_param, beta_param)\n        \"\"\"\n        # Noise scheduling mode, given N\n        self.reverse_process = 'BDDM'\n        assert 'N' in vars(self.config).keys(), 'Error: N is undefined for BDDM!'\n        self.diff_params[\"N\"] = self.config.N\n        # Init search result dictionaries\n        self.steps2schedule, self.steps2score = {}, {}\n        search_bins = int(self.config.bddm_search_bins)\n        # Define search range of alpha_param\n        alpha_last = self.diff_params[\"alpha\"][-1].squeeze().item()\n        alpha_first = self.diff_params[\"alpha\"][0].squeeze().item()\n        alpha_diff = (alpha_first - alpha_last) / (search_bins + 1)\n        alpha_param_list = [alpha_last + alpha_diff * (i + 1) for i in range(search_bins)]\n        # Define search range of beta_param\n        beta_diff = 1. / (search_bins + 1)\n        beta_param_list = [beta_diff * (i + 1) for i in range(search_bins)]\n        # Search over beta_param and alpha_param, which takes O(search_bins^2) trials\n        for beta_param in beta_param_list:\n            for alpha_param in alpha_param_list:\n                if alpha_param > (1 - beta_param) ** 0.5:\n                    # Invalid range\n                    continue\n                # Update the scores and noise schedules with (alpha_param, beta_param)\n                self.noise_scheduling_with_params(alpha_param, beta_param)\n        # Lastly, re-draw the random starting point (x_hat_N) several times and keep the best schedule\n        noise_schedule_dir = os.path.join(self.exp_dir, 'noise_schedules')\n        os.makedirs(noise_schedule_dir, exist_ok=True)\n        steps_list = list(self.steps2score.keys())\n        for steps in steps_list:\n            log(\"-\"*80, self.config)\n            log(\"Select the best out of %d x_hat_N ~ N(0,I) for %d steps:\"%(\n                self.config.noise_scheduling_attempts, steps), self.config)\n            # Get current best pair\n            key = self.steps2score[steps][0]\n            # Get back the best (alpha_param, beta_param) pair for a given number of steps\n            alpha_param, beta_param = list(map(float, key.split(',')))\n            # Repeat K times for a given number of steps\n            for _ in range(self.config.noise_scheduling_attempts):\n                # Random +/- 5%\n                _alpha_param = alpha_param * (0.95 + np.random.rand() * 0.1)\n                _beta_param = beta_param * (0.95 + np.random.rand() * 0.1)\n                # Update the scores and noise schedules with (alpha_param, beta_param)\n                self.noise_scheduling_with_params(_alpha_param, _beta_param)\n        # Save the best searched noise schedules ({N}steps_PESQ{pesq}_STOI{stoi}.ns)\n        for steps in sorted(self.steps2score.keys(), reverse=True):\n            filepath = os.path.join(noise_schedule_dir, '%dsteps_PESQ%.2f_STOI%.3f.ns'%(\n      
          steps, self.steps2score[steps][1], self.steps2score[steps][2]))\n            torch.save(self.steps2schedule[steps], filepath)\n            log(\"Saved searched schedule: %s\" % filepath, self.config)\n\n    def assess(self, generated_audio, audio_key=None):\n        \"\"\"\n        Assess the generated audio using objective metrics: PESQ and STOI.\n            P.S. Users should first install pypesq and pystoi using pip\n\n        Parameters:\n            generated_audio (tensor): the generated audio to be assessed\n            audio_key (str):          the key of the respective audio\n        Returns:\n            pesq_score (float):       the PESQ score (the higher the better)\n            stoi_score (float):       the STOI score (the higher the better)\n        \"\"\"\n        est_audio = generated_audio.squeeze().cpu().numpy()\n        ref_audio = self.ref_audio.squeeze().cpu().numpy()\n        if est_audio.shape[-1] > ref_audio.shape[-1]:\n            est_audio = est_audio[..., :ref_audio.shape[-1]]\n        else:\n            ref_audio = ref_audio[..., :est_audio.shape[-1]]\n        # Compute STOI using PySTOI\n        # PySTOI (https://github.com/mpariente/pystoi)\n        stoi_score = stoi(ref_audio, est_audio, self.config.sampling_rate, extended=False)\n        # Resample audio to 16 kHz to compute PESQ using PyPESQ (supports only 8 kHz / 16 kHz)\n        # PyPESQ (https://github.com/vBaiCai/python-pesq)\n        if self.config.sampling_rate != 16000:\n            est_audio_16k = librosa.resample(est_audio, self.config.sampling_rate, 16000)\n            ref_audio_16k = librosa.resample(ref_audio, self.config.sampling_rate, 16000)\n            pesq_score = pesq(ref_audio_16k, est_audio_16k, 16000)\n        else:\n            pesq_score = pesq(ref_audio, est_audio, 16000)\n        # Log scores\n        log('\\t%sScores: PESQ = %.3f, STOI = %.5f'%(\n            '' if audio_key is None else audio_key+' ', pesq_score, stoi_score), self.config)\n        # Return scores: the higher the better\n        return [pesq_score, stoi_score]\n"
  },
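  {
    "path": "docs/examples/noise_schedule_sketch.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Sketch of the schedule post-processing done in Sampler.noise_scheduling\n#  (illustrative only; the betas and hyperparameters below are toy values,\n#  and this file's path is not part of the original sources).\n#\n########################################################################\n\nimport torch\n\nfrom bddm.utils.diffusion_utils import compute_diffusion_params\nfrom bddm.utils.diffusion_utils import map_noise_scale_to_time_step\n\n# Training-time linear schedule (toy choices for beta_0, beta_T, T)\ndiff_params = compute_diffusion_params(torch.linspace(1e-4, 0.02, 1000))\n\n# A toy searched schedule of betas, in ascending order as collected by BDDM\nb_infer = torch.tensor([1e-4, 1e-3, 1e-2, 5e-2, 2e-1, 5e-1])\na_infer = 1 - b_infer\ns_infer = b_infer.clone()\nfor n in range(1, len(b_infer)):\n    a_infer[n] *= a_infer[n-1]                           # cumulative product of (1 - beta)\n    s_infer[n] *= (1 - a_infer[n-1]) / (1 - a_infer[n])  # posterior variance weighting\na_infer, s_infer = a_infer.sqrt(), s_infer.sqrt()\n\n# Map each inferred noise scale back to a (fractional) training time step\nts_infer = [map_noise_scale_to_time_step(a.item(), diff_params['alpha'])\n            for a in a_infer]\nprint(ts_infer)\n"
  },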
  {
    "path": "bddm/trainer/__init__.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Trainer\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021 Tencent. All Rights Reserved\n#\n########################################################################\n\n\nfrom .trainer import Trainer\n"
  },
  {
    "path": "bddm/trainer/bmuf.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  BMUF Multi-GPU Training Method\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021 Tencent. All Rights Reserved\n#\n########################################################################\n\n\nimport os\nimport sys\nimport psutil\nimport torch\nimport torch.nn as nn\nimport torch.distributed as dist\n\n\nclass BmufTrainer(object):\n\n    def __init__(self, master_node, rank, world_size, model, block_momentum, block_lr):\n        \"\"\"\n        Basic BMUF Trainer Class, implements Nesterov Block Momentum\n\n        Parameters:\n            master_node (int):      master node index, zero in most cases\n            rank (int):             local rank, e.g., 0-7 if 8 GPUs are used\n            world_size (int):       total number of workers\n            model (nn.Module):      PyTorch model\n            block_momentum (float): block momentum value\n            block_lr (float):       block learning rate\n        \"\"\"\n        assert torch.cuda.is_available(), \"Distributed mode requires CUDA.\"\n        self.master_node = master_node  # By default, device 0 acts as the master node\n        self.model = model\n        self.rank = rank\n        self.world_size = world_size\n        self.block_momentum = block_momentum\n        self.block_lr = block_lr\n        dist.init_process_group(backend=\"nccl\", init_method=\"env://\")\n        param_vec = nn.utils.parameters_to_vector(model.parameters())\n        self.param = param_vec.data.clone()\n        dist.broadcast(tensor=self.param, src=self.master_node, async_op=False)\n        num_param = self.param.numel()\n        if self.rank == self.master_node:\n            self.delta_prev = torch.FloatTensor([0]*num_param).cuda(self.master_node)\n        else:\n            self.delta_prev = None\n            self._copy_vec_to_param(self.param)\n        # for finding the best model among different ranks\n        self.min_val_loss = torch.FloatTensor([float('inf')]).to(self.param.device)\n\n    def check_all_processes_running(self):\n        \"\"\"\n        Check whether all processes are healthy\n        \"\"\"\n        # N.B. this assumes the worker processes were spawned with consecutive PIDs\n        pid = os.getpid()\n        for p in range(pid-self.rank, pid-self.rank+self.world_size):\n            if not psutil.pid_exists(p):\n                sys.exit(-1)\n\n    def get_average_valid_loss(self, val_loss):\n        \"\"\"\n        Get the average validation loss through an all-reduce operation\n\n        Parameters:\n            val_loss (Tensor): calculated validation loss in each process\n        Returns:\n            save_or_not (bool): whether to save the current validated model or not\n        \"\"\"\n        save_or_not = False\n        dist.all_reduce(tensor=val_loss)\n        val_loss = val_loss / float(self.world_size)\n        # save the min valid loss among all gpus\n        if 'min_val_loss' not in self.__dict__ or val_loss < self.min_val_loss:\n            self.min_val_loss = val_loss.clone()\n            save_or_not = True\n        return save_or_not\n\n    def update_and_sync(self, val_loss=None):\n        \"\"\"\n        Update and synchronize block gradients\n\n        Parameters:\n            val_loss (tensor): calculated validation loss in each process\n        Returns:\n            save_or_not (bool): whether to save the current validated model or not\n        \"\"\"\n        
self.check_all_processes_running()\n        save_or_not = False\n        if val_loss is not None:\n            save_or_not = self.get_average_valid_loss(val_loss)\n\n        cur_param_vec = nn.utils.parameters_to_vector(self.model.parameters()).data\n        delta = self.param - cur_param_vec\n        # Gather block gradients into delta\n        dist.reduce(tensor=delta, dst=self.master_node)\n\n        # Check if model params are still healthy\n        if torch.isnan(delta).sum().item():\n            print('Found nan, exit!', flush=True)\n            sys.exit(-1)\n        if self.rank == self.master_node:\n            # Local rank is master node\n            delta = delta / float(self.world_size)\n            self.delta_prev = self.block_momentum * self.delta_prev + \\\n                              (self.block_lr * (1 - self.block_momentum) * delta)\n            self.param -= (1+self.block_momentum) * self.delta_prev\n        dist.broadcast(tensor=self.param, src=self.master_node, async_op=False)\n        self._copy_vec_to_param(self.param)\n        return save_or_not\n\n    def kill_all_processes(self):\n        \"\"\"\n        Stop training across all workers (called by Trainer on early stopping).\n            Minimal implementation: exiting this process makes\n            check_all_processes_running() fail in the remaining workers,\n            which then exit as well.\n        \"\"\"\n        sys.exit(0)\n\n    def _copy_vec_to_param(self, vec):\n        \"\"\"\n        Copy a vectorized array to the model parameters\n\n        Parameters:\n            vec (tensor): a single vector representing the parameters of a model.\n        \"\"\"\n        # Ensure vec of type Tensor\n        if not isinstance(vec, torch.Tensor):\n            raise TypeError('expected torch.Tensor, but got: {}'\n                            .format(torch.typename(vec)))\n        # Pointer for slicing the vector for each parameter\n        pointer = 0\n        for param in self.model.parameters():\n            # The length of the parameter\n            num_param = param.numel()\n            # Slice the vector, reshape it, and replace the old data of the parameter\n            param.data.copy_(vec[pointer:pointer + num_param]\n                             .view_as(param).data)\n            # Increment the pointer\n            pointer += num_param\n"
  },
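  {
    "path": "docs/examples/bmuf_update_sketch.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Toy illustration of the Nesterov block-momentum update performed in\n#  BmufTrainer.update_and_sync (illustrative only: no process group, a\n#  single fake worker, and made-up numbers; this file's path is not part\n#  of the original sources).\n#\n########################################################################\n\nimport torch\n\nblock_momentum, block_lr = 0.9, 1.0\nparam = torch.zeros(4)       # global model parameters\ndelta_prev = torch.zeros(4)  # smoothed block gradient\n\nfor step in range(3):\n    # Pretend a local optimizer moved the parameters on one worker\n    local_param = param - 0.1\n    # Block gradient: old global parameters minus the new local ones\n    delta = param - local_param\n    # Momentum smoothing, then a Nesterov-style (1 + momentum) look-ahead\n    delta_prev = block_momentum * delta_prev + block_lr * (1 - block_momentum) * delta\n    param = param - (1 + block_momentum) * delta_prev\n    print(step, param)\n"
  },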
  {
    "path": "bddm/trainer/ema.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  EMA Helper Class\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021 Tencent. All Rights Reserved\n#\n########################################################################\n\n\nimport torch.nn as nn\n\n\nclass EMAHelper(object):\n\n    def __init__(self, mu=0.999):\n        \"\"\"\n        Exponential Moving Average Training Helper Class\n\n        Parameters:\n            mu (float): decay rate\n        \"\"\"\n        self.mu = mu\n        self.shadow = {}\n\n    def register(self, module):\n        \"\"\"\n        Register module by copying all learnable parameters to self.shadow\n\n        Parameters:\n            module (nn.Module): model to be trained\n        \"\"\"\n        if isinstance(module, nn.DataParallel):\n            module = module.module\n        for name, param in module.named_parameters():\n            if param.requires_grad:\n                self.shadow[name] = param.data.clone()\n\n    def update(self, module):\n        \"\"\"\n        Update self.shadow using the module parameters\n\n        Parameters:\n            module (nn.Module): model in training\n        \"\"\"\n        if isinstance(module, nn.DataParallel):\n            module = module.module\n        for name, param in module.named_parameters():\n            if param.requires_grad:\n                self.shadow[name].data = (\n                    1. - self.mu) * param.data + self.mu * self.shadow[name].data\n\n    def ema(self, module):\n        \"\"\"\n        Copy self.shadow to the module parameters\n\n        Parameters:\n            module (nn.Module): model in training\n        \"\"\"\n        if isinstance(module, nn.DataParallel):\n            module = module.module\n        for name, param in module.named_parameters():\n            if param.requires_grad:\n                param.data.copy_(self.shadow[name].data)\n\n    def ema_copy(self, module):\n        \"\"\"\n        Initialize a new module using self.shadow as the parameters\n\n        Parameters:\n            module (nn.Module): model in training\n        \"\"\"\n        if isinstance(module, nn.DataParallel):\n            inner_module = module.module\n            module_copy = type(inner_module)(\n                inner_module.config).to(inner_module.config.device)\n            module_copy.load_state_dict(inner_module.state_dict())\n            module_copy = nn.DataParallel(module_copy)\n        else:\n            module_copy = type(module)(module.config).to(module.config.device)\n            module_copy.load_state_dict(module.state_dict())\n        self.ema(module_copy)\n        return module_copy\n\n    def state_dict(self):\n        \"\"\"\n        Get self.shadow as the state dict\n\n        Returns:\n            shadow (dict): state dict\n        \"\"\"\n        return self.shadow\n\n    def load_state_dict(self, state_dict):\n        \"\"\"\n        Load a state dict to self.shadow\n\n        Parameters:\n            state_dict (dict): state dict to be copied to self.shadow\n        \"\"\"\n        self.shadow = state_dict\n"
  },
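  {
    "path": "docs/examples/ema_usage_sketch.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Minimal usage sketch for EMAHelper (illustrative only: the toy model\n#  and this file's path are not part of the original sources).\n#\n########################################################################\n\nimport torch.nn as nn\n\nfrom bddm.trainer.ema import EMAHelper\n\nmodel = nn.Linear(4, 4)  # stand-in for a score/schedule network\nema = EMAHelper(mu=0.999)\nema.register(model)      # snapshot the initial parameters\n\n# Inside a training loop, call after each optimizer.step():\nema.update(model)        # shadow <- (1 - mu) * param + mu * shadow\n\n# At validation/checkpoint time, load the averaged weights:\nema.ema(model)           # overwrite model params with the EMA weights\n"
  },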
  {
    "path": "bddm/trainer/loss.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Implements Training Losses for BDDMs\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021 Tencent. All Rights Reserved\n#\n########################################################################\n\n\nimport torch\nimport torch.nn as nn\n\n\nclass ScoreLoss(nn.Module):\n\n    def __init__(self, config, diff_params):\n        \"\"\"\n        Score Loss Class, implements DDPM's simplified loss (see Eq. 5 in BDDM's paper)\n\n        Parameters:\n            config (namespace): BDDM Configuration\n            diff_params (dict): Dictionary that stores pre-computed diffusion parameters\n        \"\"\"\n        super().__init__()\n        self.config = config\n        self.num_steps = diff_params[\"T\"]\n        self.alpha = diff_params[\"alpha\"]\n\n    def forward(self, model, mels, audios):\n        \"\"\"\n        Compute the training loss for learning theta\n\n        Parameters:\n            model (nn.Module):   the score network\n            mels (tensor):       shape=(batch size, frames, spectrogram dim)\n            audios (tensor):     shape=(batch size, 1, length of audio)\n        Returns:\n            score loss (tensor): shape=(batch size,)\n        \"\"\"\n        batch_size = audios.size(0)\n        # Sample a random diffusion step for each utterance in the batch\n        ts = torch.randint(low=0, high=self.num_steps, size=(batch_size, 1, 1)).cuda()\n        noise_scales = self.alpha[ts]\n        z = torch.normal(0, 1, size=audios.shape).cuda()\n        # Diffuse the clean audio: x_t = alpha_t * x_0 + sqrt(1 - alpha_t^2) * z\n        noisy_audios = noise_scales * audios + (1 - noise_scales**2.).sqrt() * z\n        e = model((noisy_audios, mels, ts.view(batch_size, 1),))\n        # Use WaveGrad's L1Loss for speech generation\n        theta_loss = nn.L1Loss(reduction='none')(e, z[:, :, :e.size(-1)]).mean([1, 2])\n        return theta_loss\n\n\nclass StepLoss(nn.Module):\n\n    def __init__(self, config, diff_params):\n        \"\"\"\n        Step Loss Class, implements BDDM's step loss (see Eq. 
14 in BDDM's paper)\n\n        Parameters:\n            config (namespace): BDDM Configuration\n            diff_params (dict): Dictionary that stores pre-computed diffusion parameters\n        \"\"\"\n        super().__init__()\n        self.config = config\n        self.num_steps = diff_params[\"T\"]\n        self.alpha = diff_params[\"alpha\"]\n        self.tau = diff_params[\"tau\"]\n\n    def forward(self, model, mels, audios):\n        \"\"\"\n        Compute the training loss for learning phi\n\n        Parameters:\n            model (nn.Module):  the score network & the schedule network\n            mels (tensor):      shape=(batch size, frames, spectrogram dim)\n            audios (tensor):    shape=(batch size, 1, length of audio)\n        Returns:\n            step loss (tensor): shape=(batch size,)\n        \"\"\"\n        batch_size = audios.size(0)\n        # Sample steps such that both t and t + tau stay within range\n        ts = torch.randint(self.tau, self.num_steps-self.tau, size=(batch_size,)).cuda()\n        alpha_cur = self.alpha.index_select(0, ts).view(batch_size, 1, 1)\n        alpha_nxt = self.alpha.index_select(0, ts+self.tau).view(batch_size, 1, 1)\n        b_nxt = 1 - (alpha_nxt / alpha_cur)**2.\n        delta = (1 - alpha_cur**2.).sqrt()\n        z = torch.normal(0, 1, size=audios.shape).cuda()\n        # Diffuse the clean audio: x_t = alpha_t * x_0 + delta_t * z\n        noisy_audios = alpha_cur * audios + delta * z\n        e = model((noisy_audios, mels, ts.view(batch_size, 1),))\n        # Predict beta_hat, bounded by (b_nxt, delta^2)\n        beta_bounds = (b_nxt.view(batch_size, 1), delta.view(batch_size, 1)**2.)\n        b_hat = model.schedule_net(noisy_audios.squeeze(1), beta_bounds)\n        delta, b_hat, z, e = delta.squeeze(1), b_hat.squeeze(1), z.squeeze(1), e.squeeze(1)\n        # Eq. 14: weighted score-matching term, log-ratio term and constant term\n        phi_loss = delta**2. / (2. * (delta**2. - b_hat))\n        phi_loss = phi_loss * (z - b_hat / (delta**2.) * e).square()\n        phi_loss = phi_loss + torch.log(1e-8 + delta**2. / (b_hat + 1e-8)) / 4.\n        phi_loss = phi_loss.sum(-1) + (b_hat / delta**2 - 1) / 2. * audios.size(-1)\n        return phi_loss\n"
  },
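  {
    "path": "docs/examples/score_loss_diffusion_sketch.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Sketch of the forward diffusion that ScoreLoss trains against:\n#  x_t = alpha_t * x_0 + sqrt(1 - alpha_t^2) * z (illustrative only;\n#  the schedule values and this file's path are toy assumptions).\n#\n########################################################################\n\nimport torch\n\nT = 1000\nbeta = torch.linspace(1e-4, 0.02, T)\n# alpha here is the cumulative noise-scale product, as in diffusion_utils\nalpha = torch.sqrt(torch.cumprod(1 - beta, dim=0))\n\nx0 = torch.randn(2, 1, 16000)             # clean audio, shape [B, 1, L]\nts = torch.randint(0, T, size=(2, 1, 1))  # one random step per utterance\nz = torch.randn_like(x0)                  # the target of the score network\nxt = alpha[ts] * x0 + (1 - alpha[ts] ** 2).sqrt() * z\nprint(xt.shape)  # torch.Size([2, 1, 16000])\n"
  },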
  {
    "path": "bddm/trainer/trainer.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  BDDM Trainer (Supports Multi-GPU Training using the BMUF method)\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021 Tencent. All Rights Reserved\n#\n########################################################################\n\n\nfrom __future__ import absolute_import\n\nimport os\nimport time\nimport copy\nimport torch\n\nfrom bddm.sampler import Sampler\nfrom bddm.trainer.ema import EMAHelper\nfrom bddm.trainer.bmuf import BmufTrainer\nfrom bddm.trainer.loss import ScoreLoss, StepLoss\nfrom bddm.utils.log_utils import log\nfrom bddm.utils.check_utils import check_score_network\nfrom bddm.utils.diffusion_utils import compute_diffusion_params\nfrom bddm.models import get_score_network, get_schedule_network\nfrom bddm.loader.dataset import create_train_and_valid_dataloader\n\n\nclass Trainer(object):\n\n    def __init__(self, config):\n        \"\"\"\n        Trainer Class, implements a general multi-GPU training framework in PyTorch\n\n        Parameters:\n            config (namespace): BDDM Configuration\n        \"\"\"\n        self.config = config\n        self.exp_dir = config.exp_dir\n        self.clip = config.grad_clip\n        self.load = config.load\n        self.model = get_score_network(config).cuda()\n        # Define training target\n        if self.config.resume_training and self.load != '':\n            self.training_target = 'score_nets'\n        else:\n            score_net_trained, score_net_path = check_score_network(config)\n            self.training_target = 'schedule_nets' if score_net_trained else 'score_nets'\n        # Enable anomaly detection (helps debugging; adds runtime overhead)\n        torch.autograd.set_detect_anomaly(True)\n        # Initialize diffusion parameters using a pre-specified linear schedule\n        noise_schedule = torch.linspace(config.beta_0, config.beta_T, config.T).cuda()\n        self.diff_params = compute_diffusion_params(noise_schedule)\n        if self.training_target == 'schedule_nets':\n            if self.load == '':\n                self.load = score_net_path\n            self.diff_params[\"tau\"] = config.tau\n            for p in self.model.parameters():\n                p.requires_grad = False\n            # Define the schedule net as a sub-module of the score net for convenience\n            self.model.schedule_net = get_schedule_network(config).cuda()\n            self.loss_func = StepLoss(config, self.diff_params)\n            self.n_training_steps = config.schedule_net_training_steps\n            # In practice using batch size = 1 would lead to much lower step loss\n            config.batch_size = 1\n            model_to_train = self.model.schedule_net\n        else:\n            self.loss_func = ScoreLoss(config, self.diff_params)\n            self.n_training_steps = config.score_net_training_steps\n            model_to_train = self.model\n        # Define optimizer\n        self.optimizer = torch.optim.AdamW(model_to_train.parameters(),\n            lr=config.lr, weight_decay=config.weight_decay, amsgrad=True)\n        # Define EMA training helper\n        self.ema_helper = EMAHelper(mu=config.ema_rate)\n        self.ema_helper.register(model_to_train)\n        # Initialize BMUF trainer\n        self.world_size = int(os.environ['WORLD_SIZE'])\n        self.bmuf_trainer = BmufTrainer(0, config.local_rank, self.world_size,\n            model_to_train, config.bmuf_block_momentum, config.bmuf_block_lr)\n        self.sync_period = 
config.bmuf_sync_period\n        self.device = torch.device(\"cuda:{}\".format(config.local_rank))\n        self.local_rank = config.local_rank\n        # Get data loaders\n        self.tr_loader, self.vl_loader = create_train_and_valid_dataloader(config)\n        if self.training_target == 'score_nets':\n            # Define a Sampler for quality validation (must be created after the BMUF trainer)\n            self.valid_sampler = Sampler(config)\n            self.valid_sampler.model = get_score_network(config).cuda()\n        self.reset()\n\n    def reset(self):\n        \"\"\"\n        Reset training environment\n        \"\"\"\n        self.tr_loss, self.vl_loss = [], []\n        self.training_step = 0\n        if self.load != '':\n            package = torch.load(self.load, map_location=lambda storage, loc: storage.cuda())\n            init_state_dict = self.model.state_dict()\n            mismatch_params = set()\n            # Remove the checkpoint params that are not found in the model\n            for key in list(package['model_state_dict'].keys()):\n                if key not in init_state_dict.keys():\n                    del package['model_state_dict'][key]\n                    log('ignored: %s in checkpoint not found in model'%key, self.config)\n                elif package['model_state_dict'][key].size() != init_state_dict[key].size():\n                    log(package['model_state_dict'][key].size(), self.config)\n                    log(init_state_dict[key].size(), self.config)\n                    log('ignored: %s in checkpoint size mismatched'%key, self.config)\n                    del package['model_state_dict'][key]\n            # Replace the ignored checkpoint params by the init params\n            for key in list(init_state_dict.keys()):\n                if key not in package['model_state_dict'].keys():\n                    mismatch_params.add(key)\n                    log('ignored: %s in model not found in checkpoint'%key, self.config)\n                    package['model_state_dict'][key] = init_state_dict[key]\n            self.model.load_state_dict(package['model_state_dict'])\n            if self.config.resume_training and len(mismatch_params) == 0:\n                # Load steps to resume training\n                if self.training_target == 'score_nets' and 'score_net_training_step' in package:\n                    self.training_step = package['score_net_training_step']\n                elif self.training_target == 'schedule_nets' and 'schedule_net_training_step' in package:\n                    self.training_step = package['schedule_net_training_step']\n            if self.config.freeze_checkpoint_params and len(mismatch_params) > 0:\n                # Only update new parameters defined in model\n                for key, param in self.model.named_parameters():\n                    if key not in mismatch_params:\n                        param.requires_grad = False\n            log('Loaded checkpoint %s' % self.load, self.config)\n        # Create save folder (including the per-target checkpoint sub-folder)\n        os.makedirs(os.path.join(self.exp_dir, self.training_target), exist_ok=True)\n        self.prev_val_loss, self.min_val_loss = float(\"inf\"), float(\"inf\")\n        self.val_no_impv, self.halving = 0, 0\n\n    def train(self):\n        \"\"\"\n        Start the main training process\n        \"\"\"\n        best_state = copy.deepcopy(self.model.state_dict())\n        while self.training_step < self.n_training_steps:\n            # Train one epoch\n            log(\"Start 
training %s from step %d .......\"%(\n                self.training_target, self.training_step), self.config)\n            self.bmuf_trainer.check_all_processes_running()\n            self.model.train()\n            start = time.time()\n            tr_avg_loss = self._run_one_epoch(validate=False)\n            log('-' * 85, self.config)\n            log('Train Summary | Step {} | Time {:.2f}s | Train Loss {:.5f}'.format(\n                self.training_step, time.time()-start, tr_avg_loss), self.config)\n            log('-' * 85, self.config)\n            # Start validation\n            log('Start validation ......', self.config)\n            self.model.eval()\n            start = time.time()\n            with torch.no_grad():\n                val_loss = self._run_one_epoch(validate=True)\n            log('-' * 85, self.config)\n            log('Valid Summary | Step {} | Time {:.2f}s | Valid Loss {:.5f}'.format(\n                self.training_step, time.time()-start, val_loss.item()), self.config)\n            log('-' * 85, self.config)\n            save_or_not = self.bmuf_trainer.update_and_sync(val_loss=val_loss)\n            if save_or_not:\n                self.val_no_impv = 0\n                if self.bmuf_trainer.rank == self.bmuf_trainer.master_node:\n                    model_serialized = self.serialize()\n                    file_path = os.path.join(self.exp_dir, self.training_target,\n                                             '%d.pkl' % self.training_step)\n                    torch.save(model_serialized, file_path)\n                    log(\"Found better model, saved to %s\" % file_path, self.config)\n            if val_loss >= self.min_val_loss:\n                # LR decays\n                self.val_no_impv += 1\n                if self.val_no_impv == self.config.patience:\n                    log(\"No improvement for %d epochs, early stopped!\"%(\n                        self.config.patience), self.config)\n                    self.bmuf_trainer.kill_all_processes()\n                    break\n                if self.val_no_impv >= self.config.patience // 2:\n                    self.model.load_state_dict(best_state)\n            else:\n                self.val_no_impv = 0\n                self.min_val_loss = val_loss\n                best_state = copy.deepcopy(self.ema_helper.state_dict())\n\n    def _run_one_epoch(self, validate=False):\n        \"\"\"\n        Run one epoch\n\n        Parameters:\n            validate (bool):      whether to run a validation epoch or a training epoch\n        Returns:\n            average loss (float): the average training/validation loss\n        \"\"\"\n        start = time.time()\n        total_loss, total_cnt = 0, 0\n        if validate and self.training_target == 'score_nets':\n            # Use EMA state dict for validation\n            self.valid_sampler.model.load_state_dict(self.ema_helper.state_dict())\n            # To validate score nets, we use Sampler to test sample quality instead of loss\n            generated_audio, _ = self.valid_sampler.sampling()\n            # Compute objective scores (score = PESQ)\n            quality_score = self.valid_sampler.assess(generated_audio)[0]\n            return -torch.FloatTensor([quality_score])[0].cuda()\n        data_loader = self.vl_loader if validate else self.tr_loader\n        data_loader.dataset.reset()\n        start_step = self.training_step\n        for i, batch in enumerate(data_loader):\n            mels, audios = list(map(lambda x: x.cuda(), batch))\n            loss = 
self.loss_func(self.model, mels, audios)\n            total_loss += loss.detach().sum()\n            total_cnt += len(loss)\n            avg_loss = loss.mean()\n            if not validate:\n                self.optimizer.zero_grad()\n                avg_loss.backward()\n                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)\n                self.optimizer.step()\n                # Apply block momentum and sync parameters\n                self.bmuf_trainer.update_and_sync()\n                self.bmuf_trainer.check_all_processes_running()\n                # n_gpus * batch_size\n                self.training_step += len(loss) * self.bmuf_trainer.world_size\n                if i % self.config.log_period == 0:\n                    log('Train Step {} | Avg. Loss {:.5f} | New Loss {:.5f} | {:.2f}s/step'.format(\n                        self.training_step, total_loss / total_cnt, avg_loss,\n                        (time.time() - start) / (self.training_step - start_step)), self.config)\n                if self.training_target == 'schedule_nets':\n                    self.ema_helper.update(self.model.schedule_net)\n                else:\n                    self.ema_helper.update(self.model)\n                if self.training_step >= self.n_training_steps or\\\n                        max(i, self.training_step - start_step) >= self.config.steps_per_epoch:\n                    # Release grad memory\n                    self.optimizer.zero_grad()\n                    return total_loss / total_cnt\n            else:\n                if i % self.config.log_period == 0:\n                    log('Valid Step {} | Avg. Loss {:.5f} | New Loss {:.5f} | {:.2f}s/step'.format(\n                        i + 1, total_loss/total_cnt, avg_loss,\n                        (time.time() - start) / (i + 1)), self.config)\n        if not validate:\n            # Release grad memory\n            self.optimizer.zero_grad()\n        torch.cuda.empty_cache()\n        return total_loss / total_cnt\n\n    def serialize(self):\n        \"\"\"\n        Pack the model and configurations into a dictionary\n\n        Returns:\n            package (dict): the serialized package to be saved\n        \"\"\"\n        if self.training_target == 'schedule_nets':\n            model_state = copy.deepcopy(self.model.state_dict())\n            ema_state = copy.deepcopy(self.ema_helper.state_dict())\n            for p in self.ema_helper.state_dict():\n                model_state['schedule_net.'+p] =  ema_state[p]\n        else:\n            model_state = copy.deepcopy(self.ema_helper.state_dict())\n        if self.config.save_fp16:\n            for p in model_state:\n                model_state[p] = model_state[p].half()\n        package = {\n            # hyper-parameter\n            'config': self.config,\n            # state\n            'model_state_dict': model_state\n        }\n        if self.training_target == 'score_nets':\n            package['score_net_training_step'] = self.training_step\n            package['schedule_net_training_step'] = 0\n        else:\n            package['score_net_training_step'] = self.config.score_net_training_steps\n            package['schedule_net_training_step'] = self.training_step\n        return package\n"
  },
  {
    "path": "bddm/utils/check_utils.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Check Utils: Find Checkpoints\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021Tencent. All Rights Reserved\n#\n########################################################################\n\n\nimport os\nimport glob\nimport torch\n\n\ndef check_score_network(config):\n    \"\"\"\n    Check if the score network is trained by searching the ${exp_dir}/score_nets\n\n    Parameters:\n        config (namespace): the configuration given by the user\n    Returns:\n        result (bool):      a boolean to determine if the score network is trained\n        path (str):         the path to the score network checkpoint if trained\n    \"\"\"\n    if config.load != '':\n        ckpt = torch.load(config.load)\n        if 'score_net_training_step' in ckpt.keys():\n            if ckpt['score_net_training_step'] >= config.score_net_training_steps:\n                return True, config.load\n        else:\n            # We suppose that an external checkpoint is already well-trained\n            return True, config.load\n    max_training_steps = 0\n    for path in glob.glob(os.path.join(config.exp_dir, 'score_nets', '[0-9]*.pkl')):\n        n_training_steps = int(path.split('/')[-1].split('.')[0])\n        max_training_steps = max(max_training_steps, n_training_steps)\n    if max_training_steps == 0:\n        return False, None\n    load_path = os.path.join(config.exp_dir, 'score_nets', '%d.pkl'%(max_training_steps))\n    return True, load_path\n\n\ndef check_schedule_network(config):\n    \"\"\"\n    Check if the schedule network is trained by searching the ${exp_dir}/schedule_nets\n\n    Parameters:\n        config (namespace): the configuration given by the user\n    Returns:\n        result (bool):      a boolean to determine if the schedule network is trained\n        path (str):         the path to the BDDM checkpoint if trained\n    \"\"\"\n    if config.load != '':\n        ckpt = torch.load(config.load)\n        if 'schedule_net_training_step' in ckpt.keys():\n            return True, config.load\n        else:\n            return False, config.load\n    max_training_steps = 0\n    for path in glob.glob(os.path.join(config.exp_dir, 'schedule_nets', '[0-9]*.pkl')):\n        n_training_steps = int(path.split('/')[-1].split('.')[0])\n        max_training_steps = max(max_training_steps, n_training_steps)\n    if max_training_steps == 0:\n        # We suppose that an external checkpoint only contains a well-trained score network\n        return False, config.load\n    schedule_net_load_path = os.path.join(\n        config.exp_dir, 'schedule_nets', '%d.pkl'%(max_training_steps))\n    return True, schedule_net_load_path\n"
  },
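  {
    "path": "examples/check_utils_demo.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Check Utils Demo (hypothetical usage sketch)\n#\n#  NOTE: This file is an illustrative addition, not part of the\n#  original release; its path and the demo exp_dir below are made up.\n#  It shows how bddm/utils/check_utils.py is typically consumed.\n#\n########################################################################\n\nimport sys\nsys.path.append('../')\n\nimport argparse\n\nfrom bddm.utils.check_utils import check_score_network, check_schedule_network\n\n# A minimal config namespace carrying only the fields the helpers read\nconfig = argparse.Namespace(\n    load='',                           # no external checkpoint given\n    exp_dir='./exp/demo',              # searched for score_nets/ and schedule_nets/\n    score_net_training_steps=5000000,  # threshold, as in egs/*/conf.yml\n)\n\n# (False, None) until some ${exp_dir}/score_nets/[0-9]*.pkl exists;\n# afterwards (True, path-to-the-checkpoint-with-the-most-training-steps)\ntrained, score_path = check_score_network(config)\nprint('score network trained: %s (%s)' % (trained, score_path))\n\ntrained, schedule_path = check_schedule_network(config)\nprint('schedule network trained: %s (%s)' % (trained, schedule_path))\n"
  },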
  {
    "path": "bddm/utils/diffusion_utils.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Diffusion Utils: Pre-compute Variables\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021Tencent. All Rights Reserved\n#\n########################################################################\n\n\nimport torch\n\n\ndef compute_diffusion_params(beta):\n    \"\"\"\n    Compute the diffusion parameters defined in BDDMs\n\n    Parameters:\n        beta (tensor):      the beta schedule\n    Returns:\n        diff_params (dict): a dictionary of diffusion hyperparameters including:\n            T (int), beta/alpha/sigma (torch.tensor on cpu, shape=(T, ))\n            These cpu tensors are changed to cuda tensors on each individual gpu\n    \"\"\"\n\n    alpha = 1 - beta\n    sigma = beta + 0\n    for t in range(1, len(beta)):\n        alpha[t] *= alpha[t-1]\n        sigma[t] *= (1-alpha[t-1]) / (1-alpha[t])\n    alpha = torch.sqrt(alpha)\n    sigma = torch.sqrt(sigma)\n    diff_params = {\"T\": len(beta), \"beta\": beta, \"alpha\": alpha, \"sigma\": sigma}\n    return diff_params\n\n\ndef map_noise_scale_to_time_step(alpha_infer, alpha):\n    \"\"\"\n    Map an alpha_infer to an approx. time step in the alpha tensor.\n        (Modified from Algorithm 3 Fast Sampling in DiffWave)\n\n    Parameters:\n        alpha_infer (float): noise scale at time `n` for inference\n        alpha (tensor):      noise scales used in training, shape=(T, )\n    Returns:\n        t (float):           approximated time step in alpha tensor\n    \"\"\"\n\n    if alpha_infer < alpha[-1]:\n        return len(alpha) - 1\n    if alpha_infer > alpha[0]:\n        return 0\n    for t in range(len(alpha) - 1):\n        if alpha[t+1] <= alpha_infer <= alpha[t]:\n            step_diff = alpha[t] - alpha_infer\n            step_diff /= alpha[t] - alpha[t+1]\n            return t + step_diff.item()\n    return -1\n"
  },
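  {
    "path": "examples/diffusion_params_demo.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Diffusion Utils Demo (hypothetical usage sketch)\n#\n#  NOTE: This file is an illustrative addition, not part of the\n#  original release. It feeds a beta schedule built from the\n#  T/beta_0/beta_T values in egs/*/conf.yml to diffusion_utils;\n#  the linear form of the schedule is an assumption of this demo.\n#\n########################################################################\n\nimport sys\nsys.path.append('../')\n\nimport torch\n\nfrom bddm.utils.diffusion_utils import (\n    compute_diffusion_params, map_noise_scale_to_time_step)\n\n# Linear schedule over T steps (T=200, beta_0=0.0001, beta_T=0.02)\nT, beta_0, beta_T = 200, 0.0001, 0.02\nbeta = torch.linspace(beta_0, beta_T, T)\n\ndiff_params = compute_diffusion_params(beta)\n# alpha[t] = sqrt(prod_{s<=t} (1 - beta_s)), monotonically decreasing\nalpha = diff_params['alpha']\n# sigma[t]**2 = beta_t * (1 - alpha[t-1]**2) / (1 - alpha[t]**2) for t >= 1\nsigma = diff_params['sigma']\n\n# Map an inference-time noise scale back to a fractional training step;\n# a value halfway between alpha[11] and alpha[10] maps to t = 10.5\nalpha_infer = 0.5 * (alpha[10] + alpha[11])\nt = map_noise_scale_to_time_step(alpha_infer, alpha)\nprint('noise scale %.5f ~ training step %.3f' % (alpha_infer.item(), t))\n"
  },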
  {
    "path": "bddm/utils/log_utils.py",
    "content": "#!/bin/env python\n# -*- coding: utf-8 -*-\n########################################################################\n#\n#  Log Utils: Print Log with Time/PID\n#\n#  Author: Max W. Y. Lam (maxwylam@tencent.com)\n#  Copyright (c) 2021Tencent. All Rights Reserved\n#\n########################################################################\n\nimport os\nimport sys\nimport time\n\n\ndef ctime():\n    \"\"\"\n    Get time now\n\n    Returns:\n        time_string (str): current time in string\n    \"\"\"\n    return time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime(time.time()))\n\n\ndef head():\n    \"\"\"\n    Get header for logging: time and PID of the current process\n\n    Returns:\n        header_string (str): the header string for logging\n    \"\"\"\n    return \"%s %d\" % (ctime(), os.getpid())\n\n\ndef log(msg, config):\n    \"\"\"\n    Save log to the device[id].log & print device0.log to STDOUT\n\n    Parameters:\n        msg (sting):        message to be logged\n        config (namespace): BDDM Configuration\n    \"\"\"\n    if config.local_rank == 0:\n        sys.stdout.write(\"[%s] %s\\n\" % (head(), msg))\n        sys.stdout.flush()\n    os.makedirs(config.exp_dir, exist_ok=True)\n    with open(os.path.join(config.exp_dir, 'device%d.log'%(config.local_rank)), 'a') as f:\n        f.write(\"[%s] %s\\n\" % (head(), msg))\n        f.flush()\n"
  },
  {
    "path": "egs/lj/conf.yml",
    "content": "# General config\nexp_dir: './exp'\nseed: 0\nload: 'DiffWave_GALR.pkl'\n# Generation config\nsampling_noise_schedule: './noise_schedules/12steps_PESQ4.07_STOI0.978.ns'\ngen_data_dir: \"./demo_case\"\n#gen_data_dir: \"./test_wavs\"\nuse_ddim_steps: 0 # options: 0-DDPM, 1-DDIM, M-DDIM M steps (when noise schedule is not given)\n# Diffusion config\nT: 200\nbeta_0: 0.0001\nbeta_T: 0.02\ntau: 66\n# Noise scheduling config\nN: 12\nbddm_search_bins: 9\nnoise_scheduling_attempts: 10\n# Score network config\nscore_net: 'DiffWave'\nnum_res_layers: 30\nin_channels: 1\nres_channels: 128\nskip_channels: 128\ndilation_cycle: 10\nout_channels: 1\ndiffusion_step_embed_dim_in: 128\ndiffusion_step_embed_dim_mid: 512\ndiffusion_step_embed_dim_out: 512\n# Schedule network config\nschedule_net: 'GALR'\nblocks: 1\nhidden_dim: 128\ninput_dim: 128\nwindow_length: 8\nsegment_size: 64\n# Trainer config\nresume_training: False\nsave_fp16: True\nsteps_per_epoch: 1000\nscore_net_training_steps: 5000000\nschedule_net_training_steps: 10000\nlr: 0.00001\nfreeze_checkpoint_params: False # only used when provided load path\nsave_generated_dir: 'gen_wav'\nbatch_size: 1\npatience: 100\ngrad_clip: 5.0\nweight_decay: 0.00001\nlog_period: 1\nbmuf_block_momentum: 0.5\nbmuf_block_lr: 0.7\nbmuf_sync_period: 2\nema_rate: 0.999\n# Data config\ntrain_data_dir: \"LJSpeech-1.1/train_wavs\"\nvalid_data_dir: \"LJSpeech-1.1/val_wavs\"\nn_worker: 10\nsampling_rate: 22050\nseg_len: 16000\nfil_len: 1024\nhop_len: 256\nwin_len: 1024\nmel_fmin: 0.0\nmel_fmax: 8000.0\n"
  },
  {
    "path": "egs/lj/eval.list",
    "content": "LJ001-0001.wav\nLJ001-0047.wav\nLJ003-0140.wav\nLJ003-0145.wav\nLJ003-0235.wav\nLJ003-0240.wav\nLJ004-0195.wav\nLJ005-0059.wav\nLJ005-0206.wav\nLJ005-0269.wav\nLJ006-0141.wav\nLJ006-0301.wav\nLJ007-0104.wav\nLJ007-0106.wav\nLJ007-0192.wav\nLJ007-0208.wav\nLJ008-0043.wav\nLJ008-0265.wav\nLJ009-0071.wav\nLJ009-0190.wav\nLJ010-0056.wav\nLJ010-0082.wav\nLJ011-0053.wav\nLJ011-0062.wav\nLJ011-0221.wav\nLJ012-0005.wav\nLJ012-0015.wav\nLJ013-0004.wav\nLJ013-0060.wav\nLJ015-0063.wav\nLJ015-0254.wav\nLJ016-0070.wav\nLJ016-0318.wav\nLJ017-0049.wav\nLJ017-0251.wav\nLJ018-0029.wav\nLJ018-0071.wav\nLJ018-0105.wav\nLJ019-0329.wav\nLJ019-0350.wav\nLJ019-0358.wav\nLJ020-0086.wav\nLJ020-0102.wav\nLJ022-0164.wav\nLJ022-0185.wav\nLJ023-0117.wav\nLJ024-0103.wav\nLJ025-0143.wav\nLJ026-0030.wav\nLJ027-0020.wav\nLJ028-0130.wav\nLJ028-0155.wav\nLJ028-0302.wav\nLJ028-0432.wav\nLJ028-0465.wav\nLJ028-0497.wav\nLJ028-0499.wav\nLJ029-0018.wav\nLJ030-0088.wav\nLJ031-0072.wav\nLJ032-0010.wav\nLJ032-0057.wav\nLJ035-0061.wav\nLJ035-0124.wav\nLJ035-0200.wav\nLJ036-0093.wav\nLJ037-0061.wav\nLJ037-0186.wav\nLJ037-0195.wav\nLJ037-0237.wav\nLJ038-0008.wav\nLJ038-0051.wav\nLJ038-0092.wav\nLJ039-0072.wav\nLJ040-0210.wav\nLJ040-0216.wav\nLJ041-0075.wav\nLJ042-0092.wav\nLJ042-0159.wav\nLJ042-0181.wav\nLJ042-0224.wav\nLJ042-0248.wav\nLJ043-0026.wav\nLJ043-0150.wav\nLJ045-0123.wav\nLJ045-0128.wav\nLJ045-0147.wav\nLJ045-0216.wav\nLJ046-0026.wav\nLJ046-0043.wav\nLJ046-0162.wav\nLJ046-0186.wav\nLJ047-0058.wav\nLJ047-0114.wav\nLJ047-0131.wav\nLJ048-0022.wav\nLJ049-0155.wav\nLJ049-0165.wav\nLJ050-0043.wav\nLJ050-0134.wav\n"
  },
  {
    "path": "egs/lj/generate.sh",
    "content": "#!/bin/bash\n\n## example usage (single GPU): sh generate.sh 0 conf.yml\n\ncuda=$1\n\nCUDA_VISIBLE_DEVICES=$cuda python3 main.py \\\n\t--command generate --config $2\n"
  },
  {
    "path": "egs/lj/main.py",
    "content": "import os\nimport sys\nsys.path.append('../../')\nimport json\nimport shutil\nimport hashlib\nimport argparse\nimport numpy as np\nimport yaml\n\nimport torch\ntorch.backends.cudnn.enabled = True\ntorch.backends.cudnn.benchmark = True\n\nfrom bddm import trainer, sampler\nfrom bddm.utils.log_utils import log\n\n\ndef dict_hash_5char(dictionary):\n    ''' Map a unique dictionary into a 5-character string '''\n    dhash = hashlib.md5()\n    encoded = json.dumps(dictionary, sort_keys=True).encode()\n    dhash.update(encoded)\n    return dhash.hexdigest()[:5]\n\n\ndef start_exp(config, config_hash):\n    ''' Create experiment directory or set it to an existing directory '''\n    if config.load != '' and '_nets' in config.load:\n        config.exp_dir = '/'.join(config.load.split('/')[:-2])\n    else:\n        config.exp_dir += '/%s-%s_conf-hash-%s' % (\n            config.score_net, config.schedule_net, config_hash)\n    if config.local_rank != 0:\n        return\n    log('Experiment directory: %s' % (config.exp_dir), config)\n    # Backup the config file\n    shutil.copyfile(config.config, os.path.join(config.exp_dir, 'conf.yml'))\n    # Create a backup scripts sub-folder\n    os.makedirs(os.path.join(config.exp_dir, 'backup_scripts'), exist_ok=True)\n    # Backup all .py files under bddm/\n    backup_files = []\n    for root, _, files in os.walk(\"../../\"):\n        if 'egs' in root:\n            continue\n        for f in files:\n            if f.endswith(\".py\"):\n                backup_files.append(os.path.join(root, f))\n    for src_file in backup_files:\n        basename = src_file\n        while '../' in basename:\n            basename = basename.replace('../', '')\n        basename = basename.replace('./', '')\n        dst_file = os.path.join(config.exp_dir, 'backup_scripts', basename)\n        dst_dir = '/'.join(dst_file.split('/')[:-1])\n        if not os.path.exists(dst_dir):\n            os.makedirs(dst_dir)\n        shutil.copyfile(src_file, dst_file)\n    # Prepare sub-folders for saving model checkpoints\n    os.makedirs(os.path.join(config.exp_dir, 'score_nets'), exist_ok=True)\n    os.makedirs(os.path.join(config.exp_dir, 'schedule_nets'), exist_ok=True)\n\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser(description='Bilateral Denoising Diffusion Models')\n    parser.add_argument('--command',\n                        type=str,\n                        default='train',\n                        help='available commands: train | search | generate')\n    parser.add_argument('--config',\n                        '-c',\n                        type=str,\n                        default='conf.yml',\n                        help='config .yml path')\n    parser.add_argument('--local_rank',\n                        type=int,\n                        default=0,\n                        help='process device ID for multi-GPU training')\n\n    arg_config = parser.parse_args()\n\n    # Parse yaml and define configurations\n    config = arg_config.__dict__\n    with open(arg_config.config) as f:\n        yaml_config = yaml.safe_load(f)\n    HASH = dict_hash_5char(yaml_config)\n    for key in yaml_config:\n        config[key] = yaml_config[key]\n    config = argparse.Namespace(**config)\n\n    # Set random seed for reproducible results\n    np.random.seed(config.seed)\n    torch.manual_seed(config.seed)\n    torch.cuda.manual_seed_all(config.seed)\n    torch.cuda.set_device(config.local_rank)\n\n    # Check if the command is valid or not\n    commands = 
['train', 'schedule', 'generate']\n    assert config.command in commands, 'Error: %s command not found.'%(config.command)\n\n    # Create/retrieve exp dir\n    start_exp(config, HASH)\n    log('Argv: %s' % (' '.join(sys.argv)), config)\n\n    try:\n        if config.command == 'train':\n            # Create Trainer for training\n            trainer = trainer.Trainer(config)\n            trainer.train()\n        elif config.command == 'schedule':\n            # Create Sampler for noise scheduling\n            sampler = sampler.Sampler(config)\n            sampler.noise_scheduling_without_params()\n        elif config.command == 'generate':\n            # Create Sampler for generation\n            # NOTE: Remember to define \"gen_data_dir\" in conf.yml before data generation\n            sampler = sampler.Sampler(config)\n            sampler.generate()\n        log('-' * 80, config)\n\n    except KeyboardInterrupt:\n        log('-' * 80, config)\n        log('Exiting early', config)\n"
  },
  {
    "path": "egs/lj/schedule.sh",
    "content": "#!/bin/bash\n\n## example usage (single GPU): sh schedule.sh 0 conf.yml\n\ncuda=$1\n\nCUDA_VISIBLE_DEVICES=$cuda python3 main.py \\\n\t--command schedule --config $2\n"
  },
  {
    "path": "egs/lj/train.sh",
    "content": "#!/bin/bash\n\n## example usage: sh train.sh 0,1,2,3 conf.yml\n\ncuda=$1\ncomma=${cuda//[^,]}\nnproc=$((${#comma}+1))\n\nCUDA_VISIBLE_DEVICES=$cuda python3 -m torch.distributed.launch \\\n\t--nproc_per_node $nproc --master_port $RANDOM main.py \\\n\t--command train --config $2\n"
  },
  {
    "path": "egs/lj/valid.list",
    "content": "LJ001-0023.wav\nLJ001-0084.wav\nLJ001-0144.wav\nLJ001-0163.wav\nLJ001-0179.wav\nLJ002-0042.wav\nLJ002-0140.wav\nLJ002-0201.wav\nLJ002-0245.wav\nLJ003-0004.wav\nLJ003-0009.wav\nLJ003-0143.wav\nLJ003-0227.wav\nLJ003-0305.wav\nLJ003-0316.wav\nLJ003-0336.wav\nLJ004-0059.wav\nLJ004-0098.wav\nLJ004-0130.wav\nLJ004-0140.wav\nLJ004-0173.wav\nLJ004-0199.wav\nLJ005-0071.wav\nLJ005-0108.wav\nLJ005-0226.wav\nLJ006-0053.wav\nLJ006-0055.wav\nLJ006-0081.wav\nLJ006-0099.wav\nLJ006-0103.wav\nLJ006-0107.wav\nLJ006-0152.wav\nLJ007-0169.wav\nLJ007-0232.wav\nLJ008-0007.wav\nLJ008-0053.wav\nLJ008-0110.wav\nLJ008-0200.wav\nLJ008-0263.wav\nLJ008-0280.wav\nLJ008-0281.wav\nLJ009-0068.wav\nLJ009-0158.wav\nLJ009-0179.wav\nLJ009-0184.wav\nLJ009-0185.wav\nLJ009-0234.wav\nLJ009-0235.wav\nLJ009-0304.wav\nLJ010-0011.wav\nLJ010-0057.wav\nLJ010-0106.wav\nLJ010-0191.wav\nLJ010-0282.wav\nLJ010-0314.wav\nLJ011-0064.wav\nLJ011-0076.wav\nLJ011-0130.wav\nLJ011-0144.wav\nLJ011-0249.wav\nLJ011-0275.wav\nLJ012-0046.wav\nLJ012-0061.wav\nLJ012-0100.wav\nLJ012-0133.wav\nLJ012-0145.wav\nLJ012-0173.wav\nLJ012-0225.wav\nLJ012-0288.wav\nLJ012-0290.wav\nLJ013-0080.wav\nLJ013-0096.wav\nLJ013-0103.wav\nLJ013-0137.wav\nLJ013-0170.wav\nLJ013-0247.wav\nLJ013-0258.wav\nLJ014-0063.wav\nLJ014-0148.wav\nLJ014-0202.wav\nLJ014-0205.wav\nLJ014-0278.wav\nLJ014-0294.wav\nLJ014-0309.wav\nLJ014-0314.wav\nLJ015-0136.wav\nLJ015-0177.wav\nLJ015-0295.wav\nLJ016-0214.wav\nLJ016-0237.wav\nLJ016-0284.wav\nLJ016-0332.wav\nLJ017-0047.wav\nLJ017-0050.wav\nLJ017-0078.wav\nLJ017-0243.wav\nLJ018-0047.wav\nLJ018-0055.wav\nLJ018-0142.wav\nLJ018-0262.wav\nLJ018-0337.wav\nLJ018-0361.wav\nLJ019-0017.wav\nLJ019-0058.wav\nLJ019-0074.wav\nLJ019-0080.wav\nLJ019-0097.wav\nLJ019-0123.wav\nLJ019-0187.wav\nLJ019-0253.wav\nLJ019-0274.wav\nLJ019-0317.wav\nLJ020-0025.wav\nLJ021-0132.wav\nLJ021-0137.wav\nLJ021-0180.wav\nLJ022-0004.wav\nLJ022-0133.wav\nLJ022-0148.wav\nLJ022-0152.wav\nLJ023-0003.wav\nLJ023-0033.wav\nLJ024-0139.wav\nLJ024-0141.wav\nLJ025-0056.wav\nLJ025-0104.wav\nLJ025-0117.wav\nLJ026-0022.wav\nLJ027-0063.wav\nLJ027-0075.wav\nLJ028-0026.wav\nLJ028-0083.wav\nLJ028-0116.wav\nLJ028-0142.wav\nLJ028-0293.wav\nLJ028-0343.wav\nLJ028-0387.wav\nLJ028-0488.wav\nLJ028-0502.wav\nLJ029-0035.wav\nLJ029-0060.wav\nLJ030-0013.wav\nLJ030-0121.wav\nLJ030-0154.wav\nLJ030-0241.wav\nLJ031-0083.wav\nLJ031-0123.wav\nLJ031-0205.wav\nLJ032-0078.wav\nLJ032-0119.wav\nLJ032-0179.wav\nLJ032-0220.wav\nLJ033-0035.wav\nLJ033-0066.wav\nLJ033-0085.wav\nLJ033-0143.wav\nLJ034-0056.wav\nLJ034-0073.wav\nLJ034-0178.wav\nLJ034-0206.wav\nLJ035-0002.wav\nLJ035-0079.wav\nLJ036-0007.wav\nLJ036-0013.wav\nLJ036-0015.wav\nLJ036-0067.wav\nLJ036-0173.wav\nLJ037-0056.wav\nLJ037-0067.wav\nLJ037-0075.wav\nLJ037-0085.wav\nLJ037-0102.wav\nLJ037-0145.wav\nLJ037-0163.wav\nLJ038-0015.wav\nLJ038-0019.wav\nLJ038-0071.wav\nLJ038-0083.wav\nLJ038-0282.wav\nLJ038-0294.wav\nLJ038-0303.wav\nLJ039-0076.wav\nLJ039-0088.wav\nLJ040-0034.wav\nLJ040-0091.wav\nLJ040-0123.wav\nLJ040-0229.wav\nLJ041-0103.wav\nLJ041-0113.wav\nLJ042-0037.wav\nLJ042-0049.wav\nLJ042-0060.wav\nLJ042-0120.wav\nLJ042-0127.wav\nLJ042-0147.wav\nLJ042-0161.wav\nLJ042-0201.wav\nLJ043-0010.wav\nLJ043-0074.wav\nLJ043-0089.wav\nLJ043-0128.wav\nLJ044-0076.wav\nLJ044-0139.wav\nLJ044-0140.wav\nLJ044-0200.wav\nLJ044-0209.wav\nLJ045-0184.wav\nLJ045-0191.wav\nLJ046-0169.wav\nLJ046-0203.wav\nLJ047-0031.wav\nLJ047-0120.wav\nLJ047-0132.wav\nLJ047-0153.wav\nLJ047-0178.wav\nLJ047-0192.wav\nLJ047-0201.wav\nLJ047-0238.wav\nLJ048-0046.wav\nLJ048-0093.wav\nLJ048-0119.wav\nLJ0
48-0139.wav\nLJ048-0193.wav\nLJ048-0207.wav\nLJ048-0270.wav\nLJ049-0044.wav\nLJ049-0089.wav\nLJ049-0140.wav\nLJ049-0179.wav\nLJ050-0012.wav\nLJ050-0049.wav\nLJ050-0070.wav\nLJ050-0078.wav\nLJ050-0136.wav\nLJ050-0179.wav\nLJ050-0270.wav\n"
  },
  {
    "path": "egs/vctk/conf.yml",
    "content": "# General config\nexp_dir: './exp'\nseed: 0\nload: 'DiffWave_GALR.pkl'\n# Generation config\nsampling_noise_schedule: './noise_schedules/8steps_PESQ3.88_STOI0.987.ns'\ngen_data_dir: \"./demo_case\"\n#gen_data_dir: \"./test_wavs\"\nuse_ddim_steps: 0 # options: 0-DDPM, 1-DDIM, M-DDIM M steps (when noise schedule is not given)\n# Diffusion config\nT: 200\nbeta_0: 0.0001\nbeta_T: 0.02\ntau: 25\n# Noise scheduling config\nN: 8\nbddm_search_bins: 9\nnoise_scheduling_attempts: 10\n# Score network config\nscore_net: 'DiffWave'\nnum_res_layers: 30\nin_channels: 1\nres_channels: 128\nskip_channels: 128\ndilation_cycle: 10\nout_channels: 1\ndiffusion_step_embed_dim_in: 128\ndiffusion_step_embed_dim_mid: 512\ndiffusion_step_embed_dim_out: 512\n# Schedule network config\nschedule_net: 'GALR'\nblocks: 1\nhidden_dim: 128\ninput_dim: 128\nwindow_length: 8\nsegment_size: 64\n# Trainer config\nresume_training: False\nsave_fp16: True\nsteps_per_epoch: 1000\nscore_net_training_steps: 5000000\nschedule_net_training_steps: 5000\nlr: 0.00001\nfreeze_checkpoint_params: False # only used when provided load path\nsave_generated_dir: 'gen_wav'\nbatch_size: 1\npatience: 100\ngrad_clip: 5.0\nweight_decay: 0.00001\nlog_period: 1\nbmuf_block_momentum: 0.5\nbmuf_block_lr: 0.7\nbmuf_sync_period: 2\nema_rate: 0.999\n# Data config\ntrain_data_dir: \"VCTK-Corpus/wav\"\nvalid_data_dir: \"val_case\"\nn_worker: 10\nsampling_rate: 22050\nseg_len: 16000\nfil_len: 1024\nhop_len: 256\nwin_len: 1024\nmel_fmin: 0.0\nmel_fmax: 8000.0\n"
  },
  {
    "path": "egs/vctk/generate.sh",
    "content": "#!/bin/bash\n\n## example usage (single GPU): sh generate.sh 0 conf.yml\n\ncuda=$1\n\nCUDA_VISIBLE_DEVICES=$cuda python3 main.py \\\n\t--command generate --config $2\n"
  },
  {
    "path": "egs/vctk/main.py",
    "content": "import os\nimport sys\nsys.path.append('../../')\nimport json\nimport shutil\nimport hashlib\nimport argparse\nimport numpy as np\nimport yaml\n\nimport torch\ntorch.backends.cudnn.enabled = True\ntorch.backends.cudnn.benchmark = True\n\nfrom bddm import trainer, sampler\nfrom bddm.utils.log_utils import log\n\n\ndef dict_hash_5char(dictionary):\n    ''' Map a unique dictionary into a 5-character string '''\n    dhash = hashlib.md5()\n    encoded = json.dumps(dictionary, sort_keys=True).encode()\n    dhash.update(encoded)\n    return dhash.hexdigest()[:5]\n\n\ndef start_exp(config, config_hash):\n    ''' Create experiment directory or set it to an existing directory '''\n    if config.load != '' and '_nets' in config.load:\n        config.exp_dir = '/'.join(config.load.split('/')[:-2])\n    else:\n        config.exp_dir += '/%s-%s_conf-hash-%s' % (\n            config.score_net, config.schedule_net, config_hash)\n    if config.local_rank != 0:\n        return\n    log('Experiment directory: %s' % (config.exp_dir), config)\n    # Backup the config file\n    shutil.copyfile(config.config, os.path.join(config.exp_dir, 'conf.yml'))\n    # Create a backup scripts sub-folder\n    os.makedirs(os.path.join(config.exp_dir, 'backup_scripts'), exist_ok=True)\n    # Backup all .py files under bddm/\n    backup_files = []\n    for root, _, files in os.walk(\"../../\"):\n        if 'egs' in root:\n            continue\n        for f in files:\n            if f.endswith(\".py\"):\n                backup_files.append(os.path.join(root, f))\n    for src_file in backup_files:\n        basename = src_file\n        while '../' in basename:\n            basename = basename.replace('../', '')\n        basename = basename.replace('./', '')\n        dst_file = os.path.join(config.exp_dir, 'backup_scripts', basename)\n        dst_dir = '/'.join(dst_file.split('/')[:-1])\n        if not os.path.exists(dst_dir):\n            os.makedirs(dst_dir)\n        shutil.copyfile(src_file, dst_file)\n    # Prepare sub-folders for saving model checkpoints\n    os.makedirs(os.path.join(config.exp_dir, 'score_nets'), exist_ok=True)\n    os.makedirs(os.path.join(config.exp_dir, 'schedule_nets'), exist_ok=True)\n\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser(description='Bilateral Denoising Diffusion Models')\n    parser.add_argument('--command',\n                        type=str,\n                        default='train',\n                        help='available commands: train | search | generate')\n    parser.add_argument('--config',\n                        '-c',\n                        type=str,\n                        default='conf.yml',\n                        help='config .yml path')\n    parser.add_argument('--local_rank',\n                        type=int,\n                        default=0,\n                        help='process device ID for multi-GPU training')\n\n    arg_config = parser.parse_args()\n\n    # Parse yaml and define configurations\n    config = arg_config.__dict__\n    with open(arg_config.config) as f:\n        yaml_config = yaml.safe_load(f)\n    HASH = dict_hash_5char(yaml_config)\n    for key in yaml_config:\n        config[key] = yaml_config[key]\n    config = argparse.Namespace(**config)\n\n    # Set random seed for reproducible results\n    np.random.seed(config.seed)\n    torch.manual_seed(config.seed)\n    torch.cuda.manual_seed_all(config.seed)\n    torch.cuda.set_device(config.local_rank)\n\n    # Check if the command is valid or not\n    commands = 
['train', 'schedule', 'generate']\n    assert config.command in commands, 'Error: %s command not found.'%(config.command)\n\n    # Create/retrieve exp dir\n    start_exp(config, HASH)\n    log('Argv: %s' % (' '.join(sys.argv)), config)\n\n    try:\n        if config.command == 'train':\n            # Create Trainer for training\n            trainer = trainer.Trainer(config)\n            trainer.train()\n        elif config.command == 'schedule':\n            # Create Sampler for noise scheduling\n            sampler = sampler.Sampler(config)\n            sampler.noise_scheduling_without_params()\n        elif config.command == 'generate':\n            # Create Sampler for generation\n            # NOTE: Remember to define \"gen_data_dir\" in conf.yml before data generation\n            sampler = sampler.Sampler(config)\n            sampler.generate()\n        log('-' * 80, config)\n\n    except KeyboardInterrupt:\n        log('-' * 80, config)\n        log('Exiting early', config)\n"
  },
  {
    "path": "egs/vctk/schedule.sh",
    "content": "#!/bin/bash\n\n## example usage (single GPU): sh schedule.sh 0 conf.yml\n\ncuda=$1\n\nCUDA_VISIBLE_DEVICES=$cuda python3 main.py \\\n\t--command schedule --config $2\n"
  },
  {
    "path": "egs/vctk/train.sh",
    "content": "#!/bin/bash\n\n## example usage: sh train.sh 0,1,2,3 conf.yml\n\ncuda=$1\ncomma=${cuda//[^,]}\nnproc=$((${#comma}+1))\n\nCUDA_VISIBLE_DEVICES=$cuda python3 -m torch.distributed.launch \\\n\t--nproc_per_node $nproc --master_port $RANDOM main.py \\\n\t--command train --config $2\n"
  }
]