Repository: jasonppy/VoiceCraft Branch: master Commit: a702dfd2ced6 Files: 47 Total size: 1.2 MB Directory structure: gitextract_ru4iadb9/ ├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE-CODE ├── LICENSE-MODEL ├── README.md ├── RealEdit.txt ├── cog.yaml ├── config.py ├── data/ │ ├── __init__.py │ ├── gigaspeech.py │ ├── phonemize_encodec_encode_hf.py │ └── tokenizer.py ├── demo/ │ └── temp/ │ ├── 84_121550_000074_000000.txt │ └── mfa_alignments/ │ ├── 5895_34622_000026_000002.csv │ └── 84_121550_000074_000000.csv ├── edit_utils.py ├── environment.yml ├── gradio_app.ipynb ├── gradio_app.py ├── gradio_requirements.txt ├── inference_speech_editing.ipynb ├── inference_speech_editing_scale.py ├── inference_tts.ipynb ├── inference_tts_scale.py ├── main.py ├── models/ │ ├── codebooks_patterns.py │ ├── modules/ │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── embedding.py │ │ ├── sampling.py │ │ ├── scaling.py │ │ ├── transformer.py │ │ └── utils.py │ └── voicecraft.py ├── predict.py ├── pretrained_models/ │ └── .gitkeep ├── start-jupyter.bat ├── start-jupyter.sh ├── steps/ │ ├── __init__.py │ ├── optim.py │ ├── trainer.py │ └── trainer_utils.py ├── tts_demo.py ├── voicecraft-gradio-colab.ipynb └── z_scripts/ ├── e830M.sh └── e830M_ft.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ # The .dockerignore file excludes files from the container build process. 
# # https://docs.docker.com/engine/reference/builder/#dockerignore-file # Exclude Git files .git .github .gitignore # Exclude Python cache files __pycache__ .mypy_cache .pytest_cache .ruff_cache # Exclude Python virtual environment /venv ================================================ FILE: .gitignore ================================================ __pycache__/ *.py[cod] *$py.class *.egg-info .pytest_cache .ipynb_checkpoints thumbs.db .DS_Store .idea *.log *.pdf *.mkv *.mp4 *.png *.wav *.mp3 *.pth *.th *.json *durip* *rtx* *l40* *a40* src/audiocraft !/demo/ !/demo/* /demo/temp/*.txt !/demo/temp/84_121550_000074_000000.txt .cog/tmp/* ================================================ FILE: Dockerfile ================================================
# Image for running the VoiceCraft notebooks: a Jupyter base image plus a
# dedicated "voicecraft" conda environment (Montreal Forced Aligner, PyTorch,
# audiocraft, ...) that is registered as a Jupyter kernel.
FROM jupyter/base-notebook:python-3.9.13

# The package installs below need root.
# NOTE(review): the image is left running as root at the end of the build.
# Switching back to an unprivileged user would be preferable, but the launcher
# scripts (start-jupyter.sh / start-jupyter.bat) may rely on the current
# default -- confirm against them before changing it.
USER root

# Install OS dependencies.
#   ca-certificates  listed explicitly: with --no-install-recommends it is no
#                    longer pulled in as a recommendation of git, and the
#                    "pip install git+https://..." step below needs it
#   espeak-ng        backend for the phonemizer package installed below
#   ffmpeg           same OS dependency as the README environment setup
#   git-core         required by the "pip install -e git+https://..." step
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        espeak-ng \
        ffmpeg \
        git-core && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Update Conda, create the voicecraft environment, and install the
# Montreal Forced Aligner together with its English dictionary and acoustic
# model. Kept in its own layer so that changing a pip pin further down does not
# force the (slow) MFA install and model downloads to run again.
# The conda package cache is removed in the same layer that filled it.
RUN conda update -y -n base -c conda-forge conda && \
    conda create -y -n voicecraft python=3.9.16 && \
    conda run -n voicecraft conda install -y -c conda-forge \
        montreal-forced-aligner=2.2.17 \
        openfst=1.8.2 \
        kaldi=5.5.1068 && \
    conda run -n voicecraft mfa model download dictionary english_us_arpa && \
    conda run -n voicecraft mfa model download acoustic english_us_arpa && \
    conda clean -afy

# Install the Python dependencies into the voicecraft environment.
# The pins and their order mirror the README "Environment setup" section.
# NOTE(review): each package is deliberately installed by its own pip
# invocation, in the original order; merging them into a single resolver run
# may change which torch build gets selected -- verify before combining.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN conda run -n voicecraft pip install --no-cache-dir -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft && \
    conda run -n voicecraft pip install --no-cache-dir xformers==0.0.22 && \
    conda run -n voicecraft pip install --no-cache-dir torch==2.0.1 && \
    conda run -n voicecraft pip install --no-cache-dir torchaudio==2.0.2 && \
    conda run -n voicecraft pip install --no-cache-dir tensorboard==2.16.2 && \
    conda run -n voicecraft pip install --no-cache-dir phonemizer==3.2.1 && \
    conda run -n voicecraft pip install --no-cache-dir datasets==2.16.0 && \
    conda run -n voicecraft pip install --no-cache-dir torchmetrics==0.11.1 && \
    conda run -n voicecraft pip install --no-cache-dir huggingface_hub==0.22.2

# Install the Jupyter kernel: (re)install ipykernel inside the environment and
# register the environment under the kernel name "voicecraft".
RUN conda install -n voicecraft ipykernel --update-deps --force-reinstall -y && \
    conda run -n voicecraft python -m ipykernel install --name=voicecraft && \
    conda clean -afy
================================================ FILE: LICENSE-CODE ================================================ Attribution-NonCommercial-ShareAlike 4.0 International ======================================================================= Creative Commons Corporation ("Creative Commons") is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an "as-is" basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible. Using Creative Commons Public Licenses Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses. Considerations for licensors: Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected.
Licensors should clearly mark any material not subject to the license. This includes other CC- licensed material, or material used under an exception or limitation to copyright. More considerations for licensors: wiki.creativecommons.org/Considerations_for_licensors Considerations for the public: By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor's permission is not necessary for any reason--for example, because of any applicable exception or limitation to copyright--then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. More considerations for the public: wiki.creativecommons.org/Considerations_for_licensees ======================================================================= Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. Section 1 -- Definitions. a. 
Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. b. Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License. c. BY-NC-SA Compatible License means a license listed at creativecommons.org/compatiblelicenses, approved by Creative Commons as essentially the equivalent of this Public License. d. Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. e. Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. f. Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. g. License Elements means the license attributes listed in the name of a Creative Commons Public License. The License Elements of this Public License are Attribution, NonCommercial, and ShareAlike. h. 
Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License. i. Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. j. Licensor means the individual(s) or entity(ies) granting rights under this Public License. k. NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange. l. Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. m. Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. n. You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. Section 2 -- Scope. a. License grant. 1. 
Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: a. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and b. produce, reproduce, and Share Adapted Material for NonCommercial purposes only. 2. Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. 3. Term. The term of this Public License is specified in Section 6(a). 4. Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a) (4) never produces Adapted Material. 5. Downstream recipients. a. Offer from the Licensor -- Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. b. Additional offer from the Licensor -- Adapted Material. Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter's License You apply. c. No downstream restrictions. 
You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. 6. No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). b. Other rights. 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. 2. Patent and trademark rights are not licensed under this Public License. 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes. Section 3 -- License Conditions. Your exercise of the Licensed Rights is expressly made subject to the following conditions. a. Attribution. 1. If You Share the Licensed Material (including in modified form), You must: a. retain the following if it is supplied by the Licensor with the Licensed Material: i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); ii. 
a copyright notice; iii. a notice that refers to this Public License; iv. a notice that refers to the disclaimer of warranties; v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable; b. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and c. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. b. ShareAlike. In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply. 1. The Adapter's License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-NC-SA Compatible License. 2. You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material. 3. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply. Section 4 -- Sui Generis Database Rights. Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: a. 
for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only; b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. Section 5 -- Disclaimer of Warranties and Limitation of Liability. a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR DAMAGES. 
WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. Section 6 -- Term and Termination. a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically. b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates: 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or 2. upon express reinstatement by the Licensor. For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License. c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License. d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License. Section 7 -- Other Terms and Conditions. a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License. Section 8 -- Interpretation. a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License. b. 
To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions. c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. ======================================================================= Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” The text of the Creative Commons public licenses is dedicated to the public domain under the CC0 Public Domain Dedication. Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at creativecommons.org/policies, Creative Commons does not authorize the use of the trademark "Creative Commons" or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses. Creative Commons may be contacted at creativecommons.org. 
================================================ FILE: LICENSE-MODEL ================================================ Coqui Public Model License 1.0.0 https://coqui.ai/cpml.txt This license allows only non-commercial use of a machine learning model and its outputs. Acceptance In order to get any license under these terms, you must agree to them as both strict obligations and conditions to all your licenses. Licenses The licensor grants you a copyright license to do everything you might do with the model that would otherwise infringe the licensor's copyright in it, for any non-commercial purpose. The licensor grants you a patent license that covers patent claims the licensor can license, or becomes able to license, that you would infringe by using the model in the form provided by the licensor, for any non-commercial purpose. Non-commercial Purpose Non-commercial purposes include any of the following uses of the model or its output, but only so far as you do not receive any direct or indirect payment arising from the use of the model or its output. Personal use for research, experiment, and testing for the benefit of public knowledge, personal study, private entertainment, hobby projects, amateur pursuits, or religious observance. Use by commercial or for-profit entities for testing, evaluation, or non-commercial research and development. Use of the model to train other models for commercial use is not a non-commercial purpose. Use by any charitable organization for charitable purposes, or for testing or evaluation. Use for revenue-generating activity, including projects directly funded by government grants, is not a non-commercial purpose. Notices You must ensure that anyone who gets a copy of any part of the model, or any modification of the model, or their output, from you also gets a copy of these terms or the URL for them above. 
No Other Rights These terms do not allow you to sublicense or transfer any of your licenses to anyone else, or prevent the licensor from granting licenses to anyone else. These terms do not imply any other licenses. Patent Defense If you make any written claim that the model infringes or contributes to infringement of any patent, your licenses for the model granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company. Violations The first time you are notified in writing that you have violated any of these terms, or done anything with the model or its output that is not covered by your licenses, your licenses can nonetheless continue if you come into full compliance with these terms, and take practical steps to correct past violations, within 30 days of receiving notice. Otherwise, all your licenses end immediately. No Liability AS FAR AS THE LAW ALLOWS, THE MODEL AND ITS OUTPUT COME AS IS, WITHOUT ANY WARRANTY OR CONDITION, AND THE LICENSOR WILL NOT BE LIABLE TO YOU FOR ANY DAMAGES ARISING OUT OF THESE TERMS OR THE USE OR NATURE OF THE MODEL OR ITS OUTPUT, UNDER ANY KIND OF LEGAL CLAIM. IF THIS PROVISION IS NOT ENFORCEABLE IN YOUR JURISDICTION, YOUR LICENSES ARE VOID. Definitions The licensor is the individual or entity offering these terms, and the model is the model the licensor makes available under these terms, including any documentation or similar information about the model. You refers to the individual or entity agreeing to these terms. Your company is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. Control means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect. 
Your licenses are all the licenses granted to you under these terms. Use means anything you do with the model or its output requiring one of your licenses. ================================================ FILE: README.md ================================================ # VoiceCraft: Zero-Shot Speech Editing and Text-to-Speech in the Wild [![Paper](https://img.shields.io/badge/arXiv-2403.16973-brightgreen.svg?style=flat-square)](https://arxiv.org/pdf/2403.16973.pdf) [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/pyp1/VoiceCraft_gradio) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1IOjpglQyMTO2C3Y94LD9FY0Ocn-RJRg6?usp=sharing) [![Replicate](https://replicate.com/cjwbw/voicecraft/badge)](https://replicate.com/cjwbw/voicecraft) [![YouTube demo](https://img.shields.io/youtube/views/eikybOi8iwU)](https://youtu.be/eikybOi8iwU) [![Demo page](https://img.shields.io/badge/Audio_Samples-blue?logo=Github&style=flat-square)](https://jasonppy.github.io/VoiceCraft_web/) ### TL;DR VoiceCraft is a token infilling neural codec language model, that achieves state-of-the-art performance on both **speech editing** and **zero-shot text-to-speech (TTS)** on in-the-wild data including audiobooks, internet videos, and podcasts. To clone or edit an unseen voice, VoiceCraft needs only a few seconds of reference. ## How to run inference There are four ways (besides running Gradio in Colab): 1. More flexible inference beyond Gradio UI in Google Colab. see [quickstart colab](#quickstart-colab) 2. with docker. see [quickstart docker](#quickstart-docker) 3. without docker. see [environment setup](#environment-setup). You can also run gradio locally if you choose this option 4. As a standalone script that you can easily integrate into other projects. see [quickstart command line](#quickstart-command-line). 
When you are inside the docker image or you have installed all dependencies, check out [`inference_tts.ipynb`](./inference_tts.ipynb). If you want to do model development such as training/finetuning, I recommend following [environment setup](#environment-setup) and [training](#training). ## News :star: 03/15/2025: changing inference sampling from topp=1 to topk=40 massively improves editing and TTS performance :star: 04/22/2024: 330M/830M TTS Enhanced Models are up [here](https://huggingface.co/pyp1), load them through [`gradio_app.py`](./gradio_app.py) or [`inference_tts.ipynb`](./inference_tts.ipynb)! Replicate demo is up, major thanks to [@chenxwh](https://github.com/chenxwh)! :star: 04/11/2024: VoiceCraft Gradio is now available on HuggingFace Spaces [here](https://huggingface.co/spaces/pyp1/VoiceCraft_gradio)! Major thanks to [@zuev-stepan](https://github.com/zuev-stepan), [@Sewlell](https://github.com/Sewlell), [@pgosar](https://github.com/pgosar) [@Ph0rk0z](https://github.com/Ph0rk0z). :star: 04/05/2024: I finetuned giga330M with the TTS objective on gigaspeech and 1/5 of librilight. Weights are [here](https://huggingface.co/pyp1/VoiceCraft/tree/main). Make sure maximal prompt + generation length <= 16 seconds (due to our limited compute, we had to drop utterances longer than 16s in training data). Even stronger models forthcoming, stay tuned! :star: 03/28/2024: Model weights for giga330M and giga830M are up on HuggingFace🤗 [here](https://huggingface.co/pyp1/VoiceCraft/tree/main)! ## TODO - [x] Codebase upload - [x] Environment setup - [x] Inference demo for speech editing and TTS - [x] Training guidance - [x] RealEdit dataset and training manifest - [x] Model weights - [x] Better guidance on training/finetuning - [x] Colab notebooks - [x] HuggingFace Spaces demo - [x] Command line - [ ] Improve efficiency ## QuickStart Colab :star: To try out speech editing or TTS Inference with VoiceCraft, the simplest way is using Google Colab. 
Instructions to run are on the Colab itself. 1. To try [Speech Editing](https://colab.research.google.com/drive/1FV7EC36dl8UioePY1xXijXTMl7X47kR_?usp=sharing) 2. To try [TTS Inference](https://colab.research.google.com/drive/1lch_6it5-JpXgAQlUTRRI2z2_rk5K67Z?usp=sharing) ## QuickStart Command Line :star: To use it as a standalone script, check out tts_demo.py and speech_editing_demo.py. Be sure to first [setup your environment](#environment-setup). Without arguments, they will run the standard demo arguments used as an example elsewhere in this repository. You can use the command line arguments to specify unique input audios, target transcripts, and inference hyperparameters. Run the help command for more information: `python3 tts_demo.py -h` ## QuickStart Docker :star: To try out TTS inference with VoiceCraft, you can also use docker. Thanks to [@ubergarm](https://github.com/ubergarm) and [@jayc88](https://github.com/jay-c88) for making this happen. Tested on Linux and Windows and should work with any host with docker installed. ```bash # 1. clone the repo in a directory on a drive with plenty of free space git clone git@github.com:jasonppy/VoiceCraft.git cd VoiceCraft # 2. assumes you have docker installed with nvidia container-toolkit (windows has this built into the driver) # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.13.5/install-guide.html # sudo apt-get install -y nvidia-container-toolkit-base || yay -Syu nvidia-container-toolkit || echo etc... # 3. First build the docker image docker build --tag "voicecraft" . # 4. Try to start an existing container otherwise create a new one passing in all GPUs ./start-jupyter.sh # linux start-jupyter.bat # windows # 5. now open a webpage on the host box to the URL shown at the bottom of: docker logs jupyter # 6. optionally look inside from another terminal docker exec -it jupyter /bin/bash export USER=(your_linux_username_used_above) export HOME=/home/$USER sudo apt-get update # 7. 
confirm video card(s) are visible inside container nvidia-smi # 8. Now in browser, open inference_tts.ipynb and work through one cell at a time echo GOOD LUCK ``` ## Environment setup ```bash conda create -n voicecraft python=3.9.16 conda activate voicecraft pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft pip install xformers==0.0.22 pip install torchaudio==2.0.2 torch==2.0.1 # this assumes your system is compatible with CUDA 11.7, otherwise check out https://pytorch.org/get-started/previous-versions/#v201 apt-get install ffmpeg # if you don't already have ffmpeg installed apt-get install espeak-ng # backend for the phonemizer installed below pip install tensorboard==2.16.2 pip install phonemizer==3.2.1 pip install datasets==2.16.0 pip install torchmetrics==0.11.1 pip install huggingface_hub==0.22.2 # install MFA for getting forced-alignment, this could take a few minutes conda install -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 # install MFA english dictionary and model mfa model download dictionary english_us_arpa mfa model download acoustic english_us_arpa # pip install huggingface_hub # conda install pocl # above gives a warning for installing pocl, not sure if this is really needed # to run ipynb conda install -n voicecraft ipykernel --no-deps --force-reinstall ``` If you encounter version issues when running things, check out [environment.yml](./environment.yml) for exact matching.
## Inference Examples Check out [`inference_speech_editing.ipynb`](./inference_speech_editing.ipynb) and [`inference_tts.ipynb`](./inference_tts.ipynb) ## Gradio ### Run in colab [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1IOjpglQyMTO2C3Y94LD9FY0Ocn-RJRg6?usp=sharing) ### Run locally After environment setup, install additional dependencies: ```bash apt-get install -y espeak espeak-data libespeak1 libespeak-dev apt-get install -y festival* apt-get install -y build-essential apt-get install -y flac libasound2-dev libsndfile1-dev vorbis-tools apt-get install -y libxml2-dev libxslt-dev zlib1g-dev pip install -r gradio_requirements.txt ``` Run the gradio server from terminal or [`gradio_app.ipynb`](./gradio_app.ipynb): ```bash python gradio_app.py ``` It is ready to use at the [default url](http://127.0.0.1:7860). ### How to use it 1. (optionally) Select models 2. Load models 3. Transcribe 4. (optionally) Tweak some parameters 5. Run 6. (optionally) Rerun part-by-part in Long TTS mode ### Some features Smart transcript: write only what you want to generate TTS mode: Zero-shot TTS Edit mode: Speech editing Long TTS mode: Easy TTS on long texts ## Training To train a VoiceCraft model, you need to prepare the following parts: 1. utterances and their transcripts 2. encode the utterances into codes using e.g. Encodec 3. convert transcripts into phoneme sequence, and a phoneme set (we named it vocab.txt) 4. manifest (i.e. metadata) Steps 1, 2, and 3 are handled in [./data/phonemize_encodec_encode_hf.py](./data/phonemize_encodec_encode_hf.py), where 1. Gigaspeech is downloaded through HuggingFace. Note that you need to sign an agreement in order to download the dataset (it needs your auth token) 2. phoneme sequences and encodec codes are also extracted using the script.
An example run: ```bash conda activate voicecraft export CUDA_VISIBLE_DEVICES=0 cd ./data python phonemize_encodec_encode_hf.py \ --dataset_size xs \ --download_to path/to/store_huggingface_downloads \ --save_dir path/to/store_extracted_codes_and_phonemes \ --encodec_model_path path/to/encodec_model \ --mega_batch_size 120 \ --batch_size 32 \ --max_len 30000 ``` where encodec_model_path is available [here](https://huggingface.co/pyp1/VoiceCraft). This model is trained on Gigaspeech XL, it has 56M parameters, 4 codebooks, each codebook has 2048 codes. Details are described in our [paper](https://jasonppy.github.io/assets/pdfs/VoiceCraft.pdf). If you encounter OOM during extraction, try decreasing the batch_size and/or max_len. The extracted codes, phonemes, and vocab.txt will be stored at `path/to/store_extracted_codes_and_phonemes/${dataset_size}/{encodec_16khz_4codebooks,phonemes,vocab.txt}`. As for the manifest, please download train.txt and validation.txt from [here](https://huggingface.co/datasets/pyp1/VoiceCraft_RealEdit/tree/main), and put them under `path/to/store_extracted_codes_and_phonemes/manifest/`. Please also download vocab.txt from [here](https://huggingface.co/datasets/pyp1/VoiceCraft_RealEdit/tree/main) if you want to use our pretrained VoiceCraft model (so that the phoneme-to-token matching is the same). Now, you are good to start training! ```bash conda activate voicecraft cd ./z_scripts bash e830M.sh ``` It's the same procedure to prepare your own custom dataset. Make sure that if ## Finetuning You also need to do steps 1-4 as in Training, and I recommend using AdamW for optimization if you finetune a pretrained model for better stability. Check out the script `./z_scripts/e830M_ft.sh`. If your dataset introduces new phonemes (which is very likely) that don't exist in the giga checkpoint, make sure you combine the original phonemes with the phonemes from your data when constructing the vocab.
And you need to adjust `--text_vocab_size` and `--text_pad_token` so that the former is bigger than or equal to your vocab size, and the latter has the same value as `--text_vocab_size` (i.e. `--text_pad_token` is always the last token). Also since the text embeddings are now of a different size, make sure you modify the weights loading part so that it won't crash (you could skip loading `text_embedding` or only load the existing part, and randomly initialize the new) ## License The codebase is under CC BY-NC-SA 4.0 ([LICENSE-CODE](./LICENSE-CODE)), and the model weights are under Coqui Public Model License 1.0.0 ([LICENSE-MODEL](./LICENSE-MODEL)). Note that we use some of the code from other repositories that are under different licenses: `./models/codebooks_patterns.py` is under MIT license; `./models/modules`, `./steps/optim.py`, `data/tokenizer.py` are under Apache License, Version 2.0; the phonemizer we used is under GNU 3.0 License. ## Acknowledgement We thank Feiteng for his [VALL-E reproduction](https://github.com/lifeiteng/vall-e), and we thank the audiocraft team for open-sourcing [encodec](https://github.com/facebookresearch/audiocraft). ## Citation ``` @article{peng2024voicecraft, author = {Peng, Puyuan and Huang, Po-Yao and Mohamed, Abdelrahman and Harwath, David}, title = {VoiceCraft: Zero-Shot Speech Editing and Text-to-Speech in the Wild}, journal = {arXiv}, year = {2024}, } ``` ## Disclaimer Any organization or individual is prohibited from using any technology mentioned in this paper to generate or edit someone's speech without his/her consent, including but not limited to government leaders, political figures, and celebrities. If you do not comply with this item, you could be in violation of copyright laws. ================================================ FILE: RealEdit.txt ================================================ wav_fn orig_transcript new_transcript orig_masked_span new_masked_span type YOU1000000102_S0000137.wav if i had never dropped out.
i would have never dropped in on that calligraphy class and personal computers might not have the wonderful typography that they do. if i had never dropped out. i would have never stopped by that calligraphy class and personal computers might not have the wonderful typography that they do. 10,12 10,11 substitution YOU1000000124_S0000174.wav so people in symbolic era. i mean we all agree that symbols come in and symbols come out, okay. so people in symbolic era. i mean we all agree that signals go in and symbols come out, okay. 11,12 11,12 substitution YOU1000000006_S0000016.wav and then we can actually go through and generate a lead. so we've got this first call to action here and this is sending them to a landing page. and then we can actually go through and generate a lead. so we've got this first action here and this is sending them to a landing page. 16,17 15,16 deletion YOU1000000149_S0000172.wav the dry shampoo honestly like weirdly styles it enough to where i'm like satisfied with it but i do like to add a little bit of hair spray. the dry shampoo honestly like weirdly styles it enough to where i'm like satisfied with it but sometimes it needs a little extra i do like to add a little bit of hair spray. 16,17 17,22 insertion YOU1000000102_S0000031.wav steve also cofounded pixar animation studios. which has revolutionized the film industry in it short history with brilliant use of technology. steve also cofounded pixar animation studios. which has revolutionized the film industry in it short history with films like toy story that showcase brilliant use of technology. 16,17 17,22 insertion YOU1000000148_S0000041.wav so i was actually happy to wear such a heavy dress and to be able to wear proper wool, you know mountain underwear ha ha ha. so i was actually happy to wear such a rugged coat and boots and to be able to wear proper wool, you know mountain underwear ha ha ha. 
9,10 9,12 substitution YOU1000000153_S0000000.wav some times i really feel that the world around us continues to be more hectic and more complicated and so many of us are truly craving to find simplicity. some times i really feel that the world around us continues to be more hectic, more impersonal, and more uncaring and so many of us are truly craving to find simplicity. 14,17 14,19 substitution YOU1000000163_S0000051.wav and finally, pressing on the crown opens up the app menu so this is where you can access your third party apps and system settings. and finally, pressing on the crown opens up the app menu where you can access your third party apps and system settings. 11,13 10,11 deletion YOU1000000115_S0000057.wav in the future when the borrower repays the loan plus interest, the asset and the liability disappear and the transaction is settled. in the future when the borrower repays the lender the loan plus interest, the asset and the liability disappear and the transaction is settled. 7,8 8,9 insertion YOU1000000103_S0000113.wav with an election firmly behind us, voters are taking a new measure of joe biden and what they believe he should deliver as president. with an election firmly behind us, voters are taking a new measure of joe biden and the rest of his administration and reconsidering what they believe he should deliver as president. 15,16 16,22 insertion YOU1000000019_S0000015.wav causing a lock up. although this mostly happens when i'm spamming tps in the last layer. so i've had to learn to control my speed to help with overshooting. causing a lock up. although this mostly happens when i'm tapping the pedal faster than i should be so i've had to learn to control my speed to help with overshooting. 10,15 10,17 substitution YOU1000000045_S0000205.wav and said, you know, we need to start a process ah in order to figure out how we can protect the kurds who have been our allies. 
and said, you know, we need to start a process ah in order to figure out how we can provide aid to all the groups that have helped us including the kurds who have been our allies. 19 19,29 substitution YOU1000000155_S0000027.wav in a case like this, i probably wouldn't spend any more time looking at the deal if i was only interested in the cash flow. in a case like this, i probably wouldn't spend any more time looking at all the details and the fine print if i was only interested in the cash flow. 14,15 14,20 substitution YOU1000000118_S0000018.wav the reason will often comeback to how you created the machine learning dataset. so give yourself time to absorb the lessons. the reason will often boil down to the quality of the machine learning dataset. so give yourself time to absorb the lessons. 4,8 4,9 substitution YOU1000000115_S0000104.wav the total amount of credit in the united states is about fifty trillion dollars and the total amount of money is only about three trillion dollars. the total amount of credit is about fifty trillion dollars and the total amount of money is only about three trillion dollars. 5,8 4,5 deletion YOU1000000045_S0000187.wav so, there's a huge difference. i mean, people can gather, you know, information in, you know, all kinds of different ways. so, there's a huge difference. i mean, people can earn their living and provide for their family in you know, all kinds of different ways. 9,13 9,17 substitution YOU1000000101_S0000060.wav they knew that governments don't control things. a government can't control the economy without controlling people. they knew that governments don't control money directly. a government can't control the economy without controlling people. 6 6,7 substitution YOU1000000106_S0000178.wav okay so my little cousin julia wants to know what did you want to do with your life at age five? okay so my little cousin julia has kind of a weird question for you. 
she wants to know what did you want to do with your life at age five? 5,6 6,14 insertion YOU1000000165_S0000039.wav and one to watch over the course of the six seasons of girls. after rising to prominence on the series, he promptly began picking up increasingly significant roles. and one to watch over the course of the six seasons of girls. after his dating scandal, he has lost some significant roles. 14,25 14,20 substitution YOU1000000181_S0000024.wav around a large pot of vinegar crowd three men, these aren't any ordinary men in fact they're the three founders of the great asian philosophies. around a large pot of vinegar crowd three men. they're cooking for the three founders of the great asian philosophies. 8,16 8,11 substitution YOU1000000116_S0000006.wav which was no part of his intention and this term invisible hand is famous led by the invisible hand to promote an end. which was no part of his intention and this term is famous led by the invisible hand to promote an end. 10,11 9,10 deletion YOU1000000015_S0000044.wav or press control c or command c to copy that link. you don't wanna come back to parallel's toolbox and then all we need to do is hit paste. or press control c or command c to copy that paragraph, then go back to the original document and then all we need to do is hit paste. 10,18 10,17 substitution YOU1000000167_S0000004.wav you could tell christopher robin had something important to say from the way he clasped his knees tightly and wriggled his toes. you could tell christopher was really eager to get out of his seat from the way he clasped his knees tightly and wriggled his toes. 4,9 4,12 substitution YOU1000000119_S0000017.wav at the end of this course, you will be able to explain the three fundamental characteristic that define the blockchain using bitcoin blockchain. at the end of this course, you will be able to explain to your friends what blockchain is, and why they should be using bitcoin blockchain. 
12,19 12,22 substitution YOU1000000170_S0000103.wav it it felt like, at the time like, an incredible milestone. we we needed to post it to use net. it it felt like, we needed to post it to use net. 4,11 3,4 deletion YOU1000000167_S0000107.wav he hadn't expected london to have quite so many legs. he hadn't expected the new furniture to have quite so many legs. 3 3,5 substitution YOU1000000183_S0000073.wav and then when you actually do sit down to learn about some universities look at the programs that are really relevant to you. and then when you actually do make searches you have to make sure that you find recommendations that are really relevant to you. 6,16 6,16 substitution YOU1000000120_S0000023.wav coconut milk some peeled tomatoes i wanna zip this up now. coconut milk some tomatoes i wanna zip this up now. 3 2,3 deletion YOU1000000187_S0000089.wav there is beautiful things in life so when you're suffering just you know its part of the package you know you look at it we're born. there is beautiful things in life so you should always remember when you're suffering just you know its part of the package you know you look at it we're born. 6,7 7,10 insertion YOU1000000122_S0000033.wav we should not be finding objects all of a sudden that are spitting distance away from us that we've been missing all this time. we should not be finding objects all of a sudden spitting distance away from us that we've been missing all this time. 10,11 9,10 deletion YOU1000000045_S0000141.wav because of how you know toxic politics in america has become. because of how toxic politics in america has become. 3,4 2,3 deletion YOU1000000185_S0000003.wav and boys did you pick a great week to tune in. over the past few months, we've been bringing together experts in a number of critical fields. and boys did you pick a great week to tune in. We've got an amazing episode lined up for you today. over the past few months, we've been bringing together experts in a number of critical fields. 
10,11 11,20 insertion YOU1000000127_S0000064.wav economic development remains one of the most effective ways to increase the capacity to adapt to climate change. economic development remains one of the most promising options that we have left on the table to increase the capacity to adapt to climate change. 7,8 7,15 substitution YOU1000000105_S0000131.wav and all deservedly so, but we have something for you. in fact, guillermo, bring this in. and all deservedly so, but we have to show you something that just arrived this morning. in fact, guillermo, bring this in. 7,9 7,15 substitution YOU1000000122_S0000012.wav jackie has spent fifteen years searching our solar neighborhood for new neighbors. jackie has spent the last three decades searching our solar neighborhood for new neighbors. 3,4 3,6 substitution YOU1000000138_S0000114.wav he, interestingly so, absolutely no reason to feel any type of remorse, and although he's quite pleasant to me he's pleasant to us, he's a very very dangerous individual. he, interestingly so, absolutely no reason to feel any type of remorse, and although he might seem like a nice person, be careful, because he's a very very dangerous individual. 14,22 14,23 substitution YOU1000000027_S0000026.wav because i'm gonna be doing more live chats here and there and i'm just trying to post more videos on facebook and be more active there. because i'm gonna be doing more live chats here and there and i'm just trying to post more overall content and more videos on facebook and be more active there. 17,18 18,21 insertion YOU1000000123_S0000009.wav i wrote the title of the course many years ago, ah, when i created this course. i wrote the title when i created this course. 4,10 3,4 deletion YOU1000000001_S0000037.wav i'm gonna try to keep my vehicle on the road without going on the lawn over there like i did on the last ah pull in. 
i'm gonna try to avoid any unintentional mishaps like last time and keep my vehicle on the road without going on the lawn over there like i did on the last ah pull in. 3,4 4,11 insertion YOU1000000019_S0000011.wav but qiyi then went on to post some more photos and explained some more of the subtle differences between the two cubes. but qiyi then went on to post some more really high quality pictures and videos and explained some more of the subtle differences between the two cubes. 9 9,14 substitution YOU1000000126_S0000185.wav when we started coursera, we had no idea that over the next several years, it will blossom to such a large movement. when we started coursera, we were absolutely confident that with enough hard work it will blossom to such a large movement. 5,13 5,12 substitution YOU1000000171_S0000052.wav currently fifteen to twenty countries and ah another. currently fifteen to seventeen states and ah another. 3,4 3,4 substitution YOU1000000184_S0000015.wav we we both had a fair amount of experience in real estate and charlie made his early money in real estate um. we we both had a fair amount of experience in investing but warren actually made all his early money in real estate um. 10,14 10,15 substitution YOU1000000111_S0000109.wav and you gotta make sure you in nobody's area to allow them to knock down a three and you get contact. and you gotta make sure to allow them to knock down a three and you get contact. 5,8 4,5 deletion YOU1000000005_S0000035.wav and then the campaign content i think this one is really key to use as well. and then the campaign content is super detailed so this one is really key to use as well. 5,6 5,8 substitution YOU1000000004_S0000033.wav and you are gonna be looking to copy and pasting some things here now this gives us clear instruction. and you are gonna be looking to copy things here now this gives us clear instruction. 
8,10 7,8 deletion YOU1000000163_S0000001.wav and what was really unique about this smartwatch was that it actually came with not one but two displays. and what was really unique about this smartwatch was that since it can flip open it actually came with not one but two displays. 9,10 10,14 insertion YOU1000000043_S0000180.wav ah so let's ignore that and go back to the software. so that is the last thing. ah so let's ignore that and go back to the beginning. so that is the last thing. 10 10 substitution YOU1000000101_S0000273.wav to keep the goldwater crusade on the air, send one, ten, fifty dollars. to keep the goldwater, send one, ten, fifty dollars. 3,6 2,3 deletion YOU1000000108_S0000265.wav reflect it's just a place that i got to go. reflect it's just a place that we all got to go. 6 6,7 substitution YOU1000000128_S0000041.wav that's a question nobody can answer because the future depends on decisions that have not yet been taken. that's a question nobody can answer with confidence since the future depends on decisions that have not yet been taken. 6 6,8 substitution YOU1000000191_S0000014.wav ten standing up and over hundred and ten are chilling out, bellies full of grass. ten standing up and over hundred and ten are just laying around chilling out, bellies full of grass. 8,9 9,11 insertion YOU1000000174_S0000066.wav we have done a lot of work around vaccine planning but also realistically it's not gonna be available to community members right away. we have done a lot of work but also realistically it's not gonna be available to community members right away. 7,9 6,7 deletion YOU1000000133_S0000039.wav when the c e o of blockbuster heard that, he promptly had a kitchen sink delivered to the netflix office, a fairly creative way of declaring war. when the c e o of blockbuster heard that, he promptly had five hundred pounds of glitter divided into five thousand manilla envelopes delivered to the netflix office, a fairly creative way of declaring war. 
12,14 12,22 substitution YOU1000000118_S0000004.wav we end with a discussion of two link, of how to do machine learning at scale using python notebooks and server less data processing components. we end with a discussion of two link, of how to do machine learning at scale using python notebooks and energy efficient data processing components. 20,21 20,21 substitution YOU1000000007_S0000039.wav we just come in here and create a custom app okay now we're just gonna give this a name we're gonna say this is the seller leads campaign. we just come in here and give this a name we're gonna say this is the seller leads campaign. 6,14 5,6 deletion YOU1000000113_S0000034.wav that's a bomb and that's a good sign from him. he got fully extended on it. knew it as soon as that ball left. that's a bomb and that's a good sign from him. he clearly signalled and made the play happen as soon as that ball left. 11,17 11,17 substitution YOU1000000169_S0000188.wav they're being trained to detect the early warning signals for severe allergic reactions, epileptic fits and narcolepsy. they're being trained to detect the early warning signals for epileptic fits and narcolepsy. 10,12 9,10 deletion YOU1000000155_S0000070.wav an investor flipping houses at this level might require far less than seventy percent maybe a fifty percent or even lower. an investor flipping houses at this level might require lower margins than seventy percent maybe a fifty percent or even lower. 9,10 9,10 substitution YOU1000000119_S0000043.wav what is a blockchain? blockchain is about enabling peer to peer transaction in a decentralized network. what is a blockchain? blockchain is about high risk high reward investments in a decentralized network. 7,11 7,11 substitution YOU1000000139_S0000064.wav but then queen actually changes its direction attacking h seven point. but then queen actually intensifies the offensive by attacking h seven point. 
4,6 4,7 substitution YOU1000000037_S0000435.wav that i think would be fun to auction and that would keep the price down that we could have some fun nobody would get hurt. that i think that we could have some fun nobody would get hurt. 3,14 2,3 deletion YOU1000000159_S0000039.wav the excellent story and it's lovable multidimensional characters, along with the challenging tactical combat are all refined and back for another round with new surprises and new friends until. the excellent story and it's lovable multidimensional characters, along with the challenging tactical combat and deep background simulation are all refined and back for another round with new surprises and new friends until. 13,14 14,17 insertion YOU1000000110_S0000046.wav argentina's trophy and it's a fifth world crown. argentina's trophy and victory is a fifth world crown. 3 3,4 substitution YOU1000000180_S0000044.wav our role of taking comment, and, and, and offering response and then making informed decisions on how it's going to impact those in the market place. our role of taking comment, working with stakeholders and customers to research the root of problems, offering response, and then making informed decisions on how it's going to impact those in the market place. 5,9 5,17 substitution YOU1000000016_S0000006.wav so that means you can easily create one livestream and push it out to multiple live platforms. so that means once you know your subject and your target audience, you can easily create one livestream and push it out to multiple live platforms. 2,3 3,11 insertion YOU1000000137_S0000397.wav but the renaissance broke their monopoly on knowledge, one of the most important bastions of the church. but the renaissance broke their monopoly on knowledge, with it's free movement of research and endless scientific inquiry, one of the most important bastions of the church. 
7,8 8,17 insertion YOU1000000101_S0000078.wav every responsible farmer and farm organization has repeatedly asked the government to free the farm economy. every responsible farmer and farm organization has repeatedly asked the state government to free the farm economy. 9,10 10 insertion YOU1000000045_S0000118.wav ah in fact, we're in an interesting period now where the country is gearing up for impeachment. ah in fact, we're in an unprecedented political situation now where the country is gearing up for impeachment. 6,7 6,8 substitution YOU1000000185_S0000136.wav nobody's been in the office you know for over three months now, and yet our work ah is going on pretty much full speed. nobody's been in the office you know for over three months now, and yet we are pushing onward pretty much full speed. 14,19 14,17 substitution YOU1000000108_S0000070.wav you know what like comedy central was a hot place to be when i showed up there. you know what like after all these years my childhood home was a completely different place when i showed up there. 4,11 4,15 substitution YOU1000000141_S0000085.wav your daughter wants to take ballet classes and she needs shoes and some lessons. your son wants to play sports, he needs cleats and some gear. your daughter wants to take advanced calculus classes. your son wants to play sports, he needs cleats and some gear. 5,13 5,7 substitution YOU1000000153_S0000027.wav really good sized water tank here is well. really good sized piece of land here is well. 3,4 3,5 substitution YOU1000000108_S0000206.wav manipulating! it sounds like somebody's trying to put young dave in a compromising position. manipulating! it sounds like somebody's in a compromising position. 5,9 4,5 deletion YOU1000000101_S0000132.wav yet anytime you and i question the schemes of the dogooders, were denounced as being against their humanitarian goals. they say we're always against things, we're never for anything. 
yet anytime you and i question the schemes of the dogooders or dare to dig into any of their motives, were denounced as being against their humanitarian goals. they say we're always against things, we're never for anything. 9,10 10,18 insertion YOU1000000117_S0000291.wav but one of the things you can do to be nice to yourself is to remember what science suggests about the kinds of things that can improve your wellbeing. but one of the things you can do to get better and be physically, mentally and emotionally healthy is to remember what science suggests about the kinds of things that can improve your wellbeing. 9,12 9,17 substitution YOU1000000117_S0000077.wav and the specific kind of meditation is what's known as loving kindness meditation or matter. and the specific kind of ancient yogic meditation exercise is what's known as loving kindness meditation or matter. 5 5,8 substitution YOU1000000117_S0000231.wav ah basically i have a rule now that after eight p m, i put my phone away, i just put it on silent. ah basically i put my phone away, i just put it on silent. 3,12 2,3 deletion YOU1000000192_S0000168.wav just out of gas there, ah she'll be right. just out of food and water there, ah she'll be right. 3 3,5 substitution YOU1000000183_S0000080.wav and then after that what happens next well let's listen in. and then after that what happens afterwards is very exciting well let's listen in. 6 6,9 substitution YOU1000000101_S0000130.wav she wanted the divorce to get an eightydollar raise. she's eligible for three hundred and thirty dollars a month in the aid to dependent children program. she wanted the divorce to get an eightydollar raise. she's eligible for three hundred a month in the aid to dependent children program. 14,16 13,14 deletion YOU1000000106_S0000171.wav besides your phone and wallet what's a couple must have purse items? besides your phone, can I have purse items? 
2,8 2,4 substitution YOU1000000043_S0000141.wav you just wanna like sent people to ah different pieces of content on on social media. you just wanna like trash people on social media. 4,12 4,5 substitution YOU1000000186_S0000063.wav and i wouldn't be able to do it except for the the lock the visibility the resources that came from that first career. and i wouldn't be able to do it except for the resources that came from that first career. 11,15 10,11 deletion YOU1000000102_S0000129.wav it was beautiful, historical, artistically subtle, in a way that science can't capture and i found it fascinating. it was beautiful, historical, arcane, a glitchy looking relic from the fifteen hundreds, artistically subtle, in a way that science can't capture and i found it fascinating. 3,4 4,12 insertion YOU1000000123_S0000100.wav if you make a lot of money in finance, it's a game. you enjoyed it. now give most of it away, that's, that's going to be a theme. if you make a lot of money in finance, it's a game. you enjoyed it. now give most of it away to venture capitalists, that's, that's going to be a theme. 19,20 20,22 insertion YOU1000000173_S0000047.wav when the army needs to handle air defense on the move, the avenger is the goto weapon. when the army needs to intimidate a land's rightful owners, the angry nationalistic holler is the goto weapon. 5,12 5,13 substitution YOU1000000119_S0000040.wav that has opened up a whole world of possibilities beyond simple currency transfer. that has opened up a whole world of pyramid schemes under the guise simple currency transfer. 8,9 8,12 substitution YOU1000000043_S0000025.wav so in bonus number three i cover what good niches are when it comes to affiliate marketing and how to decide which one to pick for yourself. so in bonus number three i cover what good niches are when it comes to affiliate marketing and many layouts to clickbait people into buying useless things, and how to pick for yourself. 
18,22 18,28 substitution YOU1000000023_S0000047.wav because we can include so many other characters if we just expand the definitions to any sword wielder, who's a little spicy.|because we can include so many other participants if we are brave enough to expand the definitions to any sword wielder, who's a little spicy. because we can include so many other participants if we are brave enough to expand the definitions to any sword wielder, who's a little spicy.|because we can include so many other participants if we are brave enough to expand the definitions to any blade wielder, who's a little spicy. 7,10|16 7,13|19 substitution|substitution YOU1000000103_S0000018.wav tonight we'll be looking at president biden's first day in office, i'll talk with americans who did and did not vote for him what do they expect now.|tonight we'll be showcasing our new mascot, Edward the Egg! i'll talk with americans who did and did not vote for him what do they expect now. tonight we'll be showcasing our new mascot, Edward the Egg! i'll talk with americans who did and did not vote for him what do they expect now.|tonight we'll be showcasing our new mascot, Edward the Egg! i'll talk with americans who'll tell him what do they expect now. 3,10|15,21 3,9|14,15 substitution|substitution YOU1000000123_S0000094.wav it can't be more easily solved, we need, we need all these people. that's why i take some pride in this course in being connected to the real world.|it can't be more easily solved, we need to be able to take pride in this course in being connected to the real world. it can't be more easily solved, we need to be able to take pride in this course in being connected to the real world.|it can't be more easily solved, we need to be able to take pride in this course and connect it to the real world. 
7,17|22,24 7,12|17,19 substitution|substitution YOU1000000117_S0000269.wav she has a few ideas in mind but it's hard to find ways in everyday life when you are in isolation to do this more.|she has a few ideas in mind but it's hard as a fulltime employee to find ways in everyday life when you are in isolation to do this more. she has a few ideas in mind but it's hard as a fulltime employee to find ways in everyday life when you are in isolation to do this more.|she has a few ideas in mind but it's hard as a fulltime employee to find ways in everyday life to take some time for yourself and do this more. 9,10|16,21 10,13|20,26 insertion|substitution YOU1000000153_S0000099.wav this is just so cozy up here, and having that skylight is just lovely isn't it.|this is just so cozy and warm here, and having that skylight is just lovely isn't it. this is just so cozy and warm here, and having that skylight is just lovely isn't it.|this is just so cozy and warm here, isn't it. 5|7,13 5,6|7,8 substitution|deletion YOU1000000123_S0000045.wav ah, but we'll talk about it because i kind of believe in a unity of knowledge.|ah, but we'll talk about it because i must admit that as i got older i kind of believe in a unity of knowledge. ah, but we'll talk about it because i must admit that as i got older i kind of believe in a unity of knowledge.|ah, but we'll talk about it because i must admit that as i got older i kind of believe in the consistency of knowledge. 7,8|12,13 8,15|20,21 insertion|substitution YOU1000000117_S0000193.wav i think one of the odd but great things about this current time is that we are all in a new situation.|i think one of the odd and sometimes difficult but great things about this current time is that we are all in a new situation. 
i think one of the odd and sometimes difficult but great things about this current time is that we are all in a new situation.|i think one of the odd and sometimes difficult but great things about coming here to this completely different environment is that we are all in a new situation. 5,6|10,12 6,8|13,19 insertion|substitution YOU1000000171_S0000102.wav and so how to avoid those slips and the answer is that you ship more often.|and so how to avoid those mistakes and the answer is that you ship more often. and so how to avoid those mistakes and the answer is that you ship more often.|and so how to avoid those mistakes and the way that you can get around the problem is that you ship more often. 6|9 6|9,16 substitution|substitution YOU1000000127_S0000082.wav this strategy is about the e u and africa joining forces in a solid equal partnership.|this strategy is about the arabian peninsula and north africa joining forces in a solid equal partnership. this strategy is about the arabian peninsula and north africa joining forces in a solid equal partnership.|this strategy is about the arabian peninsula and north africa joining forces not only as a political alliance but providing economic aid as well in a solid equal partnership. 5,7|10,11 5,8|12,23 substitution|insertion YOU1000000124_S0000157.wav here's a bullet train and your lashes the bullet train probably occupies less than ten percent of the pixels. the building in the background is much bigger.|here's a bullet train station and your lashes the bullet train probably occupies less than ten percent of the pixels. the building in the background is much bigger. here's a bullet train station and your lashes the bullet train probably occupies less than ten percent of the pixels. the building in the background is much bigger.|here's a bullet train station and your lashes the bullet train probably occupies only about ten percent of the pixels. the building in the background is much bigger. 
3,4|12,13 4|13,14 insertion|substitution YOU1000000103_S0000157.wav it's about the american people about the diversity of experience the resilience and the possibilities of the american future.|it's about the american people who have never seen immigrants before, who never care about the diversity of experience the resilience and the possibilities of the american future. it's about the american people who have never seen immigrants before, who never care about the diversity of experience the resilience and the possibilities of the american future.|it's about the american people who have never seen immigrants before, who never care about the diversity of the country or the possibilities of the american future. 4,5|9,12 5,13|18,20 insertion|substitution 5849_50962_000025_000001.wav "Here she comes!" called the crowd presently, as the black speck far out, and the strain on the cord, showed the buoy was coming back. "Here she comes!" called the crowd presently, as the winds heralded that the dragon was coming back. 9,21 9,13 substitution 1701_141760_000050_000000.wav "He is a very, very nice, honest, and pleasant fellow," answered Boris. "He is a very, very nice, honest, but not pleasant fellow," answered Boris. 7 7,8 substitution 5536_43359_000021_000005.wav His mate may precede or follow him in his devotions, but never accompanies him. His mate may precede him in his devotions, but never accompanies him. 4,5 3,4 deletion 8297_275154_000008_000004.wav And yet he spoke roughly; he looked like an angry man brought to bay. And yet he spoke roughly; he looked like an man brought to bay. 9 8,9 deletion 5536_43359_000017_000000.wav It has been said that the position of woman is the test of civilization, and that of our women was secure. It has been said that the position of our women was secure. 8,16 7,8 deletion 4570_56594_000008_000000.wav Then there was nothing said again for some time. Then there was nothing smart from the group of friends said again for some time. 
3,4 4,9 insertion 5543_27761_000077_000003.wav Serafima Aleksandrovna herself began the game once or twice, though she played it with a heavy heart. Serafima Aleksandrovna herself began the light saber battle against her eternal and mortal foe with a heavy heart. 5,12 5,13 substitution 8288_274162_000086_000000.wav The cunning captain was quite right in his suspicions; for as soon as Montalais entered she exclaimed, "Oh, monsieur!" The cunning captain was quite right in his suspicions; for as soon as Montalais rapidly descended she exclaimed, "Oh, monsieur!" 14 14,15 substitution 6841_88291_000009_000006.wav Then one or the other threw off the rope. Homer rode away, coiling the rope as he went. Then one or the other threw off the robe. Homer rode away, coiling the rope as he went. 8 8 substitution 8297_275156_000024_000000.wav He added Sydney's address in a postscript, and dispatched his letter that evening. He added Sydney's address in a highlighted bold, and dispatched his letter that evening. 6 6,7 substitution 6123_59150_000007_000001.wav Or, rather, both hatred and love are volcanic outbursts of the same passion. Or, rather, both hatred and love are the same passion. 7,9 6,7 deletion 116_288048_000019_000004.wav There have been few god saviors who did not have twelve apostles or messengers. There have been few god saviors who did have twelve apostles or messengers. 8 7,8 deletion 3000_15664_000026_000004.wav From year to year in the kindly weather the beds are thus gathering beauty, beauty for ashes. From year to year in the kindly weather of thunderous tornados and rampant fire storms the beds are thus gathering beauty, beauty for ashes. 7,8 8,14 insertion 6313_76958_000032_000000.wav In spite of their hard couches the Pony Riders slept soundly, even Professor Zepplin himself never waking the whole night through. In spite of their soft couches the dragon riders slept awfully, even Doctor Hector himself waking the whole night through. 
4,15 4,14 substitution 2506_11278_000011_000000.wav We are three sisters, from seventeen to twenty two. We are six hundred siblings, from negative seventeen to twenty two. 2,4 2,6 substitution 700_122867_000012_000001.wav But please, Marilla, go away and don't look at me. But please, Marilla, come closer but don't look at me. 3,5 3,5 substitution 3660_172182_000013_000005.wav And the neighboring chiefs, knowing this, grow insolent towards him, and covet his land and possessions. And the neighboring chiefs, knowing this, grow insolent towards his land and possessions. 9,11 8,9 deletion 6123_59186_000015_000001.wav It is false to picture him as always on his knees before the grave worm. It is false to picture him as before the grave worm. 7,10 6,7 deletion 2803_154328_000034_000002.wav It was evidently the cue of both sides to be silent. It was evidently the cue of both to close their mouths and to be silent. 7 7,11 substitution 8297_275156_000008_000001.wav It was not the newspaper which he had bought at the station. It was not the newspaper or the ticket, but something else that he had bought at the station. 4,5 5,10 insertion 6313_66129_000013_000001.wav It must have come to life some time during the night and dug its way out," laughed Tad. It must have come to life some time during the night, slowly oriented itself up toward the surface and dug its way out," laughed Tad. 9,10 10,16 insertion 3663_172528_000026_000005.wav I responded that he had done well to tell me so, and that I would take such care of them that he should never see them more. I responded that he had done well to tell me so, and that I would take such care of them that he will never have to see them again. 22,26 22,28 substitution 8173_294714_000023_000004.wav There is a serious necessity for his getting out of prison. There is a serious necessity for his release from prison. 7,9 7,8 substitution 6841_88294_000031_000002.wav On the middle of his back knelt my one armed friend. 
And that sharp hook was caught neatly under the point of the Mexican's jaw. On the middle of his back knelt my one armed friend. And that sharp hook was caught neatly under the point of the man's jaw. 23 23 substitution 7976_110523_000012_000000.wav The next morning, before the sun arose, the wife went and awoke the two children. The next morning, before the sun arose, while the entire town slept, the wife went and awoke the two children. 6,7 7,11 insertion 1993_147965_000006_000003.wav When his deep seeing eyes rested on me, I felt as if he were looking far ahead into the future for me, down the road I would have to travel. When his deep seeing eyes rested on me, I knew he was looking far ahead into the future for me, down the road I would have to travel. 9,13 9,11 substitution 7697_105815_000023_000006.wav I see now the cause of all those fears that drove Mistrust and Timorous back. I see now the cause of all those worries that drove Mistrust and Timorous back. 8 8 substitution 116_288046_000004_000007.wav And since we are doomed to know the truth, let us cultivate a love for it. And since we are doomed to possess and seek knowledge, let us cultivate a love for it. 6,8 6,9 substitution 3000_15664_000035_000003.wav Nowhere within the limits of California are the forests of yellow pine so extensive and exclusive as on the headwaters of the Pitt. Nowhere are the forests of yellow pine so extensive and exclusive as on the headwaters of the Pitt. 1,5 0,1 deletion 174_84280_000004_000003.wav Nevertheless she was not all my life, nor the form of all my life. Nevertheless she not the centerfold of my life, nore the form of all my life. 2,7 2,8 substitution 5694_64025_000004_000006.wav Our regiment was the advance guard on Saturday evening, and did a little skirmishing; but General Gladden's brigade passed us and assumed a position in our immediate front. 
Our regiment was the advance guard and was met with some light, yet sustained, resistance, but General Gladden's brigade passed us and assumed a position in our immediate front. 6,13 6,14 substitution 3853_163249_000137_000000.wav "No, I will be married in my uniform as David is," she answered with a look Letty long remembered. "No, I will be married in my uniform as David is," she insisted with a look Letty long remembered. 12 12 substitution 6267_53049_000048_000004.wav I never knew what had become of Penelope. I never knew what had happened to Penelope. 5,6 5,6 substitution 2035_147961_000013_000003.wav At last he was shut off by a coughing fit which fairly choked him. At last he emerged and water in his lungs fairly choked him. 3,10 3,8 substitution 4831_18525_000037_000001.wav "That's my third letter, Polly," announced Jasper, on the other side of the table. "Now, I am going to begin on Joel's." "That's my third letter, Polly," announced Jasper, from behind the table. "Now, I am going to begin on Joel's." 7,11 7,8 substitution 6241_61946_000034_000000.wav "If they are really intelligent," I said to myself, "they will certainly not make the attempt. "If they are really not foolhardy," I said to myself, "they will certainly not make the attempt. 4 4,5 substitution 1630_96099_000031_000002.wav I had no fear of him, not till the very last, when he played me this evil turn. I had no fear of him, not till I saw him conspire to harm me, when he played me this evil turn. 8,10 8,14 substitution 2428_83699_000043_000001.wav There be a lot of luggage. He do say he's come to stay with you. There be a lot of luggage in the trunk. He do say he's come to stay with you. 5 5,8 substitution 5895_34615_000016_000001.wav Is there a providence of demons as well as of God? We put the question without answering it. Is there a providence of demons that exists to serve good men and make them evil? We put the question without answering it. 
6,10 6,15 substitution 2428_83699_000044_000002.wav We've lost the key of the cellar, and there's nothing out, except water, and I don't think you'd care for that. We've lost the key of the cellar, and there's nothing except water, and I don't think you'd care for that. 10 9,10 deletion 3660_6517_000056_000004.wav Williams had to confess he was beaten and must draw fires. Williams had to take an art class and must draw fires. 3,6 3,6 substitution 7601_291468_000005_000001.wav It was observed by a great projector of inland lock navigation, that rivers, lakes, and oceans were only formed to feed canals. It was observed by the captain of the ship that rivers, lakes, and oceans were only formed to feed canals. 4,10 4,8 substitution 3663_172528_000010_000006.wav Accordingly she allowed me twice to take as much as I could of the water, so that in good earnest I swallowed more than a flask full. Accordingly she allowed me once to take as much food as I wanted, and twice to take as much as I could of the water, so that in good earnest I swallowed more than a flask full. 3,4 4,13 insertion 700_122866_000021_000004.wav You ought to cultivate your imagination, you know. Miss Stacy says so. You ought to always be polite, you know. Miss Stacy says so. 3,5 3,5 substitution 84_121550_000074_000000.wav But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks, But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks, 3,11 3,11 substitution 4323_13259_000009_000002.wav It was true that the victory was won by a very meager majority. It was true that the victory was won by a majority. 10,11 9,10 deletion 8254_115543_000021_000002.wav "It is a rare sight now a days to see one of these white cobras." "It is a rare sight in this country to see one of these white cobras." 
5,7 5,7 substitution 4153_61735_000020_000001.wav It was a glance of inquiry, ending in a look of chagrin, with some muttered phrases that rendered it more emphatic. It was a look of disgust followed by a curled lip, with some muttered phrases that rendered it more emphatic. 3,11 3,10 substitution 1255_138279_000011_000000.wav The shape went slowly along, but without much exertion, for the snow, though sudden, was not as yet more than two inches deep. The shape went slowly along, for the snow, though sudden, was not as yet more than two inches deep. 5,8 4,5 deletion 5694_64025_000022_000014.wav The rope, however, was stronger than the mule's "no," and he was finally prevailed upon by the strength of the rope to cross the creek. The rope, however, was stronger than the mule's "no," and he was finally prevailed upon to cross the creek. 15,20 14,15 deletion 2277_149896_000025_000002.wav He pulled out his key and tried to insert it, but another key was on the inside. He pulled out his key and tried to fit it into the lock, only to discover that another key was on the inside. 8,10 8,16 substitution 6345_64257_000012_000002.wav Thus was she borne away captive of her dead, neither willing nor unwilling, of life and death equally careless. Thus was she borne away of life and death equally careless. 4,11 3,4 deletion 1993_147965_000003_000000.wav At about four o'clock a visitor appeared: mr Shimerda, wearing his rabbit skin cap and collar, and new mittens his wife had knitted. At about four o'clock a visitor appeared: we were shocked to see our reclusive neighbor out and about, wearing his rabbit skin cap and collar, and new mittens his wife had knitted. 7,8 7,17 substitution 2803_154320_000005_000012.wav john came up to him and said, "Your Lordship is looking out for land?" john came up to him and said, "I see that you're looking out for land?" 
7,9 7,10 substitution 700_122868_000033_000004.wav That scene of two years before flashed back into her recollection as vividly as if it had taken place yesterday. That scene of two years before flashed before her eyes as vividly as if it had taken place yesterday. 7,10 7,9 substitution 2902_9008_000005_000007.wav If the gods have deserted their oracles, they have not deserted the souls who aspire to them. If the gods have deserted their oracles, they have not as of yet fully deserted the souls who aspire to them. 9,10 10,13 insertion 5895_34629_000011_000000.wav Ursus had made his arrangements with the tavern keeper, Master Nicless, who, owing to his respect for the law, would not admit the wolf without charging him extra. Ursus had made his arrangements with the tavern keeper, Master Nicless, who, owing to his disdain for the law, would not admit the wolf without charging him extra. 15 15 substitution 1650_157641_000035_000000.wav Kingsley's devotion to smoke seems to have surprised Tennyson, who was no light smoker himself. Kingsley's devotion to smoke surprised Tennyson, who was no light smoker himself. 4,6 3,4 deletion 2035_147961_000017_000003.wav At midnight the parents of the bride said good bye to her and blessed her. At midnight the parents of the groom said good bye to her and blessed her. 6 6 substitution 6313_76958_000004_000000.wav In a few moments the sound of singing was borne to the ears of the campers. In a few moments singing was borne to the ears of the campers. 4,6 3,4 deletion 2902_9008_000014_000002.wav Strange! that men should be content to grovel, and be men, when they might rise to the rank of gods! Strange! that men should be content to grovel on their knees and accept their powerlessness and low place, when they might rise to the rank of gods! 7,10 7,17 substitution 4570_56594_000014_000000.wav "Yours is a great beef country, I believe," says the old gentleman. "Yours is a great chicken farm, I believe," says the old gentleman. 
4,5 4,5 substitution 8288_274162_000089_000000.wav "How very fortunate that is; he was looking for you, too." "How very fortunate that is; he was just here looking for you, too." 6,7 7,8 insertion 1630_73710_000016_000004.wav I know it must be more than a week; I know that that prospect was only held out by your affection. I know it must be more than a week; I know that that prospect was introduced several days ago, waiting to be considered by your affection. 15,17 15,22 substitution 4323_18416_000044_000000.wav "Yes, yes, of course; but you are too young to judge of such things," said the old gentleman decidedly, "as the giving away of property and all that." "Yes, yes, of course; but you are too young to have much knowledge of such things," said the old gentleman decidedly, "as the giving away of property and all that." 10 10,12 substitution 1462_170142_000040_000001.wav Bartley leaned his head in his hands and spoke through his teeth. Bartley leaned his head in his hands and spoke softly through his teeth. 8,9 9 insertion 84_121550_000126_000000.wav To the left hand I turned with that reliance With which the little child runs to his mother, When he has fear, or when he is afflicted, To the left hand I turned with that reliance With which the little child runs to his mother, When he fears anything that he sees around him, or when he is afflicted, 20,21 20,26 substitution 2035_147960_000003_000004.wav We might get some puppies, or owl eggs, or snake skins. We might get several colorful gemstones, or owl eggs, or snake skins. 3,4 3,5 substitution 6345_93302_000075_000009.wav He loved her with all his heart, and he, also, had what she had never suspected in him, the literary sense. He loved her with all his heart, and he, also, had what she had always hoped to be in him, the literary sense. 14,15 14,17 substitution 1686_142278_000007_000002.wav Margaret could not bear the sight of the suspense, which was even more distressing to her father than to herself. 
Margaret could not bear the sight of herself. 7,18 6,7 deletion 2277_149897_000021_000001.wav He tried to get the interest of things about him, but it was not to be. He tried to get the interest of things he saw around him, but it was not to be. 8 8,10 substitution 6123_59150_000010_000003.wav The man was not a thief; he was an honest man, in fact, and by a peasant's standard by no means poor. The man was not a thief; he was an honest man, in fact, and by no means poor. 15,18 14,15 deletion 6267_65525_000003_000000.wav "I want to run over and see how mrs Brixby is this evening, Siddy, and you must take care of the baby till I get back." "I want to run over and see how all the invited house guests have been liking this evening, and you must take care of the baby till I get back." 8,13 8,17 substitution 7850_281318_000008_000000.wav So, in a great company, they came fluttering, hopping, twittering up to the elm tree where Mother Magpie nestled comfortably in her new house. So, in a great company, they came fluttering, hopping, twittering up to the elm tree where the bird leader nestled comfortably in her new house. 16,17 16,18 substitution 5694_64038_000008_000003.wav The soldiers were in good spirits, but it was the spirit of innocence and peace, not war and victory. The soldiers were in good spirits, but not war and victory. 7,14 6,7 deletion 6467_97061_000022_000000.wav "Having made himself invisible, he entered without difficulty the apartment of the princess, and was astonished and enraged on finding her lying in your arms." "Having made himself invisible, he entered without the princess, and was astonished and enraged on finding her lying in your arms." 7,10 6,7 deletion 8254_84205_000031_000002.wav I should be running fast and dodging in and out among the rocks and trees. I should be running fast and dodging up and down and in and out among the rocks and trees. 
6,7 7,10 insertion 2803_154328_000083_000003.wav He was now the object of their anxiety, and whose absence was a black shadow between them and their happiness. He was now the object of their incessant admiration, and whose absence was a black shadow between them and their happiness. 7 7,8 substitution 7850_286674_000006_000003.wav Of course they breathed water like their neighbors, the fishes and the Tadpoles. Of course they breathed water like the fishes and the Tadpoles. 6,7 5,6 deletion 4570_102353_000014_000006.wav After some further discussion of the question, the visitors withdrew, dissatisfied with the result of the interview. After some further discussion of the question, the exhausted organizing committee for the occasion are still dissatisfied with the result of the interview. 8,9 8,15 substitution 1988_24833_000028_000000.wav I get the pillows comfortably arranged on the floor, with a big bottle of soda and a bag of popcorn within easy reach. I get the pillows scattered wildly on the floor, with a big bottle of soda and a bag of popcorn within easy reach. 4,5 4,5 substitution 1919_142785_000003_000002.wav In a short time, boil up the vinegar again, add pepper and ginger in the above proportion, and instantly cover them up. In a short time, boil up the water, add pepper and ginger in the above proportion, and instantly cover them up. 7,8 7 substitution 2506_11278_000007_000001.wav The Right Honourable was the son of a nobleman, and practised on an old lady. The Right Honourable was the son of a nobleman of the oldest sort, and practised on an old lady. 8,9 9,12 insertion 2803_161169_000011_000019.wav What do you think of that from the coal tar. What do you think of the coal tar. 5,6 4,5 deletion 3536_8226_000019_000000.wav "It's love for her as has done it then," said Bozzle, shaking his head. "It's love for her as has been done before then," said Bozzle, shaking his head. 
6,7 6,8 substitution 4831_25894_000013_000000.wav But see, then, it is cold in the streets; the wind bites, and the snow freezes one's fingers. But see, then, it is cold in the streets; the hail scares all of the critters, and the snow freezes one's fingers. 10,11 10,15 substitution 7697_245715_000006_000002.wav Therefore, in the state of innocence, children would not have been deprived of the use of their limbs.|Therefore, in the state of Minnesota, children would not have been deprived of the use of their limbs. Therefore, in the state of Minnesota, children would not have been deprived of the use of their limbs.|Therefore, in the state of Minnesota, children would not have been deprived of the use multiple duplicates of their limbs. 5 5 substitution|substitution 6295_64301_000013_000007.wav It cried aloud that eternity was very long, and like a great palace without a quiet room.|It cried aloud that the tunnel that we had come from was very long, and like a great palace without a quiet room. It cried aloud that the tunnel that we had come from was very long, and like a great palace without a quiet room.|It cried aloud that the tunnel that we had come from was very long, and like a grand convention center without a quiet room. 4|11,12 4,10|17,19 substitution|substitution 2412_153954_000009_000001.wav When I had shown them what I did with it, they were astonished but not displeased, and seemed to like the smell.|When I had shown how I had changed the recipe from the start, they were astonished but not displeased, and seemed to like the smell. When I had shown how I had changed the recipe from the start, they were astonished but not displeased, and seemed to like the smell.|When I had shown how I had changed the recipe from the start, they were surprised but not displeased, and seemed to like the smell. 
3,4|12 4,6|15 insertion|substitution 1462_170138_000006_000001.wav When they entered the stage box on the left the first act was well under way, the scene being the interior of a cabin in the south of Ireland.|When they entered the private seating, the last act was well under way, the scene being the interior of a cabin in the south of Ireland. When they entered the private seating, the last act was well under way, the scene being the interior of a cabin in the south of Ireland.|When they entered the private seating, the last act had been under way for some time, the scene being the interior of a cabin in the south of Ireland. 4,10|12,15 4,7|9,15 substitution|substitution 2412_153954_000007_000003.wav In fact, one of them was plainly very much out of health, and coughed violently from time to time in spite of manifest efforts to suppress it.|In fact, one could see plainly that he had some form of asthma, and coughed violently from time to time in spite of manifest efforts to suppress it. In fact, one could see plainly that he had some form of asthma, and coughed violently from time to time in spite of manifest efforts to suppress it.|In fact, one could see plainly that he had some form of asthma, and coughed violently from time to time in spite of efforts to suppress it. 3,11|22 3,12|22,23 substitution|deletion 84_121550_000147_000000.wav Therefore my answer is with greater care, That he may hear me who is weeping yonder, So that the sin and dole be of one measure.|Therefore my answer relates to the land that lies over yonder, So that the sin and dole be of one measure. Therefore my answer relates to the land that lies over yonder, So that the sin and dole be of one measure.|Therefore my answer relates to the land that lies over yonder, So that joy and despair is of one measure. 
3,14|18,22 3,9|13,16 substitution|substitution 3170_137482_000032_000000.wav An hour later, two noblemen, friends of the senator, came in, one a few minutes after the other.|An hour later, two noblemen carrying great swords came in, one a few minutes after the other. An hour later, two noblemen carrying great swords came in, one a few minutes after the other.|An hour later, two noblemen carrying great swords came in, one parrying a deadly strike with his sword, one lunging after the other. 4,8|12,14 4,7|11,19 substitution|substitution 1630_102884_000006_000000.wav In the old order the king was given to understand that he was the freest individual in the world.|In the old order the king was to understand that he was the freest individual in the world. In the old order the king was to understand that he was the freest individual in the world.|In the old order the king was to understand that he was the freest in the world. 7|15 6,7|13,14 deletion|deletion 8173_294714_000033_000000.wav "Promise that you won't ask me to borrow money of you for mr Van Brandt," she rejoined, "and I will accept your help gratefully."|"Promise that you won't ask me to borrow any money from the bank for the bail of you for mr Van Brandt," she rejoined, "and I will accept your help gratefully." "Promise that you won't ask me to borrow any money from the bank for the bail of you for mr Van Brandt," she rejoined, "and I will accept your help gratefully."|"Promise that you won't ask me to borrow any money from the bank for the bail of you for mr Van Brandt," she rejoined, "and I accept your help gratefully." 7,8|19 8,14|25,26 insertion|deletion 8297_275156_000023_000003.wav Shall I say that she may expect an early visit from you, when I see her to morrow?|Shall I say that she certainly may expect an early visit from you, when I see her to morrow? 
Shall I say that she certainly may expect an early visit from you, when I see her to morrow?|Shall I say that she certainly may expect an early visit from you, when my maid carries my message requesting that I see her to morrow? 4,5|12,13 5|14,20 insertion|insertion 1686_142278_000039_000002.wav I think I could do anything but that: the idea of her distress turns me sick with dread.|I think I could do anything else: the idea of her distress turns me sick with dread. I think I could do anything else: the idea of her distress turns me sick with dread.|I think I could do anything else: the idea turns me sick with dread. 6,7|10,12 6|8,9 substitution|deletion 5338_284437_000013_000000.wav "Come!" commanded the woman who led the party; "you three must follow me to the presence of Tourmaline.|"Come!" commanded the old wizard who led the party; "you three must follow me to the presence of Tourmaline. "Come!" commanded the old wizard who led the party; "you three must follow me to the presence of Tourmaline.|"Come!" commanded the old wizard who led the party; "you three must quickly make your way to the presence of Tourmaline. 3|11,12 3,4|12,15 substitution|substitution 2086_149220_000027_000001.wav "As to his character, we need not discuss its points; they have already been settled by a competent tribunal, or one which called itself competent.|"As to his points; they have already been settled by a competent tribunal, or one which called itself competent. "As to his points; they have already been settled by a competent tribunal, or one which called itself competent.|"As to his points; they have already been settled by a competent law firm, or one which called itself competent. 3,8|18 2,3|12,13 deletion|substitution 1919_142785_000047_000001.wav It grows somewhat like the lily of the valley, but its height is about three feet.|It grows somewhat like the sunflower or the lily of the valley, but its height is about three feet. 
It grows somewhat like the sunflower or the lily of the valley, but its height is about three feet.|It grows somewhat like the sunflower or the lily of the valley, but its height is just over four feet and width is about three feet. 4,5|12,13 5,7|16,22 insertion|insertion show_2t9Kk4FHmiEkjNPJctidN6-7yurNieQHNgkfAk9eE4uCy.wav Yeah, I'll tell you one guy wouldn't want to fish against. Yeah, I'll tell you one guy to fish against. 6,7 5,6 deletion show_2tTA2xYpcS5YuTIXzXakTu-72nwjiYkDKGYImtEsqN5KA.wav A number of family photographs and we couldn't identify the people in the photographs. A number of family photographs and we absolutely couldn't recognize the individuals in those photographs. 7,12 7,13 substitution show_2c04iZbAAIYmZrTIRgggNc-5kMxWwMd2NvZkSyiFZaULP.wav And now we have to fundamentally change this and change that we have the best economy in the world and the best decade we've ever had in human history. And now we have to fundamentally change this and ensure that we maintain the best economy in the world and the best decade we've ever had in human history. 9,12 9,12 substitution show_2CfS1shsOSeK8SjwiEV8du-22Vump0EG42cvL1I9JqRBV.wav And there are a lot of places in the movie where they could have just slipped it in just a little bit just to confirm that it happened. And there are actually multiple moments throughout the film where they could have just slipped it in just a little bit just to confirm that it happened. 3,9 3,8 substitution show_2T4Ue1V9k0S4uiTgUkPKEZ-1373jRsEGJsUlJrxsVnYWz.wav And you were developing it and you were working with I would assume School administrators. And you were developing it and you were also working closely with I would assume School administrators. 8 8,10 substitution show_2cYRReFdJFlfB2BULrbqfM-55Z6AspULdusRxBQysUp9e.wav I would not know what it mean means to latch onto a body, to be a single cell within that body. 
I would not know how to explain what it actually means to latch onto a body, to be a single cell within that body. 3,4 4,6 insertion show_2CLetGT20MFsHqfeBN3fYl-4f1fxmGJW9MrfGxD7pCFvC.wav It went out and it worked and it's scaled and ah it mine's great. It went out and it scaled and it worked and ah it mine's great. 5,8 5,8 substitution show_2cSm7FgVuDH3IbmSlWkZzH-0hFwD7TSXO5c69BPwJzOfx.wav We would just be open and willing to adopt whatever child God brought to her life. We would just be excited to welcome whatever child God brought into her life. 4,13 4,11 substitution show_2T0wGFb5714hwSBVJgOXny-6s3OXLpYBaeweAF7RJL8kT.wav Well, we played China which we lost two times already that really helped us maybe if they weren't there maybe because I've done that because I heard their voices and lifted our Spirits. Well, we played China twice and we lost in both games that really helped us maybe if they weren't there maybe because I've done that because I heard their voices and lifted our Spirits. 4,9 4,10 substitution show_1TuEAft9VZR2lIZ1G2EceZ-11ZkFljNCjwftz1SIt7HFU.wav Ended at improving air quality within its borders. Ended at significantly improving air quality within its borders. 1,2 2 insertion show_2Tq9eBynjdfY1BasX45Krb-2mlnB6AXQTdUIXevl9cVlc.wav Your 99 cents a month will go a long way to improving my podcast. Your 99 cents a month will go a long way in terms of improving podcast. 9,10 10 insertion show_2c2IJzenX6Q6gJxc2aGRf8-5eUimgKIjnroT9AZIRQr7p.wav Sort of famous spots to travel and hike and I've been to Lake Louise and I took Lake Louise and and that photo ended up in National Geographic online. Sort of famous spots to travel and hike and I've been to Lake Louise and that photo ended up in National Geographic online. 15,20 14,15 deletion show_2cARopVSXsbrWNyt0qEfWf-6hOLbfEH7wqMmi16SiIRSm.wav Secondary sleeves, pants, socks, gloves, and shoe color edit. Secondary sleeves, pants, socks, gloves, hat, tie, and shoe color edit. 
4,5 5,6 insertion show_28r3cKdOurjFZECEslXgC2-4jGHOLOoOdqe4LIf8TDUHP.wav Come full circle back to talking about something like that. Come full circle back to visiting some exotic new places like that. 5,7 5,9 substitution show_2t5hhjQSp2hYEutTsCpEwF-1jLJLlnxbhrsjyu4nwBKgr.wav The reason I said I was 8 is because nothing in my brain told me to use water or to even remove my underwear. The reason I said I was 8 is because nothing in the instruction manual told me to use water or to even remove my underwear. 11,12 11,13 substitution show_2c2IJzenX6Q6gJxc2aGRf8-4cWYuGIkcM95UsvHUEpJK4.wav why we keep going back just because we want to be able to document these places while there aren't so many people visiting. why we keep going back just because we want to be able to document so many people visiting. 14,18 13,14 deletion show_1Cwk6m9lXuEd2rilGhWiGr-6QZBZLHGD3DCpZDETpjodI.wav No to the chemical pollution, air pollution, and the destruction of the environment caused by factories and the manufacturing industry. No to the chemical pollution, air pollution, no to the killing of plants and wildlife and the destruction of the environment caused by factories and the manufacturing industry. 6,7 7,14 insertion show_2c2IJzenX6Q6gJxc2aGRf8-5sNE1N7WKOd53y40RUJOyD.wav really want to push with my channel like photography isn't some serious thing like that really want to push with my channel like because you know fishing isn't some serious thing like that 8 8,11 substitution show_1CdMgzPowibFyvgH7hnPZJ-1eO9qoY5JAcN2nNP2675tl.wav Positive way to improve your leadership and to improve the atmosphere within your team and the culture of your company. Positive way to improve your commitment to raising awareness of environmentalist causes within your team and the culture of your company. 5,10 5,11 substitution show_2T23esVRXBfFb5vigvG7A5-2ndcCw02nZHgp0WKFQ8lHe.wav It's like you kind of like the best way I can describe it is like you kind of as you navigate your way through a creative field. 
It's like you kind of like the best way I can describe it is that you need to always remember your personal vision as you navigate your way through a creative field. 13,17 13,22 substitution show_28gAb6BYOPQTAwtd6JivzK-5gbhBB6vzrxSApOcmcVTs5.wav want you guys to be able to feel comfortable being vulnerable with us. want you guys to be able to feel comfortable with being honest and truthful with us. 9,10 9,13 substitution show_2cDEjUoE1xIZqEMHdy2iLg-3BhVKbLFPasrUOxLgZNUbd.wav that schedule is one per week and it will probably be like a Wednesday night thing because I plan on doing one to two videos per week. that schedule is one per week and you will start to see a lot more content arriving because I plan on doing one to two videos per week. 7,15 7,16 substitution show_2CLetGT20MFsHqfeBN3fYl-0lZ9jnYVgXx7HmKUcdTnJO.wav hedge a little bit in a traditional Market since and whether its buying a bunch of gpus and then reselling them for 2x the price ah that happened. hedge a little bit in a traditional Market by buying up a whole lot of single family homes and then reselling them for 2x the price ah that happened. 8,16 8,17 substitution show_2TPvj8tyUhY2UHOzU9kyu4-6lnZyS5yzd3S4Vaqw5TrHy.wav Okay, so then he moves on he says so I understand now before we look at everything in more detail. Okay, so then he moves on he says so I understand now before inspecting everything in more detail. 13,15 13 substitution show_2cH1Sf7Tg3TiDdGpD3oLiR-0opjgwiSz3AWOOoE49L9pi.wav The Patriots will just skiing blocking but their backs and tight ends, maybe tighten the formations a little bit. The Patriots will just focus on their wide receivers and tight ends, maybe tighten the formations a little bit. 4,8 4,8 substitution show_28hFGrNqCyS73hMP94FALm-3NRcwegtutZLbw1YC2DhhM.wav The GP did not recommend talking therapies. The GP did not suggest talking therapies. 
4 4 substitution show_2CJ6f4oLCccT3fsUaWAk9k-3fVgo6u94DJHpK7uP1Qb7V.wav And, like comment subscribe give me feedback give me feedback. And, like comment subscribe give me your thoughts and any feedback. 6,8 6,9 substitution show_2cNIhBNwWmamJs75G3tMxY-4UyKjZff8srwoG8A71ql2K.wav feel safe to get naked emotionally and mentally to share how they overcome their pain and suffering and how they grew into these bright lights. feel safe to get so incredibly overwhelmed emotionally and mentally to share how they overcome their pain and suffering and how they grew into these bright lights. 4 4,6 substitution show_2cNIhBNwWmamJs75G3tMxY-1xe6sJ3hUH8wCHnPxCjbwu.wav And so I looked at it as very much like an organic problem and I said okay my brain needs this. And so I looked at it as very much like a complex issue and I said okay my brain needs this. 10,12 10,12 substitution show_28qfNqUaAXdF3TcEGapJ1d-7uzzKzrT6ggIoqv9gRPz7E.wav But yeah, I was I was never the best student first year college was actually really really good and I even had a full time job at a time. But yeah, I would say that I was never the best student in class, but first year college was actually really really good and I even had a full time job at a time. 3,9 3,14 substitution show_1tl5wg2z0fzjWR18MHKARa-3auJGSBu9ERKSjw44eKkhj.wav It was because partly was because I had such an Early Peek. It was because at least partly because I had such an Early Peek. 3,4 3,5 substitution show_1ChaMDlb8CNR7Bta8ZxODC-6gI5xAKjYcPiQ2cANcnG9q.wav And if we don't make it a priority the distractions will get us one of the things I love so much about Jesus is if you go back and look at the gospels, he was so focused. And if we don't make it a priority the distractions will get us one of the things I admire about Jesus is if you go back and look at the gospels, he was so focused. 
18,20 18 substitution show_2chnqxY9vGUWIxn4JvvRpZ-5WanEbFbdssEwK77TcezxC.wav really supportive friends and firefighters in the fire service who've inspired me to go out there and do this. really supportive colleagues, friends, and family and they inspired me to go out there and do this. 2,9 2,7 substitution show_1CHJvc14dYPq0IsX5T0YAP-2FKEiw2NrWejqlw9IkaB1X.wav Within I'd say within like half a week things changed in my house the energy changed in my house the relationship with my wife started to change and I was like does the magic started to happen? Within I'd say within like half a week things changed in my house the relationship with my wife started to change and I was like does the magic started to happen? 14,19 13,14 deletion show_2cyslpwM45TtVfznjVlCnL-4ECn8gmQeSRJJmM3eHg4TF.wav His music worked better when he did live action TV shows for suspense and humanistic reactions to like scenes and intense situations. His music worked better when he did live action TV shows for suspense and emotional responses to like scenes and intense situations. 14,15 14,15 substitution show_2TXTWqxZkLHF6k4cd5F8XN-1N8w7dUDxxqtRErZObJrbG.wav Helping me find my identity and high school, which I also was fortunate enough to find this amazing woman. Helping me find my high school, which I also was fortunate enough to find this amazing woman. 4,5 3,4 deletion show_2Cz04p7U4u2lLSofHLYIeH-0KUpyjDzatq7f6TWjWgAdf.wav passion and you're not gonna see those results and it's gonna stop you from actually making it get into it because you love it. passion and you're not gonna like them and it's gonna stop you from actually making it get into it because you love it. 5,7 5,6 substitution show_2TjptLx9uQUaHhp6YB8jhW-0YIfNwpxL2ztlaSsmBTCKL.wav Another thing is anything inside the parentheses turns the opposite so negative becomes positive and a positive becomes negative. 
Another thing is that in this equation anything inside the parentheses turns the opposite so negative becomes positive and a positive becomes negative. 2,3 3,6 insertion show_2Chp07kvTN1qDImtrnXm4O-7mdEKKOHue6R5MLo283EQm.wav hope hope you got a better feel of organic versus paid look definitely keep an eye out for upcoming episodes because I'm a dive a lot more deep into the paid social world. hope hope you got a better feel of how instagram stars fund their extravagant vacations definitely keep an eye out for upcoming episodes because I'm a dive a lot more deep into the paid social world. 8,11 8,14 substitution show_2ctsjdVxkuzqftlC9TJASy-6iadzuoEBJ9AOLXaXPmagP.wav So if you've been following my story, you will remember that I said earlier in this podcast that the Grammy nominations came out. So if you've been following my story, you will remember that I said earlier that this week we had super exciting stuff to talk about because Grammy nominations came out. 14,18 14,25 substitution show_1cPkxhnrYWUvCzd0uXMKwo-1auCDHN3NrKq4Bn0OrE0lM.wav freedom is made with a key. freedom is made by effort not with a key. 2,3 3,5 insertion show_2CZeMpXywYmWy53SV2kWEm-2Ic0xbN3defufBYR46ooEi.wav So for more craziness now that French was conquered we have to join forces to Great Britain. So for more craziness now that French was conquered by the Germans, we have to join forces to Great Britain. 8,9 9,11 insertion show_2CyWjLhTGlpGHeSpJOvxj2-6Hvd5G0lyzP62VYPmP1jQj.wav It was one of those things, you know, you have project sometimes you start on the bottom and sometimes you start at the top. It was one of those things, you know, some people just start on the bottom and sometimes you start at the top. 8,12 8,10 substitution show_2cNgsFoVxaxZkUnVU3ehQu-0mgpNxV3cnsvy7RXtF9OHv.wav Twenty years later it became 20 thousand times worse. Twenty years later it became thousand times worse. 
5 4,5 deletion show_2czbki8aNirvUjlYcO3I1t-7kdfTr9l9Egod1iFPzIqkK.wav As a body and some individuals on ways and means we were speaking about possibly just passing the sales price disclosure on residential property. As a body and some individuals on ways and means we were speaking about possibly just banning them on residential property. 16,20 16,17 substitution show_2cFZSZNdkxKdiTEE7yrAMB-06KUOjkKFLQgxfnC59GMtT.wav Tyler also introduces the lack of closure that will bother him increasingly throughout the album's front half. Tyler also introduces in great detail its outcome on the album as a whole, the lack of closure that will bother him increasingly throughout the album's front half. 2,3 3,13 insertion show_2C2dO6pWL4cPzOJ2Bu7QRA-3I34aJLdGXgCEuY6rd90Tm.wav And then the other matchup is Seattle visits Philadelphia now Philadelphia has a worse record, but because they were not able to win their division. And then the other matchup is Seattle visits Philadelphia now Seattle has the better record, but because they were not able to win their division. 10,13 10,13 substitution show_2c2IJzenX6Q6gJxc2aGRf8-0z0etCBM2PrHOLc9gxc25E.wav More of a base and infrastructure to tell those stories rather than doing it out of a out of a tent with solar power. More of a base and infrastructure to fight these battles instead of out of a tent with solar power. 7,16 7,11 substitution show_1CxjAV2kY4pypL256BmRQ6-531d49VH5hVVMzXfKhsMFf.wav So the fourth episode May the fourth be with you. So the fourth place winner will chat with you. 3,7 3,6 substitution show_2TI9Upbk0gXEdxsYTPzB9W-4x4pLTV2ZiA98TIIWkK0JH.wav This year and and the word started to spread in the lacrosse community and just in general. This year and and the word started to spread rapidly and without any sense of control in the slightest in the lacrosse community and just in general. 8,9 9,18 insertion show_2cOpF3UhdxdvlZZzyOVPHt-3GXDpd2ZHpy9YTMkLAjuZq.wav Joey Scott and Richie all sat together at the creek. 
Joey Scott and Albert enjoyed fishing trout at the creek. 3,6 3,6 substitution show_2TXUkJOq3oBEn2ROormwza-6ogJCqF2Ya7qZ2L23AAoPd.wav Are equipped with sensors to monitor the proximity of surroundings and can open in even the tightest parking spaces. Are equipped with sensors to detect collisions and can open in even the tightest parking spaces. 5,9 5,6 substitution show_28OttmVaPSfuB6e4cqX0yu-2tB2ldmTxblsCsC8QH7jnm.wav And we're at this point. And we're all extremely excited at this point. 1,2 2,4 insertion show_28DAnHzOfbUoRkpj5OMqVI-58LUphKYwegjJP2ZQFjmUH.wav You know best to study every day after your classes, especially because you know, you're still in the mood you're still in that flow. You know best to study when well rested and well fed and when you're still in that flow. 5,18 5,12 substitution show_2T8QRK60cWaPQflfo6Wuc4-4oTO10xL7hQQS2fuXBy1d7.wav In the pursuit of lightness minimal stress ultimate fulfillment. In the pursuit of calm serenity an escape from stress ultimate fulfillment. 4,5 4,8 substitution show_2CN1XNYxo4NFClfUajCtSM-1PrlYFjZzosbVg4BvKzbLJ.wav and and you know that she has been around for a few decades now longer and it has such a story that it it it was established in basketball. and and you know that she has been around for as long as many young people today have been watching and it has such a story that it it it was established in basketball. 10,14 10,19 substitution show_2t94ceh3K4qorKbKXJw7NV-5r2eumfkrw5Ym2WYexqEpK.wav Hello and welcome to the first of hopefully many you cannot ingest podcast. Hello and welcome to the second of hopefully many you cannot ingest podcast. 5 5 substitution show_2c3EDnMjSm9bAr1fQgLmMg-0jzn6k4JPy4s3XOiQZFFF8.wav They sit down that these plot points need to happen because they have a whole Board of index cards full of notes that need to happen in the story. 
They sit down that these plot points need to happen because they have a whole Board of like fifty index cards full of notes that need to happen in the story. 16,17 17,18 insertion show_2cQVtitXsGYcp9kIYBi9VJ-7wZR6aZIx7PTYFcShbre2k.wav Community so I didn't ever feel that openness until I moved back and I wasn't in that realm anymore. Community so I didn't ever feel that openness until I moved across the country and I wasn't in that realm anymore. 11 11,13 substitution show_1C49KB0vYZsFe9eoFAr2Cq-6ZkOgQuv6e4y74xhDNKc4Y.wav If you ask me what you are I would have Alex Caruso as the starting point guard for the Lakers. If you ask me what you are I would have Alex Caruso out on the court playing as the starting point guard for the Lakers. 11,12 12,16 insertion show_2CMZqwsTyimKMEGMIdOFCz-2xWHgQryE2ruRadxVVbdbD.wav Really going to talk about why us and why now for this podcast. Really going to talk about who we are why we're here and why now for this podcast. 5,6 5,10 substitution show_2t5PIVQePC6L3CFRpAUnaf-0f0tl83ucovdSpJHoftEU5.wav No words just lightning breaking darkness and crashing into the Earth with brilliant presence. No words just lightning breaking darkness and crashing into the surface of the Earth with brilliant presence. 9,10 10,12 insertion show_2cGQMNoS6MuKFozuNYjCOQ-6qmAgAKLoYpSknCgQ0y6ET.wav For making the title though because I need to get my numbers way up before I get there, but I'm gonna get there title of Iceland is definitely going to sign me and um, yeah. For making the title though because I need to get my numbers way up before I get there, but I'm gonna get there title of Iceland is going to sign me and um, yeah. 27 26,27 deletion show_2tgc74udMU420iVPvl597O-0fORVMXyI1aCUobzMKm5Ll.wav Feeling into the eyes and the temples and the entire facial structure. Feeling into the eyes and getting a sense of the entire facial structure. 
5,7 5,8 substitution show_2csQINhTs2YQOWmpmy5gmJ-3J4UWcEHj2lQvk4lRog4LZ.wav It had proven to be an exciting challenge the last time he subdued a couple and Israel's expecting his second victim to be by soon. It had proven to be an exciting challenge the last time he subdued a couple and he couldn't wait for his victim to be by soon. 16,19 16,20 substitution show_2tYdWKnaDR4D2qgCHml2Ax-1ntgzgJweV4WPav6lZUeK9.wav Have three rounds of attack before you switch on the defense and then you're going to have three rounds of Defense before you switch on to attack. Have three rounds of attack before you switch on the defense and then you're going to have three rounds of Defense after which you repeat again on to attack. 21,23 21,25 substitution show_2TuwSyFIHWD1UxyBCMLnWT-4N24WWvGmHzFS0BoqfGNRE.wav break out of their shell a little bit and you see oh gosh, they're really way more Hardy than I thought or funny than I thought or whatever because it's the moment they've drench themselves like the other day. break out of their shell a little bit and you see oh gosh, they're really way more Hardy than I thought or funny than I thought or whatever because it's the moment they recovered from harsh cold weather of the other day. 32,35 32,38 substitution show_1CzCdrVrUH7JwgyZnVGYLh-6Q5gndks7qNDYlyWQNThsb.wav Hey guys, Tim Jennings here with soul heart with another episode of search engine optimization tips and tricks trying to get your site found on Google and other search engines like that. Hey guys, Tim Robinson hosting the show today with another episode of search engine optimization tips and tricks trying to get your site found on Google and other search engines like that. 3,7 3,7 substitution show_1CUdmqDR1A47vPMjsiK6m2-0vStekPNMu57qySDbgL4Bz.wav And I think that's uh it's it's a fascinating uh Dynamic but uh I wanna thank you. And I think that's absolutely a super fresh, new, and exciting Dynamic but uh I wanna thank you. 
4,9 4,10 substitution show_1tUHam5eF5aw1ANOoTTNHY-00rYk6fUFND3sgPVnFfOx5.wav You know your body first and foremost because that is so important. You know your friends and family first and foremost because that is so important. 3 3,5 substitution show_1Tazwk3AUA0uz6jQk0X2qx-6AnUhuFsREJdKTZ5YtJN67.wav I actually got I took the BET and I bet on the Cavs winning. I actually got I took the BET and I bet on the Patriots with Tom Brady winning. 12 12,15 substitution show_1c7paeaWBSC8lM2WmoE7oI-7ngmygKXeMj6llnxB9E5W5.wav We also have to be able to observe ourselves and how we behave, why do we refuse to rest? We also have to be able to observe ourselves and contemplate our decisions, such as why do we refuse to rest? 10,12 10,14 substitution show_1tPIbAQXvAfaZ9w2aUDVn5-2Otc04LmTGhUMuBF8U36Bt.wav teachers in elementary school and middle school are not that different. teachers in elementary school and in most universities are not that different. 5,6 5,7 substitution show_1CnVxnXxFzJyqAVe0gxVao-2MZDt0KXGXSM6ciptUHsJI.wav maybe take like two shots a day of like that drink and I'm sorry sure when I saw my doctor that Monday she did an ultrasound. maybe take like a day off or something because after the weekend when I saw my doctor that Monday she did an ultrasound. 3,14 3,11 substitution show_1TH2TkfOKETXMhheVKhnSF-4OR6mYxdRwIdRfZAZyIg0d.wav Interesting and I think this is a comment a much more common phenomenon nowadays is that she just found out that she has a fifth sibling? Interesting and I think she just found out that she has a fifth sibling? 4,15 3,4 deletion show_1CmHgwWKnKTU94RPoVJVhm-2FjSP20WACh8xlrfgo27lv.wav Now maybe should we why don't we push this out to situation Nation? Now maybe should we think about if we want to push this out to situation Nation? 4,6 4,9 substitution show_1cY9S6222J0jGzYbHGQKPs-7ImJPzwLhq2ZjRrbYBMANb.wav Okay, what's um, what's one genre people will be shocked to know that you read. 
Okay, what's um, what's one genre people will be shocked to know you really like to read. 12,13 12,15 substitution show_1T69Xe0EJ4n0gOO4RD9qv0-42elfJEncMhPZCtpdwUX2Q.wav To buy that vodka or putting the money at the dock passive and there's one other thing that actually I'm not sure if you you guys are really aware. To buy that vodka or any other alcohol you need to show proof of age and there's one other thing that actually I'm not sure if you you guys are really aware. 5,11 5,14 substitution show_1t3ZatwPEux3wUnXMUE62z-2VItlcCcQodDtxIm1hEgLc.wav I knew that if I didn't just start even if it wasn't perfect that I would never start. I knew that if I didn't just get to work on it even if it wasn't perfect that I would never start. 7 7,11 substitution show_1T6df6cejtcf12QJZN0yUu-5mvPGk6dS60f0LCPGjbR4D.wav Just because someone isn't saying anything while you're talking doesn't mean they are processing a single word. Just because someone is quiet while you're talking doesn't mean they are processing a single word. 3,5 3,4 substitution show_1cKkutPWS7rRmyOtrSwouo-79txjRa3xuTvzkjLhRvBgR.wav Is an infinitely constricting Paradox if I try and Define how much needs to be done before I can enjoy an emotional experience. Is an infinitely constricting Paradox if I try and figure out everything I have to do before I can enjoy an emotional experience. 9,15 9,15 substitution show_1C98g10rH9mj7aiTgRGEtH-416OTsevOhXjjB6asFyyK9.wav And like I look back to our branding uh two years ago, like you can see what the website looked like back then horrendous. And like I look back to our branding back when we were first starting out as a company, like you can see what the website looked like back then horrendous. 8,11 8,17 substitution show_1CMwbrEPRtk46eseEmxFOd-6FlNy7MMX7L4J5KCyVYWYM.wav If you screenshot somebody though, like if they send a picture and you take a screenshot, I think that person knows that you have screenshotted them. 
If you screenshot somebody though, like if they send a picture and you take a screenshot, I think there's a little notification in the app telling them that you have screenshotted them. 18,20 18,26 substitution show_1TTQPQzpjtXPKadzUx5vo6-1nVtCzKPNliNLJ91Ck5TSq.wav So my last question for you guys talk to me about your most memorable moment or the aha moment or the that feeling of just it was so wonderful to be part of this club. So my last question for you guys talk to me about that feeling of just it was so wonderful to be part of this club. 11,20 10,11 deletion show_1c4MlC6ClLyP8osRCtdTUs-2GXhBpBNH9GrdHorlNBcPo.wav Personal trainers pretty much pretty much a teacher is just a teacher teaching you in assisting Youth of the goals that you want. Personal trainers pretty much are just coaches I mean really they're like pretty much a teacher is just a teacher teaching you in assisting Youth of the goals that you want. 3,4 4,11 insertion show_1coo0trh2Do1KR6ev6Fczv-3Zr9rSN8B9pjuNMjZVcYVM.wav So that's why there's a definite divided in people's opinion on this and that's why it's been such a highly talked about issue. So that's why there's a definite divided in people's opinion on this which is likely also why it's been such a highly talked about issue. 12,13 12,15 substitution show_1cVKtsokch166IPm3tRg0U-4XdMT9MFto7G1IWnaH7fZV.wav Prohibits that so by getting rid of homework. Prohibits that so we're getting rid of homework. 3 3 substitution show_1CLAwGQAgTZIzV0TBj254v-51nbmcVD8allN0g5hLbhN7.wav Went Caspian and said Lord King slay me speedily as a great traitor for buy my silence. Went Caspian and said if you'll see me slain I ask only that it be done speedily as a great traitor for buy my silence. 4,7 4,15 substitution show_1TsDtgHbctWFu1B856QLI0-6LykTfQQJFPhKfoVzNdtgA.wav Environment has changed and yes, it is easy to say our environment is out to get us and it might not be your fault. 
Environment has changed and yes, it is easy to say our environment is dying right now and it might not be your fault. 13,16 13,15 substitution show_1cOsDxbQjLADedwZaG7Bm1-6mLz726LYCcgN69RWWlrOJ.wav Fast cars, that had the nice clothes, that had the money, they was criminals. Fast cars, that had the nice clothes, that had expensive gold watches, that had the money, they was criminals. 8,9 9,13 insertion show_1ttEqOUCJnc7JAGNCWaAqq-471ymSOOetlSAecJK9Bfhr.wav Kind of a great time he kind of gets to pair with uh Quinn Priester and those to kind of get to come up through the minor leagues together. Kind of a great time he gets to team up with Quinn Priester and those to kind of get to come up through the minor leagues together. 6,12 6,10 substitution show_1CB8GHgtZAnsn6ihBUOWKo-1DyrYzUYIq5Zx4edeHge5Y.wav back the Coca Cola, and then everyone be happy and buy it more, then they make more money. back the Coca Cola, and then everyone buy it more, then they make more money. 7,9 6,7 deletion show_1tzXR6tf3WGxV9nNWolDMN-5ZayFE8KG7W18jlM9t5d44.wav And, just either let the balloon go she would count down uh and then we would all think of what we wanted to let go and then we would let the balloon go or blow out our candles and then it was gone. And, just either let the balloon go she would count down uh and then we would celebrate this moment togather and then we would let the balloon go or blow out our candles and then it was gone. 16,24 16,19 substitution show_1tAewNZS0q8QPQpIIEUQQ0-3juUg3wFn3w7OFFo0sGe6R.wav When they killed them they turned back into the packed humans that were there. When they killed them they turned back into the terrified ponies that were there. 9,10 9,10 substitution show_1T15rqmPErKONqSx9rzr9H-726cSurFjtPFS9fiEAMT6b.wav Use those email templates verbatim verbatim, but make sure y'all very careful with them because there's merge codes that they have preselected that might not be. 
Use those email templates verbatim, but make sure y'all very careful with them because there's merge codes that they have preselected that might not be. 4 3,4 deletion show_1cdpRq4rWNv1xYw3yab7b7-1BDuArBpFR2bZmv4cNafcl.wav So I'm pretty sure I wanted to be a teacher so I could just tell everyone what to do.|So I'm pretty convinced that I wanted to be a teacher so I could just tell everyone what to do. So I'm pretty convinced that I wanted to be a teacher so I could just tell everyone what to do.|So I'm pretty convinced that I wanted to be a teacher so that I can tell everyone what to do. 3|11,13 3,4|12,14 substitution|substitution show_2C0AgUOt4eCULjFjb3mynN-6Uv8Y4yw6o4V2zCKZrzrPg.wav See why it's extremely valuable to it's kind of like it's kind of like having a wall hack to watch a demo.|See why it's extremely important right? it's kind of like it's kind of like having a wall hack to watch a demo. See why it's extremely important right? it's kind of like it's kind of like having a wall hack to watch a demo.|See why it's extremely important right? it's kind of like having a rough time to watch a demo. 4,5|10,17 4,5|10,13 substitution|substitution show_2czpNd58pfuIxOCvO2czHu-2U9S7MpxvVUR6sPTdFRUWR.wav So um yea that's it for this episode of the podcast what I will let say just eh that just as we come to the end of the podcast.|So um yea that's pretty much all for this episode of the podcast what I will let say just eh that just as we come to the end of the podcast. So um yea that's pretty much all for this episode of the podcast what I will let say just eh that just as we come to the end of the podcast.|So um yea that's pretty much all for this episode of the podcast as we come to the end of the podcast. 
4|11,19 4,6|12,13 substitution|deletion show_2c04iZbAAIYmZrTIRgggNc-4zacmnwJi3osMV5beOYGLu.wav Then at the same time in my mind mentally I kept screaming and yelling and mentally I'm thinking to myself I'm screaming and yelling at the top of my lungs.|Then at the same time I feel like I'm trapped mentally and I'm thinking to myself I'm screaming and yelling at the top of my lungs. Then at the same time I feel like I'm trapped mentally and I'm thinking to myself I'm screaming and yelling at the top of my lungs.|Then at the same time I feel like I'm trapped mentally and I'm thinking to myself I'm screaming at the top of my lungs. 5,15|22,23 5,11|17,18 substitution|deletion show_2co4uJEBlUoi9JanlRz6ls-1QGXaI6j7lYMr3jofmM7Vy.wav If you ever wondered how I make my podcast guys, well I use anchor anchor is free.|If you ever thought about how I make my podcast guys, well I use anchor anchor is free. If you ever thought about how I make my podcast guys, well I use anchor anchor is free.|If you ever thought about how I make my podcast guys, well the key is a tool called anchor anchor is free. 3|11,12 3,4|12,17 substitution|substitution show_1TMFh5H29pHWPD6KizMrlq-0nCgrfdU9zWUl6XJUnbsUX.wav served an incredibly big purpose doing that that being said all of that was me trying to instill those things into my life.|served an incredibly long time doing that that being said all of that was me trying to instill those things into my life. served an incredibly long time doing that that being said all of that was me trying to instill those things into my life.|served an incredibly long time doing that that being said all of that was me trying to instill things into my life. 
3,4|18 3,4|17,18 substitution|deletion show_28ZmEMgyEUzHCmoW9DdwK3-12YTpVg7Ko2mRc4Si6OtDu.wav Be a good stress as well something that you know, you could be controlling something that won't you know, take a mental toll on you.|Be a good stress as well you know, you could be controlling something that won't you know, take a mental toll on you. Be a good stress as well you know, you could be controlling something that won't you know, take a mental toll on you.|Be a good stress as well you know, you could be controlling something that will not take a mental toll on you. 6,7|16,18 5,6|14,15 deletion|substitution show_2toX0f3dPmI8gmUSOKZicx-1fJ8FUGSZLyb5fpbd3QDSi.wav This year has been like my entire Journey so far in the music business and I'm just looking forward to what's to come.|This year has been the best part of my Journey so far in the music business and I'm just looking forward to what's to come. This year has been the best part of my Journey so far in the music business and I'm just looking forward to what's to come.|This year has been the best part of my Journey so far in the acting business and I'm just looking forward to what's to come. 4,6|12 4,8|14 substitution|substitution show_2txZW3TWakg6Pr41kcbiA6-2MlY5WOs8ScY5zTfGRJHDc.wav And but with this job it was like I was staring at a computer like for 10 hours 8 hours and then maybe doing therapy like for nine hours a week.|And but with this job I had to just be like reading or typing away on a computer like for 10 hours 8 hours and then maybe doing therapy like for nine hours a week. And but with this job I had to just be like reading or typing away on a computer like for 10 hours 8 hours and then maybe doing therapy like for nine hours a week.|And but with this job I had to just be like reading or typing away on a computer like for 10 hours 8 hours and then maybe doing digital art for nine hours a week. 
5,11|24,25 5,15|28,29 substitution|substitution show_2CN1XNYxo4NFClfUajCtSM-75tS2ZoJCscJetPxi0aukT.wav So, you know like there was a there was an example where I bought two or three pair and literally just gave them to Goodwill as I was moving because the I never wore them they still had tags on them.|So, you know like there was an example where I bought two or three pair and literally just gave them to Goodwill as I was moving because the I never wore them they still had tags on them. So, you know like there was an example where I bought two or three pair and literally just gave them to Goodwill as I was moving because the I never wore them they still had tags on them.|So, you know like there was an example where I bought two or three pair and then just gave them to Goodwill as I was moving because the I never wore them they still had tags on them. 6,8|19 5,6|16 deletion|substitution show_2Cp52s1B4vepSJi2F8gmU9-36PtNWkMxWZtxbqBJ4mUXD.wav And I want to mention that there were a couple of teachers who really helped me during that time.|And I want to mention that there were a couple of great teachers who really helped me during that time. And I want to mention that there were a couple of great teachers who really helped me during that time.|And I want to mention that there were a couple of great teachers who noticed and reached out when I was struggling and helped me during that time. 10,11|13 11|14,22 insertion|substitution show_2T23esVRXBfFb5vigvG7A5-6JivmdWNP3UnZiIOplv953.wav It's hard to say how I get things off the ground because I just get going like I don't know how to explain it.|It's hard to say how I get the system running so quickly because I just get going like I don't know how to explain it. It's hard to say how I get the system running so quickly because I just get going like I don't know how to explain it.|It's hard to say how I get the system running so quickly because I just get going I don't know how to explain it. 
7,10|16 7,11|16,17 substitution|deletion show_2T3Pjyw6MJEwPE9uixrYak-7M5817WSsGaldlQfZptcyf.wav I know now how extremely lucky that truly is what a blessing Not only was I going to be a mom but besides nausea all my other symptoms went away.|I know now how incredibly lucky I really was, what a blessing Not only was I going to be a mom but besides nausea all my other symptoms went away. I know now how incredibly lucky I really was, what a blessing Not only was I going to be a mom but besides nausea all my other symptoms went away.|I know now how incredibly lucky I really was, what a blessing Not only was I going to be a mom but except for nausea all my other symptoms went away. 4,8|22 4,8|22,23 substitution|substitution show_1C5B3zuyd67j7v9XRKNb2L-4Az7OglwsgYR94ikvOZdCf.wav Past something and I have so many things that I have planned for this podcast, but first before we get into any of that I'm going to introduce myself.|Past something and I have so many things that I have planned to talk about today, but first before we get into any of that I'm going to introduce myself. Past something and I have so many things that I have planned to talk about today, but first before we get into any of that I'm going to introduce myself.|Past something and I have so many things that I have planned to talk about today, but first before we get into any of that I would like to introduce myself. 12,14|24,25 12,15|25,27 substitution|substitution show_1c8f0MS5LcfbSvwexFC9mn-1MY1u1xOyiFAWXGRqyPJj7.wav You know with hesitation everything is counted to the T and says if every drink is measured, how are you going to give a regular an honest poor?|You know with hesitation everything is counted to the T and says if you're calculating the nutritional information, how are you going to give a regular an honest poor? 
You know with hesitation everything is counted to the T and says if you're calculating the nutritional information, how are you going to give a regular an honest poor?|You know with hesitation everything is counted to the T and says if you're calculating the nutritional information, how are you going to provide the same service to an honest poor? 13,16|22,24 13,17|23,27 substitution|substitution ================================================ FILE: cog.yaml ================================================ # Configuration for Cog ⚙️ # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md build: gpu: true system_packages: - libgl1-mesa-glx - libglib2.0-0 - ffmpeg - espeak-ng python_version: "3.11" python_packages: - torch==2.1.0 - torchaudio==2.1.0 - xformers - phonemizer==3.2.1 - whisperx==3.1.1 - openai-whisper>=20231117 run: - git clone https://github.com/facebookresearch/audiocraft && pip install -e ./audiocraft - pip install "pydantic<2.0.0" - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.0/pget_linux_x86_64" && chmod +x /usr/local/bin/pget - mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth" "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" predict: "predict.py:Predictor" ================================================ FILE: config.py ================================================ import argparse def MyParser(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) # general training parser.add_argument("--seed", type=int, default=1) parser.add_argument("--precision", type=str, default="float16") parser.add_argument("--num_workers", type=int, default=8) parser.add_argument("--resume", action="store_true", default=False) parser.add_argument("--tb_write_every_n_steps", type=int, default=100) 
parser.add_argument("--print_every_n_steps", type=int, default=400) parser.add_argument("--val_every_n_steps", type=int, default=800) parser.add_argument("--lr", type=float, default=0.05) parser.add_argument("--batch_size", type=int, default=100, help="this is the effective batch size, no matter whether using gradient_accumulation_steps, not used if we specified max_num_tokens") parser.add_argument("--max_num_tokens", type=int, default=100000, help="max number of encodec tokens per gpu, this is only used when using dynamic batching, will ignore batch size. Note this is the final effective batch size per GPU, i.e. gradient accumulated batch size per gpu") parser.add_argument("--val_max_num_tokens", type=int, default=None, help="FOR validation") parser.add_argument("--num_buckets", type=int, default=6, help='used for dynamic batching, bucketing the samples based on the number of tokens') parser.add_argument("--dynamic_batching", type=int, default=0) parser.add_argument("--weight_decay", type=float, default=1e-2) parser.add_argument("--warmup_fraction", type=float, default=0.01, help="use linear warmup, the proportion of the training steps that are used for warming up") parser.add_argument("--num_epochs", type=int, default=10) parser.add_argument("--num_steps", type=int, default=None, help="if not None, will ignore n_epochs and use num_steps as the total number of amount of training, can try e.g. 400000 i.e. 
400k steps") parser.add_argument("--gradient_accumulation_steps", type=int, default=1) parser.add_argument("--gradient_clip_val", type=float, default=1.0, help="the value for torch.nn.utils.clip_grad_norm_(), not used if we use ScaledAdam optimizer") parser.add_argument("--early_stop_step", type=int, default=3200, help="stop training after this many steps of non-improvement") parser.add_argument("--early_stop_threshold", type=float, default=-1.0, help="early stop after the improvement is below this threshold for certain number of steps") # optimizer focused parser.add_argument("--optimizer_name", type=str, default="AdamW", help="can also use ScaledAdam, in which case we'll also use the Eden scheduler") parser.add_argument("--reduce_lr_start_step", type=int, default=3000, help='after which significantly reduce the lr. a param for the eden optimizer') parser.add_argument("--pseudo_epoch_size", type=int, default=3000, help="only use for Eden scheduler.") parser.add_argument("--reduce_lr_start_epoch", type=int, default=4) parser.add_argument("--clipping_update_period", type=int, default=600) # path parser.add_argument("--exp_dir", type=str, default=None, help="will be combined with dataset name") parser.add_argument("--dataset", type=str, help="e.g. 'libritts', 'gigaspeech', they are folder name in the data dir also") parser.add_argument("--dataset_dir", type=str, help="need to be compatible with corresponding dataset py file") parser.add_argument("--phn_folder_name", type=str, default="phonemes", help="for libritts I also have arpa phns, in which case should be phonemes_arpa") parser.add_argument("--encodec_folder_name", type=str, default="encodec_16khz_4codebooks", help="folder where encodec codes are stored") parser.add_argument("--manifest_name", type=str, default="manifest", help="metadata filename") # data focused parser.add_argument("--pad_x", type=int, default=1, help="whether or not always pad x to have text_max_length. 
select 1 to get the maximal memory consumption, but the actual case should be smaller, better to have it being 0") parser.add_argument("--audio_max_length", type=float, default=20, help="in second, crop or drop the audio is length is longer than this") parser.add_argument("--audio_min_length", type=float, default=2, help="in second, drop the audio if length is shorter than this") parser.add_argument("--text_max_length", type=int, default=400, help='if too long, we crop or drop') parser.add_argument("--text_min_length", type=float, default=10, help="if too short, will drop") parser.add_argument("--encodec_sr", type=int, default=50, help="for my encodec that takes 16kHz audio with a downsample rate of 320, the codec sample rate is 50Hz, i.e. 50 codes (x n_codebooks) per second") parser.add_argument("--drop_long", type=int, default=0, help="if this is true, will drop example whose encodec sequence or phone sequence is too long, rather than cropping, to reduce hellucination") # encodec and token rearrangement parser.add_argument('--mask_len_min', type=int, default=1, help='Minimum mask length') parser.add_argument('--mask_len_max', type=int, default=600, help='Maximum mask length') parser.add_argument("--eos", type=int, default=-1, help="this is to be used with reduced_eog, where we end the utterance with eos, and end the generated segment with eog, also when this is used, the n_special should be 4") parser.add_argument("--reduced_eog", type=int, default=0, help="for the non-final segments, do not insert eog at the end, this could hopefully solve the early stopping issue when doing tts") parser.add_argument("--special_first", type=int, default=0, help="if 1, need to have special tokens to be the first few tokens, e.g. 0, 1, 2, which means we need to adjust the preprocessing and postprocessing of the encodec codes. 
note that we hard coded to have 3 special tokens") parser.add_argument("--n_special", type=int, default=3, help="empty, eog, pad, (eos)") parser.add_argument("--codebook_weight", type=str, default=None, help="e.g. ['5','1','0.5','0.1']") parser.add_argument("--max_mask_portion",type=float,default=0.7,help="should mask a utterance for more than this portion") parser.add_argument("--max_n_spans", type=int, default=3, help='maximal number of spans, only use when using multicm3, this is used to decide number of mask_embedding, and max clamp value if use Poisson distribution, if use uniform distribution to sample number of spans if will be uniform(1,max_n_spans)') parser.add_argument("--shuffle_mask_embedding", type=int, default=0, help="whether shuffle the mask embedding, so that mask:0 is not the most well trained, default is not shuffling. The default has it's benefit, as it make sure that mask:0 always appear the first") parser.add_argument("--mask_sample_dist", type=str, default="poisson1", help="uniform or poissonx, e.g. 
poisson1, meaning the parameter lambda is 1, it will most likely sample 1 masks") parser.add_argument("--min_gap", type=int, default=5, help="after sampled starts, delete later one if it closer to the former start than the min_gap") parser.add_argument('--n_codebooks', type=int, default=4) parser.add_argument('--text_vocab_size', type=int, default=100, help='Size of text vocabulary') parser.add_argument('--text_pad_token', type=int, default=100, help='padding of the text tokens, not attended') parser.add_argument('--audio_vocab_size', type=str, default='2048', help="Size of audio vocabulary") parser.add_argument("--empty_token", default=2048, type=int, help="indicating the no token at the position for the codebook") parser.add_argument('--eog', type=int, default=2049, help='End of generation token') parser.add_argument('--audio_pad_token', type=int, default=2050, help='padding of the encodec codes, not attended') # model focused parser.add_argument('--d_model', type=int, default=2048, help='Model dimension') parser.add_argument('--audio_embedding_dim', type=int, default=2048, help='dimension for encodec continues embedding (before being quantized)') parser.add_argument('--text_embedding_dropout', type=float, default=0.1, help='Dropout for text embedding') parser.add_argument('--audio_embedding_dropout', type=float, default=0, help='Dropout for audio embedding') parser.add_argument('--text_positional_embedding_dropout', type=float, default=0.1, help='Dropout for text positional embedding') parser.add_argument('--audio_positional_embedding_dropout', type=float, default=0.1, help='Dropout for audio positional embedding') parser.add_argument('--trm_dropout', type=float, default=0.1, help='Dropout for transformer') parser.add_argument('--nhead', type=int, default=16, help='Number of attention heads') parser.add_argument('--num_decoder_layers', type=int, default=16, help='Number of decoder layers') parser.add_argument('--load_model_from', type=str, default=None, 
import os
import torch
import random
import copy
import logging
import shutil


class dataset(torch.utils.data.Dataset):
    """Phoneme / EnCodec-code dataset read from a manifest on disk.

    Files are resolved relative to ``args.dataset_dir``:

    * ``<manifest_name>/<split>.txt`` -- tab-separated rows; field 1 is the
      file stem and the last field is the number of codec frames.
    * ``<phn_folder_name>/<stem>.txt`` -- one line of space-separated phonemes.
    * ``<encodec_folder_name>/<stem>.txt`` -- one line of integer codes per
      codebook (only the first ``args.n_codebooks`` lines are used).
    * ``vocab.txt`` -- ``"<id> <phoneme>"`` per line; it is also copied into
      ``args.exp_dir``.

    Args:
        args: namespace holding the options read below (paths, length limits,
            padding tokens, ...). ``sep_special_token`` is optional and is
            treated as false when absent.
        split: one of ``'train'``, ``'validation'``, ``'test'``.
    """

    def __init__(self, args, split):
        super().__init__()
        self.args = args
        self.split = split
        assert self.split in ['train', 'validation', 'test']
        manifest_fn = os.path.join(self.args.dataset_dir, self.args.manifest_name, self.split+".txt")

        # Blank lines are skipped so a trailing newline cannot break int().
        with open(manifest_fn, "r") as rf:
            rows = [line.strip().split("\t") for line in rf if line.strip()]

        min_frames = self.args.encodec_sr*self.args.audio_min_length
        max_frames = self.args.encodec_sr*self.args.audio_max_length
        self.data = []
        self.lengths_list = []
        for row in rows:
            n_frames = int(row[-1])
            if n_frames < min_frames:
                continue
            if self.args.drop_long and n_frames > max_frames:
                continue
            self.data.append(row)
            self.lengths_list.append(n_frames)
        logging.info(f"number of data points for {self.split} split: {len(self.lengths_list)}")

        # phoneme vocabulary
        vocab_fn = os.path.join(self.args.dataset_dir,"vocab.txt")
        shutil.copy(vocab_fn, os.path.join(self.args.exp_dir, "vocab.txt"))
        with open(vocab_fn, "r") as f:
            # The original filter (len(l) != 0) could never reject a line read
            # from a file; test the stripped text so blank lines are skipped.
            entries = [line.strip().split(" ") for line in f if line.strip()]
        self.phn2num = {entry[1]:int(entry[0]) for entry in entries}

        # NOTE(review): the four literals below are identical empty strings in
        # this copy of the file; they look like angle-bracket tags that were
        # lost in extraction -- verify against upstream before relying on them.
        self.symbol_set = set(["", "", "", ""])

    def __len__(self):
        return len(self.lengths_list)

    @staticmethod
    def _failed_item():
        """Placeholder returned when a sample cannot be loaded; dropped by collate()."""
        return {
            "x": None,
            "x_len": None,
            "y": None,
            "y_len": None,
            "y_mask_interval": None, # index y_mask_interval[1] is the position of start_of_continue token
            "extra_mask_start": None # this is only used in VE1
        }

    def _load_phn_enc(self, index):
        """Read phoneme ids and codec codes for manifest row ``index``.

        Returns:
            ``(x, y)`` where ``x`` is a list of phoneme ids and ``y`` is a list
            of ``args.n_codebooks`` lists of ints. On any failure the error is
            logged and ``([], [[]])`` is returned.
        """
        item = self.data[index]
        pf = os.path.join(self.args.dataset_dir, self.args.phn_folder_name, item[1]+".txt")
        ef = os.path.join(self.args.dataset_dir, self.args.encodec_folder_name, item[1]+".txt")
        try:
            with open(pf, "r") as p, open(ef, "r") as e:
                phns = [l.strip() for l in p.readlines()]
                assert len(phns) == 1, phns
                # symbols listed in self.symbol_set are dropped, as they are not in training set annotation
                x = [self.phn2num[phn] for phn in phns[0].split(" ") if phn not in self.symbol_set]
                encos = [l.strip().split() for k, l in enumerate(e.readlines()) if k < self.args.n_codebooks]

                assert len(encos) == self.args.n_codebooks, ef
                if self.args.special_first:
                    # special tokens occupy the first n_special ids, so shift the codes
                    y = [[int(n)+self.args.n_special for n in l] for l in encos]
                else:
                    y = [[int(n) for n in l] for l in encos]
        except Exception as err:
            logging.info(f"loading failed for {pf} and {ef}, maybe files don't exist or are corrupted")
            logging.info(f"error message: {err}")
            return [], [[]]

        return x, y

    def __getitem__(self, index):
        """Load one sample, then crop/pad it to the configured lengths.

        Returns:
            dict with ``x`` (LongTensor of phoneme ids), ``x_len``, ``y``
            (LongTensor, one row per codebook) and ``y_len``; or the all-``None``
            failure dict when the sample could not be loaded.
        """
        x, y = self._load_phn_enc(index)
        x_len, y_len = len(x), len(y[0])

        if x_len == 0 or y_len == 0:
            return self._failed_item()

        while y_len < self.args.encodec_sr*self.args.audio_min_length:
            assert not self.args.dynamic_batching
            index = random.choice(range(len(self))) # regenerate an index
            x, y = self._load_phn_enc(index)
            x_len, y_len = len(x), len(y[0])
        if self.args.drop_long:
            while x_len > self.args.text_max_length or y_len > self.args.encodec_sr*self.args.audio_max_length:
                index = random.choice(range(len(self))) # regenerate an index
                x, y = self._load_phn_enc(index)
                x_len, y_len = len(x), len(y[0])

        # A replacement sample may itself have failed to load (zero lengths end
        # the loop above); report it as a failure instead of an empty example.
        if x_len == 0 or y_len == 0:
            return self._failed_item()

        # Not every argument parser defines sep_special_token; default to False
        # (pad with the dedicated pad tokens) instead of raising AttributeError.
        sep_special_token = getattr(self.args, "sep_special_token", False)

        ### padding and cropping below ###
        # adjust the length of encodec codes, pad to max_len or randomly crop
        orig_y_len = copy.copy(y_len)
        max_len = int(self.args.audio_max_length * self.args.encodec_sr)
        if y_len > max_len:
            audio_start = random.choice(range(0, y_len-max_len))
            for i in range(len(y)):
                y[i] = y[i][audio_start:(audio_start+max_len)]
            y_len = max_len
        else:
            audio_start = 0
            if not self.args.dynamic_batching:
                pad_value = 0 if sep_special_token else self.args.audio_pad_token
                pad = [pad_value] * (max_len - y_len)
                for i in range(len(y)):
                    y[i] = y[i] + pad

        # adjust text
        # if audio is cropped, and text is longer than max, crop max based on how audio is cropped
        if audio_start > 0 and len(x) > self.args.text_max_length: # start text the way audio started
            x = x[int(len(x)*audio_start/orig_y_len):]
            if len(x) > self.args.text_max_length: # if text is still longer than max, cut the end
                x = x[:self.args.text_max_length]

        x_len = len(x)
        if x_len > self.args.text_max_length:
            # audio was not cropped but the text is too long: take a random window
            text_start = random.choice(range(0, x_len - self.args.text_max_length))
            x = x[text_start:text_start+self.args.text_max_length]
            x_len = self.args.text_max_length
        elif self.args.pad_x and x_len <= self.args.text_max_length:
            pad_value = 0 if sep_special_token else self.args.text_pad_token
            x = x + [pad_value] * (self.args.text_max_length - x_len)
        ### padding and cropping above ###

        return {
            "x": torch.LongTensor(x),
            "x_len": x_len,
            "y": torch.LongTensor(y),
            "y_len": y_len
        }

    def collate(self, batch):
        """Merge samples into batched tensors, dropping samples that failed to load.

        Returns:
            dict with ``x``, ``x_lens``, ``y``, ``y_lens`` and two boolean masks,
            ``text_padding_mask`` and ``audio_padding_mask``, that are True at
            positions at or beyond each sample's length. At least one sample in
            the batch must have loaded, otherwise stacking fails.
        """
        out = {key:[] for key in batch[0]}
        for item in batch:
            if item['x'] is None: # deal with load failure
                continue
            for key, val in item.items():
                out[key].append(val)
        res = {}
        if self.args.pad_x:
            res["x"] = torch.stack(out["x"], dim=0)
        else:
            res["x"] = torch.nn.utils.rnn.pad_sequence(out["x"], batch_first=True, padding_value=self.args.text_pad_token)
        res["x_lens"] = torch.LongTensor(out["x_len"])
        if self.args.dynamic_batching:
            if out['y'][0].ndim==2:
                res['y'] = torch.nn.utils.rnn.pad_sequence([item.transpose(1,0) for item in out['y']],padding_value=self.args.audio_pad_token)
                res['y'] = res['y'].permute(1,2,0) # T B K -> B K T
            else:
                assert out['y'][0].ndim==1, out['y'][0].shape
                res['y'] = torch.nn.utils.rnn.pad_sequence(out['y'], batch_first=True, padding_value=self.args.audio_pad_token)
        else:
            res['y'] = torch.stack(out['y'], dim=0)
        res["y_lens"] = torch.LongTensor(out["y_len"])
        res["text_padding_mask"] = torch.arange(res['x'][0].shape[-1]).unsqueeze(0) >= res['x_lens'].unsqueeze(1)
        res["audio_padding_mask"] = torch.arange(res['y'][0].shape[-1]).unsqueeze(0) >= res['y_lens'].unsqueeze(1)
        return res
if __name__ == "__main__":
    # Script flow:
    #   1. phonemize every transcript of every split and save one file per segment
    #   2. write the phoneme vocabulary
    #   3. encode the audio of every split with the EnCodec model and save the codes
    import logging
    formatter = (
        "%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d || %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    args = parse_args()

    import os
    import numpy as np
    import torch
    import tqdm
    import time
    from datasets import load_dataset, DownloadConfig

    from tokenizer import TextTokenizer, tokenize_text

    # get the path
    phn_save_root = os.path.join(args.save_dir, args.dataset_size, "phonemes")
    codes_save_root = os.path.join(args.save_dir, args.dataset_size, "encodec_16khz_4codebooks")
    vocab_fn = os.path.join(args.save_dir, args.dataset_size, "vocab.txt")
    os.makedirs(phn_save_root, exist_ok=True)
    os.makedirs(codes_save_root, exist_ok=True)

    def sort_by_audio_len(lens):
        """Log length statistics and return the indices of ``lens`` sorted longest first."""
        inds = np.argsort(lens).tolist()
        logging.info(f"longest: {lens[inds[-1]]*args.model_code_sr} encodec codes, {lens[inds[-1]]:.2f} sec.")
        logging.info(f"shortest: {lens[inds[0]]*args.model_code_sr} encodec codes, {lens[inds[0]]:.2f} sec.")
        logging.info(f"median: {lens[inds[len(inds)//2]]*args.model_code_sr} encodec codes, {lens[inds[len(inds)//2]]:.2f} sec.")
        logging.info(f"95 percentile longest: {lens[inds[int(len(inds)*0.95)]]*args.model_code_sr} encodec codes, {lens[inds[int(len(inds)*0.95)]]:.2f} sec.")
        return inds[::-1]

    def write_array_to_txt_file(array, filename):
        """Write one space-separated row per line, without a trailing newline."""
        with open(filename, 'w') as f:
            for a in array[:-1]:
                f.write(' '.join(map(str, a))+'\n')
            f.write(' '.join(map(str, array[-1])))

    ### phonemization
    # load tokenizer
    # load the encodec model
    from audiocraft.solvers import CompressionSolver
    model = CompressionSolver.model_from_checkpoint(args.encodec_model_path)
    model = model.cuda()
    model = model.eval()
    text_tokenizer = TextTokenizer()

    # https://github.com/SpeechColab/GigaSpeech
    # there are only four different punctuations
    # need to check whether there are other < started strings
    # NOTE(review): in this copy of the file the keys of punc2sym / gar2sym and
    # the values of word2sym are identical blank or empty strings; they look like
    # angle-bracket tags lost in extraction. They are reproduced unchanged here --
    # verify against upstream.
    punc2sym = {" ": ",", " ": ".", " ": "?", " ": "!"} # note the space in front of each punc name
    gar2sym = {"": "#%#", "": "##%", "": "%%#", "":"%#%"} # so that they are savely keep as the original sym when using tokenize_text
    punc2sym.update(gar2sym)

    word2sym = { "h æ ʃ h ɐ ʃ p ɚ s ɛ n t": "", "h æ ʃ p ɚ s ɛ n t h æ ʃ": "", "p ɚ s ɛ n t h ɐ ʃ p ɚ s ɛ n t": "", "p ɚ s ɛ n t p ɚ s ɛ n t h æ ʃ": ""}
    forbidden_words = set(['#%#', '##%', '%%#', '%#%'])

    dc = DownloadConfig(cache_dir=args.download_to)
    stime = time.time()
    logging.info("loading the dataset...")
    gs = load_dataset("speechcolab/gigaspeech", args.dataset_size, use_auth_token=True, cache_dir = args.download_to, download_config=dc)
    logging.info(f"time spend on loading the dataset: {time.time() - stime:.2f} seconds")

    splits = ['validation', 'test', 'train']

    logging.info(f"gigaspeech dataset {args.dataset_size} info: {gs}")
    logging.info(f"phonemizing...")
    phn_vocab = set()
    all_lens = []

    # you will see a ton of [WARNING] words_mismatch.py:88......, it's not a issue
    for split in tqdm.tqdm(splits):
        skip = 0
        logging.info(f"now processing split {split}...")
        for item in tqdm.tqdm(gs[split]):
            save_fn = os.path.join(phn_save_root, item['segment_id']+".txt")
            text = item['text']
            # transcripts that already contain a placeholder symbol would be
            # confused with a replaced tag, so they are skipped
            if any(word in forbidden_words for word in text.split(" ")):
                logging.info(f"skip {item['segment_id']}, because it contains forbiden words. It's transcript: {text}")
                skip += 1
                continue
            for k, v in punc2sym.items():
                text = text.replace(k, v)
            phn = tokenize_text(text_tokenizer, text)
            phn_seq = " ".join(phn)
            # map the phonemized placeholder symbols back to single tokens
            for k, v in word2sym.items():
                phn_seq = phn_seq.replace(k, v)
            phn_tokens = phn_seq.split(" ")
            phn_vocab.update(phn_tokens)
            all_lens.append(len(phn_tokens))
            with open(save_fn, "w") as f:
                f.write(phn_seq)
        logging.info(f"split {split} has {len(gs[split])} samples in total, skipped {skip} due to forbiden words")

    # Materialise the vocabulary once; ids follow the set's iteration order.
    vocab_list = list(phn_vocab)
    print(f"phn vocab size: {len(vocab_list)}")
    print("phn sequence stats: ")
    print(f"longest: {max(all_lens)}")
    print(f"shortest: {min(all_lens)}")
    print(f"median: {np.quantile(all_lens, 0.5)}")
    print(f"95 percentile longest: {np.quantile(all_lens, 0.95)}")
    print("write vocabulary to ", vocab_fn)
    with open(vocab_fn, "w") as f:
        # "<id> <phoneme>" per line, no newline after the last entry
        f.write("\n".join(f"{i} {phn}" for i, phn in enumerate(vocab_list)))

    class mydataset(torch.utils.data.Dataset):
        """Thin wrapper over one split of ``gs`` that tolerates unreadable rows."""

        def __init__(self, split):
            super().__init__()
            self.data = gs[split]

        def __len__(self):
            return len(self.data)

        def __getitem__(self, ind):
            """Return (segment_id, audio, sr, text, begin_time, end_time), or six Nones on failure."""
            try:
                # Index the row once instead of once per field.
                row = self.data[ind]
                segment_id = row['segment_id']
                audio = torch.from_numpy(row['audio']['array']).float()
                sr = row['audio']['sampling_rate']
                text = row['text']
                begin_time = row['begin_time']
                end_time = row['end_time']
            except Exception:
                # Was a bare ``except:``, which also swallowed KeyboardInterrupt
                # and SystemExit; unreadable rows are still skipped.
                return None, None, None, None, None, None

            return segment_id, audio, sr, text, begin_time, end_time

        def collate(self, batch):
            """Gather the fields of all successfully loaded rows into lists."""
            res = {'segment_id': [], "audio": [], "sr": [], "text": [], "begin_time": [], "end_time": []}
            for item in batch:
                if item[0] is not None:
                    res['segment_id'].append(item[0])
                    res['audio'].append(item[1])
                    res['sr'].append(item[2])
                    res['text'].append(item[3])
                    res['begin_time'].append(item[4])
                    res['end_time'].append(item[5])
            return res

    ## encodec codes extraction
    logging.info("encodec encoding...")
    train_dataset = mydataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.mega_batch_size, shuffle=False, drop_last=False, num_workers=args.n_workers, collate_fn=train_dataset.collate)
    validation_dataset = mydataset('validation')
    validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=args.mega_batch_size, shuffle=False, drop_last=False, num_workers=args.n_workers, collate_fn=validation_dataset.collate)
    test_dataset = mydataset('test')
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.mega_batch_size, shuffle=False, drop_last=False, num_workers=args.n_workers, collate_fn=test_dataset.collate)
    splits = ['validation', 'test', 'train']
    loaders = [validation_loader, test_loader, train_loader]
    # splits = ['validation'] # for debug
    # loaders = [validation_loader]
    for split, loader in zip(splits, loaders):
        skip = 0
        logging.info(f"now processing split {split}...")
        mega_n_steps = int(np.ceil(len(gs[split]) / args.mega_batch_size))
        logging.info(f"partition the split {split} into {mega_n_steps} parts, each has {args.mega_batch_size} samples")
        for m, mega_batch in enumerate(loader):
            logging.info(f"====================================")
            logging.info(f"====================================")
            logging.info(f"now processing mega step {m+1}/{mega_n_steps}")
            # durations in seconds, taken from the segment timestamps
            lengths = np.array(mega_batch['end_time']) - np.array(mega_batch['begin_time'])
            sorted_inds = sort_by_audio_len(lengths)
            # iterate backwards so deleting an entry does not shift the ones still to visit
            for j in range(len(sorted_inds))[::-1]:
                if lengths[sorted_inds[j]] < 0.2 or lengths[sorted_inds[j]] > args.len_cap: # skip samples that are too short (shorter than 0.2s), or too long (longer than args.len_cap seconds)
                    skip += 1
                    del sorted_inds[j]

            n_steps = int(np.ceil(len(sorted_inds) / args.batch_size))
            for n in tqdm.tqdm(range(n_steps), disable=True):
                inds_used = sorted_inds[n*args.batch_size:(n+1)*args.batch_size]
                audio_batch = [mega_batch['audio'][ind] for ind in inds_used]
                segment_id_batch = [mega_batch['segment_id'][ind] for ind in inds_used]
                padded_wav = torch.nn.utils.rnn.pad_sequence(audio_batch, batch_first=True).unsqueeze(1) # [B, T] -> [B, 1, T]
                # named batch_lens so the phoneme statistics list all_lens is not overwritten
                batch_lens = [lengths[ind] for ind in inds_used]
                with torch.no_grad():
                    # NOTE(review): batch_lens holds durations in seconds while
                    # --max_len is documented as a length in samples, so with the
                    # default values this split never triggers. The comparison is
                    # left as is; confirm the intended unit before changing it.
                    if max(batch_lens) > args.max_len and len(batch_lens) > 1: # NOTE decrease args.max_len if OOM, or chunk it into more than 2 forward passes
                        codes = []
                        inwav = padded_wav.cuda()
                        codes.append(model.encode(inwav[:len(inwav)//2])[0].cpu())
                        codes.append(model.encode(inwav[len(inwav)//2:])[0].cpu())
                        codes = torch.cat(codes, dim=0)
                    else:
                        encoded_frames = model.encode(padded_wav.cuda())
                        # logging.info(f"encoded_frames: {encoded_frames[0].shape}")
                        codes = encoded_frames[0].cpu()

                for i, length in enumerate(batch_lens):
                    save_fn = os.path.join(codes_save_root, segment_id_batch[i]+".txt")
                    actual_len = round(length * args.model_code_sr) # 320 is downsample rate for this model
                    # codes is a tensor on both paths above; trim the padding frames
                    cur_code = codes[i, :, :actual_len].tolist()
                    write_array_to_txt_file(cur_code, save_fn)
class TextTokenizer:
    """Phonemize text into lists of phoneme tokens using an espeak backend."""

    def __init__(
        self,
        language="en-us",
        backend="espeak",
        separator=Separator(word="_", syllable="-", phone="|"),
        preserve_punctuation=True,
        punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
        with_stress: bool = False,
        tie: Union[bool, str] = False,
        language_switch: LanguageSwitch = "keep-flags",
        words_mismatch: WordMismatch = "ignore",
    ) -> None:
        # NOTE: `backend` is accepted but never read here; an EspeakBackend is
        # always the object that gets constructed.
        self.backend = EspeakBackend(
            language,
            punctuation_marks=punctuation_marks,
            preserve_punctuation=preserve_punctuation,
            with_stress=with_stress,
            tie=tie,
            language_switch=language_switch,
            words_mismatch=words_mismatch,
        )
        self.separator = separator

    def to_list(self, phonemized: str) -> List[str]:
        """Split one phonemized string into tokens.

        Word separators are kept as tokens between words, phone separators
        are dropped, and every other run of word characters or single
        punctuation mark becomes one token.
        """
        word_sep = self.separator.word
        phone_sep = self.separator.phone

        tokens = []
        for chunk in phonemized.split(word_sep):
            pieces = re.findall(r"\w+|[^\w\s]", chunk, re.UNICODE)
            tokens += [piece for piece in pieces if piece != phone_sep]
            tokens.append(word_sep)
        # split() always yields at least one chunk, so there is always exactly
        # one trailing word separator to discard.
        tokens.pop()

        # Sanity check: apart from the removed phone separators, no character
        # of the input may have been lost.
        assert len("".join(tokens)) == len(phonemized) - phonemized.count(
            phone_sep
        )
        return tokens

    def __call__(self, text, strip=True) -> List[List[str]]:
        """Phonemize a string or a list of strings; returns one token list per input."""
        batch = [text] if isinstance(text, str) else text
        phonemized = self.backend.phonemize(
            batch, separator=self.separator, strip=strip, njobs=1
        )
        return [self.to_list(entry) for entry in phonemized]


def tokenize_text(tokenizer: TextTokenizer, text: str) -> List[str]:
    """Phonemize a single stripped string and return its token list."""
    return tokenizer([text.strip()])[0]  # k2symbols


def convert_audio(wav: torch.Tensor, sr: int, target_sr: int, target_channels: int):
    """Convert `wav` to `target_channels` channels, then resample from `sr` to `target_sr`.

    The first dimension of `wav` must have size 1 or 2.
    """
    assert wav.shape[0] in [1, 2], "Audio must be mono or stereo."

    if target_channels == 1:
        # Downmix by averaging over the first dimension.
        wav = wav.mean(0, keepdim=True)
    elif target_channels == 2:
        *leading, _, n_samples = wav.shape
        wav = wav.expand(*leading, target_channels, n_samples)
    elif wav.shape[0] == 1:
        wav = wav.expand(target_channels, -1)

    resampler = torchaudio.transforms.Resample(sr, target_sr)
    return resampler(wav)


class AudioTokenizer:
    """Wrapper around a compression model loaded via audiocraft's CompressionSolver."""

    def __init__(
        self,
        device: Any = None,
        signature = None
    ) -> None:
        from audiocraft.solvers import CompressionSolver

        model = CompressionSolver.model_from_checkpoint(signature)
        self.sample_rate = model.sample_rate
        self.channels = model.channels

        if not device:
            # No device requested: use the first CUDA device when available.
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self._device = device
        self.codec = model.to(device)

    @property
    def device(self):
        """Device the codec was moved to."""
        return self._device

    def encode(self, wav: torch.Tensor) -> torch.Tensor:
        """Encode `wav`; returns a one-element list holding `(codes, None)`."""
        encoded = self.codec.encode(wav.to(self.device))
        return [(encoded[0], None)]

    def decode(self, frames: torch.Tensor) -> torch.Tensor:
        """Decode the codes stored at `frames[0][0]` back to audio."""
        return self.codec.decode(frames[0][0])  # [1,4,T]


def tokenize_audio(tokenizer: AudioTokenizer, audio_path: str, offset = -1, num_frames=-1):
    """Load an audio file (optionally a window of it) and encode it with `tokenizer`.

    The window is only applied when BOTH `offset` and `num_frames` differ
    from -1; otherwise the whole file is loaded.
    """
    # Load and pre-process the audio waveform
    windowed = offset != -1 and num_frames != -1
    load_kwargs = {"frame_offset": offset, "num_frames": num_frames} if windowed else {}
    wav, sr = torchaudio.load(audio_path, **load_kwargs)
    wav = convert_audio(wav, sr, tokenizer.sample_rate, tokenizer.channels).unsqueeze(0)

    # Extract discrete codes from EnCodec
    with torch.no_grad():
        return tokenizer.encode(wav)
1.95,2.07,ER1,phones,temp 2.07,2.16,K,phones,temp 2.16,2.23,AH0,phones,temp 2.23,2.26,N,phones,temp 2.26,2.31,D,phones,temp 2.31,2.38,F,phones,temp 2.38,2.41,AO1,phones,temp 2.41,2.49,R,phones,temp 2.49,2.55,HH,phones,temp 2.55,2.62,IH1,phones,temp 2.62,2.71,Z,phones,temp 2.71,2.8,F,phones,temp 2.8,2.9,IY1,phones,temp 2.9,2.98,T,phones,temp 2.98,3.03,S,phones,temp 3.03,3.07,AH0,phones,temp 3.07,3.12,V,phones,temp 3.12,3.2,S,phones,temp 3.2,3.26,T,phones,temp 3.26,3.32,R,phones,temp 3.32,3.39,EH1,phones,temp 3.39,3.48,NG,phones,temp 3.48,3.53,K,phones,temp 3.53,3.61,TH,phones,temp 3.95,4.03,R,phones,temp 4.03,4.16,AW1,phones,temp 4.16,4.21,N,phones,temp 4.21,4.25,D,phones,temp 4.25,4.29,HH,phones,temp 4.29,4.36,IH1,phones,temp 4.36,4.45,Z,phones,temp 4.45,4.53,N,phones,temp 4.53,4.62,EH1,phones,temp 4.62,4.7,K,phones,temp 4.7,4.74,AH0,phones,temp 4.74,4.77,N,phones,temp 4.77,4.81,D,phones,temp 4.81,4.92,OW1,phones,temp 4.92,4.97,V,phones,temp 4.97,5.04,ER0,phones,temp 5.04,5.11,HH,phones,temp 5.11,5.18,IH1,phones,temp 5.18,5.22,Z,phones,temp 5.22,5.34,SH,phones,temp 5.34,5.47,OW1,phones,temp 5.47,5.51,L,phones,temp 5.51,5.58,D,phones,temp 5.58,5.71,ER0,phones,temp 5.71,5.83,Z,phones,temp 6.16,6.23,AE1,phones,temp 6.23,6.31,N,phones,temp 6.41,7.15,spn,phones,temp 7.15,7.21,AH0,phones,temp 7.21,7.29,V,phones,temp 7.29,7.36,L,phones,temp 7.36,7.44,EH1,phones,temp 7.44,7.49,DH,phones,temp 7.49,7.7,ER0,phones,temp ================================================ FILE: demo/temp/mfa_alignments/84_121550_000074_000000.csv ================================================ Begin,End,Label,Type,Speaker 0.03,0.18,but,words,temp 0.18,0.32,when,words,temp 0.32,0.48,i,words,temp 0.48,0.64,had,words,temp 0.64,1.19,approached,words,temp 1.22,1.58,so,words,temp 1.58,1.91,near,words,temp 1.91,2.07,to,words,temp 2.07,2.42,them,words,temp 2.53,2.61,the,words,temp 2.61,3.01,common,words,temp 3.05,3.62,object,words,temp 3.68,3.93,which,words,temp 3.93,4.02,the,words,temp 
4.02,4.34,sense,words,temp 4.34,4.97,deceives,words,temp 5.04,5.54,lost,words,temp 5.54,6.0,not,words,temp 6.0,6.14,by,words,temp 6.14,6.67,distance,words,temp 6.79,7.05,any,words,temp 7.05,7.18,of,words,temp 7.18,7.34,its,words,temp 7.34,7.87,marks,words,temp 0.03,0.06,B,phones,temp 0.06,0.09,AH1,phones,temp 0.09,0.18,T,phones,temp 0.18,0.23,W,phones,temp 0.23,0.27,EH1,phones,temp 0.27,0.32,N,phones,temp 0.32,0.48,AY1,phones,temp 0.48,0.49,HH,phones,temp 0.49,0.6,AE1,phones,temp 0.6,0.64,D,phones,temp 0.64,0.7,AH0,phones,temp 0.7,0.83,P,phones,temp 0.83,0.88,R,phones,temp 0.88,0.99,OW1,phones,temp 0.99,1.12,CH,phones,temp 1.12,1.19,T,phones,temp 1.22,1.4,S,phones,temp 1.4,1.58,OW1,phones,temp 1.58,1.7,N,phones,temp 1.7,1.84,IH1,phones,temp 1.84,1.91,R,phones,temp 1.91,2.01,T,phones,temp 2.01,2.07,AH0,phones,temp 2.07,2.13,DH,phones,temp 2.13,2.3,EH1,phones,temp 2.3,2.42,M,phones,temp 2.53,2.55,DH,phones,temp 2.55,2.61,AH0,phones,temp 2.61,2.73,K,phones,temp 2.73,2.85,AA1,phones,temp 2.85,2.9,M,phones,temp 2.9,2.95,AH0,phones,temp 2.95,3.01,N,phones,temp 3.05,3.22,AA1,phones,temp 3.22,3.27,B,phones,temp 3.27,3.34,JH,phones,temp 3.34,3.48,EH0,phones,temp 3.48,3.54,K,phones,temp 3.54,3.62,T,phones,temp 3.68,3.69,HH,phones,temp 3.69,3.76,W,phones,temp 3.76,3.8,IH1,phones,temp 3.8,3.93,CH,phones,temp 3.93,3.95,DH,phones,temp 3.95,4.02,AH0,phones,temp 4.02,4.12,S,phones,temp 4.12,4.21,EH1,phones,temp 4.21,4.27,N,phones,temp 4.27,4.34,S,phones,temp 4.34,4.42,D,phones,temp 4.42,4.45,IH0,phones,temp 4.45,4.59,S,phones,temp 4.59,4.79,IY1,phones,temp 4.79,4.87,V,phones,temp 4.87,4.97,Z,phones,temp 5.04,5.12,L,phones,temp 5.12,5.33,AO1,phones,temp 5.33,5.42,S,phones,temp 5.42,5.54,T,phones,temp 5.54,5.7,N,phones,temp 5.7,5.89,AA1,phones,temp 5.89,6.0,T,phones,temp 6.0,6.05,B,phones,temp 6.05,6.14,AY1,phones,temp 6.14,6.24,D,phones,temp 6.24,6.3,IH1,phones,temp 6.3,6.38,S,phones,temp 6.38,6.45,T,phones,temp 6.45,6.51,AH0,phones,temp 6.51,6.57,N,phones,temp 
def _first_mismatch(left, right):
    """Return the index of the first differing pair of words, or None if the shorter list is a prefix of the other."""
    for idx, (a, b) in enumerate(zip(left, right)):
        if a != b:
            return idx
    return None


def get_span(orig, new, editType):
    """Locate the edited word span in the original and the new transcript.

    Both transcripts are split on single spaces and compared word by word.

    Args:
        orig: original transcript.
        new: edited transcript.
        editType: one of "deletion", "insertion" or "substitution".

    Returns:
        (orig_span, new_span), each a two-element list of word indices.
        * deletion: orig_span is the inclusive range of removed words;
          new_span holds the two words surrounding the removal.
        * insertion: new_span is the inclusive range of added words;
          orig_span holds the two surrounding words.
        * substitution: both spans are the inclusive [first, last] differing
          word positions in their respective transcript.
        A "surrounding word" index may fall outside the list: -1 when the
        edit starts at the first word, and len(list) when the edit reaches
        the end of the sentence.

    Raises:
        AssertionError: the length relation does not match a deletion or
            insertion, or a substitution has no differing word.
        RuntimeError: unknown editType, or a substitution whose backward
            scan finds no differing word.
    """
    orig_list = orig.split(" ")
    new_list = new.split(" ")

    if editType == "deletion":
        assert len(orig_list) > len(new_list), f"the edit type is deletion, but new is not shorter than original:\n new: {new}\n orig: {orig}"
        diff = len(orig_list) - len(new_list)
        start = _first_mismatch(orig_list, new_list)
        if start is None:
            # Every word of `new` matched: the deleted words are the tail of
            # `orig`. Previously this valid edit was rejected as "wrong editing".
            start = len(new_list)
        orig_span = [start, start + diff - 1]  # inclusive range of the deleted words
        new_span = [start - 1, start]  # the two words that surround the deleted part
    elif editType == "insertion":
        assert len(orig_list) < len(new_list), f"the edit type is insertion, but the new is not longer than the original:\n new: {new}\n orig: {orig}"
        diff = len(new_list) - len(orig_list)
        start = _first_mismatch(orig_list, new_list)
        if start is None:
            # Mirror of the deletion case: the inserted words are the tail of `new`.
            start = len(orig_list)
        new_span = [start, start + diff - 1]  # NOTE if only one word is inserted, both indices are equal
        orig_span = [start - 1, start]
    elif editType == "substitution":
        new_span = []
        orig_span = []
        first = _first_mismatch(orig_list, new_list)
        if first is not None:
            new_span = [first]
            orig_span = [first]
        assert len(new_span) == 1 and len(orig_span) == 1, f"new_span: {new_span}, orig_span: {orig_span}"
        # Scan from the end to find the last differing word; the offset is
        # counted from the back of each list, so the two indices may differ.
        back = _first_mismatch(orig_list[::-1], new_list[::-1])
        if back is None:
            raise RuntimeError(f"wrong editing with the specified edit type:\n original: {orig}\n new: {new}\n, editType: {editType}")
        new_span.append(len(new_list) - back - 1)
        orig_span.append(len(orig_list) - back - 1)
    else:
        raise RuntimeError(f"editType unknown: {editType}")

    return orig_span, new_span
greenlet=3.0.3=py39h3d6467e_0 - gtk2=2.24.33=h280cfa0_4 - gts=0.7.6=h977cf35_4 - harfbuzz=8.3.0=h3d44ed6_0 - hdbscan=0.8.33=py39h44dd56e_4 - icu=73.2=h59595ed_0 - idna=3.6=pyhd8ed1ab_0 - importlib-metadata=7.0.2=pyha770c72_0 - importlib-resources=6.3.0=pyhd8ed1ab_0 - importlib_metadata=7.0.2=hd8ed1ab_0 - importlib_resources=6.3.0=pyhd8ed1ab_0 - ipykernel=6.29.3=pyhd33586a_0 - jedi=0.19.1=pyhd8ed1ab_0 - joblib=1.3.2=pyhd8ed1ab_0 - jupyter_client=8.6.1=pyhd8ed1ab_0 - jupyter_core=5.7.2=py39hf3d152e_0 - kaldi=5.5.1068=cpu_h31769b2_2 - keyutils=1.6.1=h166bdaf_0 - kiwisolver=1.4.5=py39h7633fee_1 - kneed=0.8.5=pyhd8ed1ab_0 - krb5=1.21.2=h659d440_0 - lame=3.100=h166bdaf_1003 - lazy_loader=0.3=pyhd8ed1ab_0 - lcms2=2.16=hb7c19ff_0 - ld_impl_linux-64=2.40=h41732ed_0 - lerc=4.0.0=h27087fc_0 - libabseil=20240116.1=cxx17_h59595ed_2 - libass=0.17.1=h8fe9dca_1 - libblas=3.9.0=21_linux64_openblas - libbrotlicommon=1.1.0=hd590300_1 - libbrotlidec=1.1.0=hd590300_1 - libbrotlienc=1.1.0=hd590300_1 - libcblas=3.9.0=21_linux64_openblas - libclang-cpp15=15.0.7=default_hb11cfb5_4 - libdeflate=1.19=hd590300_0 - libdrm=2.4.120=hd590300_0 - libedit=3.1.20191231=he28a2e2_2 - libexpat=2.6.2=h59595ed_0 - libffi=3.4.2=h7f98852_5 - libflac=1.4.3=h59595ed_0 - libgcc-ng=13.2.0=h807b86a_5 - libgd=2.3.3=h119a65a_9 - libgfortran-ng=13.2.0=h69a702a_5 - libgfortran5=13.2.0=ha4646dd_5 - libglib=2.80.0=hf2295e7_0 - libgomp=13.2.0=h807b86a_5 - libhwloc=2.9.3=default_h554bfaf_1009 - libiconv=1.17=hd590300_2 - libidn2=2.3.7=hd590300_0 - libjpeg-turbo=3.0.0=hd590300_1 - liblapack=3.9.0=21_linux64_openblas - liblapacke=3.9.0=21_linux64_openblas - libllvm14=14.0.6=hcd5def8_4 - libllvm15=15.0.7=hb3ce162_4 - libllvmspirv15=15.0.0=h0cdce71_1 - libnsl=2.0.1=hd590300_0 - libogg=1.3.4=h7f98852_1 - libopenblas=0.3.26=pthreads_h413a1c8_0 - libopenvino=2024.0.0=h2e90f83_1 - libopenvino-auto-batch-plugin=2024.0.0=hd5fc58b_1 - libopenvino-auto-plugin=2024.0.0=hd5fc58b_1 - libopenvino-hetero-plugin=2024.0.0=h3ecfda7_1 - 
libopenvino-intel-cpu-plugin=2024.0.0=h2e90f83_1 - libopenvino-intel-gpu-plugin=2024.0.0=h2e90f83_1 - libopenvino-ir-frontend=2024.0.0=h3ecfda7_1 - libopenvino-onnx-frontend=2024.0.0=h757c851_1 - libopenvino-paddle-frontend=2024.0.0=h757c851_1 - libopenvino-pytorch-frontend=2024.0.0=h59595ed_1 - libopenvino-tensorflow-frontend=2024.0.0=hca94c1a_1 - libopenvino-tensorflow-lite-frontend=2024.0.0=h59595ed_1 - libopus=1.3.1=h7f98852_1 - libpciaccess=0.18=hd590300_0 - libpng=1.6.43=h2797004_0 - libpq=16.2=h33b98f1_0 - libprotobuf=4.25.3=h08a7969_0 - librosa=0.10.1=pyhd8ed1ab_0 - librsvg=2.56.3=he3f83f7_1 - libsndfile=1.2.2=hc60ed4a_1 - libsodium=1.0.18=h36c2ea0_1 - libsqlite=3.45.2=h2797004_0 - libstdcxx-ng=13.2.0=h7e041cc_5 - libtasn1=4.19.0=h166bdaf_0 - libtiff=4.6.0=ha9c0a0a_2 - libunistring=0.9.10=h7f98852_0 - libuuid=2.38.1=h0b41bf4_0 - libva=2.21.0=hd590300_0 - libvorbis=1.3.7=h9c3ff4c_0 - libvpx=1.14.0=h59595ed_0 - libwebp=1.3.2=h658648e_1 - libwebp-base=1.3.2=hd590300_0 - libxcb=1.15=h0b41bf4_0 - libxcrypt=4.4.36=hd590300_1 - libxml2=2.12.5=h232c23b_0 - libzlib=1.2.13=hd590300_5 - llvm-spirv-15=15.0.0=h0cdce71_1 - mad=0.15.1b=h9c3ff4c_1 - markdown-it-py=3.0.0=pyhd8ed1ab_0 - matplotlib-base=3.8.3=py39he9076e7_0 - matplotlib-inline=0.1.6=pyhd8ed1ab_0 - mdurl=0.1.2=pyhd8ed1ab_0 - montreal-forced-aligner=2.2.17=pyhd8ed1ab_0 - mpg123=1.32.4=h59595ed_0 - msgpack-python=1.0.7=py39h7633fee_0 - munkres=1.1.4=pyh9f0ad1d_0 - ncurses=6.4=h59595ed_2 - nest-asyncio=1.6.0=pyhd8ed1ab_0 - nettle=3.9.1=h7ab15ed_0 - ngram=1.3.14=h924138e_2 - numba=0.59.0=py39h615d6bd_1 - numpy=1.26.4=py39h474f0d3_0 - ocl-icd=2.3.2=hd590300_0 - openfst=1.8.2=h924138e_2 - openh264=2.4.1=h59595ed_0 - openjpeg=2.5.2=h488ebb8_0 - openssl=3.2.1=hd590300_0 - p11-kit=0.24.1=hc5aa10d_0 - packaging=24.0=pyhd8ed1ab_0 - pandas=2.2.1=py39hddac248_0 - pango=1.52.1=ha41ecd1_0 - parso=0.8.3=pyhd8ed1ab_0 - patsy=0.5.6=pyhd8ed1ab_0 - pcre2=10.43=hcad00b1_0 - pexpect=4.9.0=pyhd8ed1ab_0 - 
pgvector-python=0.2.5=pyhe093146_0 - pickleshare=0.7.5=py_1003 - pillow=10.2.0=py39had0adad_0 - pip=24.0=pyhd8ed1ab_0 - pixman=0.43.2=h59595ed_0 - platformdirs=4.2.0=pyhd8ed1ab_0 - pocl=5.0=h03a6ac1_2 - pocl-core=5.0=hdaecddf_2 - pocl-cpu=5.0=he901f76_2 - pocl-cpu-minimal=5.0=h5ccd973_2 - pocl-cuda=5.0=hdaecddf_2 - pocl-remote=5.0=h5ccd973_2 - pooch=1.8.1=pyhd8ed1ab_0 - postgresql=16.2=h7387d8b_0 - prompt-toolkit=3.0.42=pyha770c72_0 - prompt_toolkit=3.0.42=hd8ed1ab_0 - psutil=5.9.8=py39hd1e30aa_0 - psycopg2=2.9.9=py39h89197e3_0 - pthread-stubs=0.4=h36c2ea0_1001 - ptyprocess=0.7.0=pyhd3deb0d_0 - pugixml=1.14=h59595ed_0 - pure_eval=0.2.2=pyhd8ed1ab_0 - pycparser=2.21=pyhd8ed1ab_0 - pygments=2.17.2=pyhd8ed1ab_0 - pyparsing=3.1.2=pyhd8ed1ab_0 - pysocks=1.7.1=pyha2e5f31_6 - pysoundfile=0.12.1=pypyhd8ed1ab_1 - python=3.9.18=h0755675_1_cpython - python-tzdata=2024.1=pyhd8ed1ab_0 - python_abi=3.9=4_cp39 - pytz=2024.1=pyhd8ed1ab_0 - pyyaml=6.0.1=py39hd1e30aa_1 - pyzmq=25.1.2=py39h8c080ef_0 - readline=8.2=h8228510_1 - requests=2.31.0=pyhd8ed1ab_0 - rich=13.7.1=pyhd8ed1ab_0 - rich-click=1.7.4=pyhd8ed1ab_0 - scikit-learn=1.2.2=py39hc236052_2 - scipy=1.12.0=py39h474f0d3_2 - seaborn=0.13.2=hd8ed1ab_0 - seaborn-base=0.13.2=pyhd8ed1ab_0 - setuptools=69.2.0=pyhd8ed1ab_0 - six=1.16.0=pyh6c4a22f_0 - snappy=1.1.10=h9fff704_0 - sox=14.4.2=ha5cc309_1018 - soxr=0.1.3=h0b41bf4_3 - soxr-python=0.3.7=py39h44dd56e_0 - sqlalchemy=2.0.28=py39hd1e30aa_0 - sqlite=3.45.2=h2c6b66d_0 - stack_data=0.6.2=pyhd8ed1ab_0 - statsmodels=0.14.1=py39h44dd56e_0 - svt-av1=1.8.0=h59595ed_0 - tbb=2021.11.0=h00ab1b0_1 - threadpoolctl=3.3.0=pyhc1e730c_0 - tk=8.6.13=noxft_h4845f30_101 - tornado=6.4=py39hd1e30aa_0 - tqdm=4.66.2=pyhd8ed1ab_0 - traitlets=5.14.2=pyhd8ed1ab_0 - typing-extensions=4.10.0=hd8ed1ab_0 - typing_extensions=4.10.0=pyha770c72_0 - tzcode=2024a=h3f72095_0 - tzdata=2024a=h0c530f3_0 - unicodedata2=15.1.0=py39hd1e30aa_0 - urllib3=2.2.1=pyhd8ed1ab_0 - wcwidth=0.2.13=pyhd8ed1ab_0 - 
wheel=0.42.0=pyhd8ed1ab_0 - x264=1!164.3095=h166bdaf_2 - x265=3.5=h924138e_3 - xorg-fixesproto=5.0=h7f98852_1002 - xorg-kbproto=1.0.7=h7f98852_1002 - xorg-libice=1.1.1=hd590300_0 - xorg-libsm=1.2.4=h7391055_0 - xorg-libx11=1.8.7=h8ee46fc_0 - xorg-libxau=1.0.11=hd590300_0 - xorg-libxdmcp=1.1.3=h7f98852_0 - xorg-libxext=1.3.4=h0b41bf4_2 - xorg-libxfixes=5.0.3=h7f98852_1004 - xorg-libxrender=0.9.11=hd590300_0 - xorg-renderproto=0.11.1=h7f98852_1002 - xorg-xextproto=7.3.0=h0b41bf4_1003 - xorg-xproto=7.0.31=h7f98852_1007 - xz=5.2.6=h166bdaf_0 - yaml=0.2.5=h7f98852_2 - zeromq=4.3.5=h59595ed_1 - zipp=3.17.0=pyhd8ed1ab_0 - zlib=1.2.13=hd590300_5 - zstd=1.5.5=hfc55251_0 - pip: - absl-py==2.1.0 - aiofiles==23.2.1 - aiohttp==3.9.3 - aiosignal==1.3.1 - altair==5.2.0 - antlr4-python3-runtime==4.9.3 - anyio==4.3.0 - async-timeout==4.0.3 - attrs==23.2.0 - av==11.0.0 - babel==2.14.0 - beautifulsoup4==4.12.3 - bibtexparser==2.0.0b7 - bleach==6.1.0 - blis==0.7.11 - catalogue==2.0.10 - clldutils==3.22.2 - cloudpickle==3.0.0 - cmake==3.28.3 - colorlog==6.8.2 - confection==0.1.4 - csvw==3.3.0 - cymem==2.0.8 - cython==0.29.37 - datasets==2.16.0 - defusedxml==0.7.1 - demucs==4.0.1 - dill==0.3.6 - dlinfo==1.2.1 - docopt==0.6.2 - dora-search==0.1.12 - einops==0.7.0 - encodec==0.1.1 - exceptiongroup==1.2.0 - fastapi==0.110.0 - fastjsonschema==2.19.1 - ffmpy==0.3.2 - filelock==3.13.1 - flashy==0.0.2 - frozenlist==1.4.1 - fsspec==2023.10.0 - gradio==3.50.2 - gradio-client==0.6.1 - grpcio==1.62.1 - h11==0.14.0 - httpcore==1.0.4 - httpx==0.27.0 - huggingface-hub==0.22.2 - hydra-colorlog==1.2.0 - hydra-core==1.3.2 - ipython==8.12.3 - isodate==0.6.1 - jinja2==3.1.3 - jsonschema==4.21.1 - jsonschema-specifications==2023.12.1 - julius==0.2.7 - jupyterlab-pygments==0.3.0 - lameenc==1.7.0 - langcodes==3.3.0 - language-tags==1.2.0 - lit==18.1.1 - llvmlite==0.42.0 - lxml==5.1.0 - markdown==3.5.2 - markupsafe==2.1.5 - mistune==3.0.2 - mpmath==1.3.0 - msgpack==1.0.8 - multidict==6.0.5 - 
multiprocess==0.70.14 - murmurhash==1.0.10 - nbclient==0.10.0 - nbconvert==7.16.3 - nbformat==5.10.3 - networkx==3.2.1 - num2words==0.5.13 - nvidia-cublas-cu11==11.10.3.66 - nvidia-cuda-cupti-cu11==11.7.101 - nvidia-cuda-nvrtc-cu11==11.7.99 - nvidia-cuda-runtime-cu11==11.7.99 - nvidia-cudnn-cu11==8.5.0.96 - nvidia-cufft-cu11==10.9.0.58 - nvidia-curand-cu11==10.2.10.91 - nvidia-cusolver-cu11==11.4.0.1 - nvidia-cusparse-cu11==11.7.4.91 - nvidia-nccl-cu11==2.14.3 - nvidia-nvtx-cu11==11.7.91 - omegaconf==2.3.0 - openunmix==1.2.1 - orjson==3.9.15 - pandocfilters==1.5.1 - pathlib-abc==0.1.1 - pathy==0.11.0 - pgvector==0.2.2 - phonemizer==3.2.1 - pipreqs==0.5.0 - praatio==6.2.0 - preshed==3.0.9 - protobuf==4.25.3 - pyarrow==15.0.2 - pyarrow-hotfix==0.6 - pydantic==1.10.14 - pydub==0.25.1 - pylatexenc==2.10 - pynini==2.1.6 - pypinyin==0.48.0 - python-dateutil==2.9.0.post0 - python-multipart==0.0.9 - rdflib==7.0.0 - referencing==0.33.0 - regex==2023.12.25 - responses==0.18.0 - retrying==1.3.4 - rfc3986==1.5.0 - rpds-py==0.18.0 - safetensors==0.4.2 - segments==2.2.1 - semantic-version==2.10.0 - sentencepiece==0.2.0 - smart-open==6.4.0 - sniffio==1.3.1 - soupsieve==2.5 - spacy==3.5.2 - spacy-legacy==3.0.12 - spacy-loggers==1.0.5 - srsly==2.4.8 - starlette==0.36.3 - submitit==1.5.1 - sympy==1.12 - tabulate==0.9.0 - tensorboard==2.16.2 - tensorboard-data-server==0.7.2 - thinc==8.1.12 - tinycss2==1.2.1 - tokenizers==0.15.2 - toolz==0.12.1 - torch==2.0.1 - torchaudio==2.0.2 - torchmetrics==0.11.1 - transformers==4.38.2 - treetable==0.2.5 - triton==2.0.0 - typer==0.7.0 - uritemplate==4.1.1 - uvicorn==0.28.0 - wasabi==1.1.2 - webencodings==0.5.1 - websockets==11.0.3 - werkzeug==3.0.1 - xformers==0.0.22 - xxhash==3.4.1 - yarg==0.1.9 - yarl==1.9.4 prefix: /home/pyp/miniconda3/envs/voicecraft ================================================ FILE: gradio_app.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "9b6a0c92", "metadata": {}, 
"source": [ "### Only do the below if you are using docker" ] }, { "cell_type": "code", "execution_count": null, "id": "961faa43", "metadata": {}, "outputs": [], "source": [ "!source ~/.bashrc && \\\n", " apt-get update && \\\n", " apt-get install -y espeak espeak-data libespeak1 libespeak-dev && \\\n", " apt-get install -y festival* && \\\n", " apt-get install -y build-essential && \\\n", " apt-get install -y flac libasound2-dev libsndfile1-dev vorbis-tools && \\\n", " apt-get install -y libxml2-dev libxslt-dev zlib1g-dev" ] }, { "cell_type": "code", "execution_count": null, "id": "598d75cf", "metadata": {}, "outputs": [], "source": [ "!source ~/.bashrc && \\\n", " conda activate voicecraft && \\\n", " pip install -r gradio_requirements.txt" ] }, { "cell_type": "markdown", "id": "8b9c4436", "metadata": {}, "source": [ "# STOP\n", "You have to do this part manually using the mouse/keyboard and the tabs at the top.\n", "\n", "* Refresh your browser to make sure it picks up the new kernel.\n", "* Kernel -> Change Kernel -> Select Kernel -> voicecraft\n", "* Kernel -> Restart Kernel -> Yes\n", "\n", "Now you can run the rest of the notebook and get an audio sample output. It will automatically download more models and such. The next time you use this container, you can just start below here as the dependencies will remain available until you delete the docker container." 
def get_random_string():
    """Return 32 random lowercase hex characters (a UUID4 without its dashes)."""
    return uuid.uuid4().hex


def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs; a seed of -1 leaves everything untouched."""
    if seed == -1:
        return
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


class WhisperxAlignModel:
    """English whisperx alignment model, loaded on the module-level `device`."""

    def __init__(self):
        from whisperx import load_align_model
        self.model, self.metadata = load_align_model(language_code="en", device=device)

    def align(self, segments, audio_path):
        """Align `segments` against the audio file and return the aligned segments."""
        from whisperx import align, load_audio
        audio = load_audio(audio_path)
        return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]


class WhisperModel:
    """Whisper transcription backend that suppresses purely numeric tokens."""

    def __init__(self, model_name):
        from whisper import load_model
        self.model = load_model(model_name, device)

        from whisper.tokenizer import get_tokenizer
        tokenizer = get_tokenizer(multilingual=False)
        # Suppress every token below `eot` whose decoded text (minus one
        # leading space) consists only of ASCII digits.
        self.supress_tokens = [-1] + [
            i
            for i in range(tokenizer.eot)
            if all(c in "0123456789" for c in tokenizer.decode([i]).removeprefix(" "))
        ]

    def transcribe(self, audio_path):
        """Transcribe with word timestamps and return the resulting segments."""
        return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]


class WhisperxModel:
    """whisperx transcription backend followed by forced alignment."""

    def __init__(self, model_name, align_model: WhisperxAlignModel):
        from whisperx import load_model
        self.model = load_model(model_name, device, asr_options={"suppress_numerals": True, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None})
        self.align_model = align_model

    def transcribe(self, audio_path):
        """Transcribe, spell out numbers in each segment, then align the segments."""
        segments = self.model.transcribe(audio_path, batch_size=8)["segments"]
        for segment in segments:
            segment['text'] = replace_numbers_with_words(segment['text'])
        return self.align_model.align(segments, audio_path)


def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
    """Load the alignment, transcription and VoiceCraft models into module globals.

    Args:
        whisper_backend_name: "whisper" selects WhisperModel; any other value
            selects WhisperxModel (which requires an align model).
        whisper_model_name: transcription model name, or None to skip loading one.
        alignment_model_name: when not None, a WhisperxAlignModel is loaded.
        voicecraft_model_name: "330M"/"830M" are mapped to "giga330M"/"giga830M";
            any other name is used as given.

    Returns:
        A `gr.Accordion()` instance.

    Raises:
        gr.Error: the whisperx backend is requested while no align model is loaded.
    """
    global transcribe_model, align_model, voicecraft_model

    # Short UI names map onto the checkpoint names; everything else passes through.
    voicecraft_model_name = {"330M": "giga330M", "830M": "giga830M"}.get(
        voicecraft_model_name, voicecraft_model_name
    )

    if alignment_model_name is not None:
        align_model = WhisperxAlignModel()

    if whisper_model_name is not None:
        if whisper_backend_name == "whisper":
            transcribe_model = WhisperModel(whisper_model_name)
        else:
            if align_model is None:
                raise gr.Error("Align model required for whisperx backend")
            transcribe_model = WhisperxModel(whisper_model_name, align_model)

    voicecraft_name = f"{voicecraft_model_name}.pth"
    model = voicecraft.VoiceCraft.from_pretrained(f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}")
    phn2num = model.args.phn2num
    config = model.args
    model.to(device)

    encodec_fn = f"{MODELS_PATH}/encodec_4cb2048_giga.th"
    if not os.path.exists(encodec_fn):
        # Pass the arguments as a list so that a MODELS_PATH containing spaces
        # or shell metacharacters cannot break (or alter) the command, which
        # the previous string-concatenated os.system() call allowed.
        import subprocess
        os.makedirs(MODELS_PATH, exist_ok=True)
        subprocess.run(
            [
                "wget",
                "https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th",
                "-O",
                encodec_fn,
            ],
            check=False,  # like os.system(), a failed download does not raise here
        )

    voicecraft_model = {
        "config": config,
        "phn2num": phn2num,
        "model": model,
        "text_tokenizer": TextTokenizer(backend="espeak"),
        "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
    }

    return gr.Accordion()


def get_transcribe_state(segments):
    """Flatten transcription segments into the state dict used by the UI.

    Args:
        segments: list of dicts, each with a "text" string and a "words" list
            whose items carry "word", "start" and "end".

    Returns:
        dict with the segments, the joined transcript (one leading space
        removed), the flat word list, and the three word/timestamp renderings.
    """
    words_info = [word_info for segment in segments for word_info in segment["words"]]

    transcript = " ".join([segment["text"] for segment in segments])
    # startswith() is safe on an empty transcript, where indexing [0] raised IndexError.
    if transcript.startswith(" "):
        transcript = transcript[1:]

    return {
        "segments": segments,
        "transcript": transcript,
        "words_info": words_info,
        "transcript_with_start_time": " ".join([f"{word['start']} {word['word']}" for word in words_info]),
        "transcript_with_end_time": " ".join([f"{word['word']} {word['end']}" for word in words_info]),
        "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
    }


def transcribe(seed, audio_path):
    """Transcribe `audio_path` and return the values for the transcript widgets.

    Raises:
        gr.Error: no transcription model has been loaded yet.
    """
    # Only load_models() binds `transcribe_model`; the module-level defaults
    # visible at the top of this file bind `whisper_model` instead. Looking the
    # name up defensively turns the resulting NameError into the intended gr.Error.
    model = globals().get("transcribe_model")
    if model is None:
        raise gr.Error("Transcription model not loaded")
    seed_everything(seed)

    segments = model.transcribe(audio_path)
    state = get_transcribe_state(segments)

    # With no recognised words there is nothing to preselect.
    word_bounds = state["word_bounds"]
    first_bound = word_bounds[0] if word_bounds else None
    last_bound = word_bounds[-1] if word_bounds else None

    return [
        state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
        gr.Dropdown(value=last_bound, choices=word_bounds, interactive=True), # prompt_to_word
        gr.Dropdown(value=first_bound, choices=word_bounds, interactive=True), # edit_from_word
        gr.Dropdown(value=last_bound, choices=word_bounds, interactive=True), # edit_to_word
        state
    ]
def align(seed, transcript, audio_path):
    """Force-align a user-supplied transcript to the audio.

    Returns the start-time and end-time transcripts, three word-boundary
    dropdowns and the full transcribe state.

    Raises:
        gr.Error: no align model has been loaded yet.
    """
    if align_model is None:
        raise gr.Error("Align model not loaded")
    seed_everything(seed)

    # replace_numbers_with_words() pads every number with spaces; collapse the
    # resulting runs of spaces. (The former `.replace(" ", " ")` calls swapped
    # a space for a space and therefore changed nothing.) Newlines are kept.
    transcript = re.sub(r" {2,}", " ", replace_numbers_with_words(transcript))
    fragments = align_segments(transcript, audio_path)
    segments = [{
        "start": float(fragment["begin"]),
        "end": float(fragment["end"]),
        "text": " ".join(fragment["lines"])
    } for fragment in fragments["fragments"]]
    segments = align_model.align(segments, audio_path)
    state = get_transcribe_state(segments)

    # With no aligned words there is nothing to preselect.
    word_bounds = state["word_bounds"]
    first_bound = word_bounds[0] if word_bounds else None
    last_bound = word_bounds[-1] if word_bounds else None

    return [
        state["transcript_with_start_time"], state["transcript_with_end_time"],
        gr.Dropdown(value=last_bound, choices=word_bounds, interactive=True), # prompt_to_word
        gr.Dropdown(value=first_bound, choices=word_bounds, interactive=True), # edit_from_word
        gr.Dropdown(value=last_bound, choices=word_bounds, interactive=True), # edit_to_word
        state
    ]


def get_output_audio(audio_tensors, codec_audio_sr):
    """Concatenate the tensors along dim 1 and return them encoded as WAV bytes."""
    result = torch.cat(audio_tensors, 1)
    buffer = io.BytesIO()
    torchaudio.save(buffer, result, int(codec_audio_sr), format="wav")
    buffer.seek(0)
    return buffer.read()


def replace_numbers_with_words(sentence):
    """Replace every run of digits in `sentence` with its num2words() spelling.

    Each number is first padded with a space on both sides, so the result can
    contain doubled spaces. A number that num2words cannot convert is left as
    digits.
    """
    sentence = re.sub(r'(\d+)', r' \1 ', sentence) # add spaces around numbers

    def replace_with_words(match):
        num = match.group(0)
        try:
            return num2words(num) # Convert numbers to words
        except Exception:
            # Keep the digits when conversion fails. Catching Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate.
            return num

    return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
else "") elif (word["start"] + word["end"]) / 2 < prompt_end_time: # include part of the word it it's big, but adjust prompt_end_time target_transcript += word["word"] + (" " if word["word"][-1] != " " else "") prompt_end_time = word["end"] break else: break target_transcript += f" {sentence}" else: target_transcript = sentence inference_transcript += target_transcript + "\n" target_transcript = re.sub(_whitespace_re, " ", target_transcript) prompt_end_frame = int(min(audio_dur, prompt_end_time) * info.sample_rate) _, gen_audio = inference_one_sample(voicecraft_model["model"], voicecraft_model["config"], voicecraft_model["phn2num"], voicecraft_model["text_tokenizer"], voicecraft_model["audio_tokenizer"], audio_path, target_transcript, device, decode_config, prompt_end_frame) else: from inference_speech_editing_scale import inference_one_sample if smart_transcript: target_transcript = "" for word in transcribe_state["words_info"]: if word["start"] < edit_start_time: target_transcript += word["word"] + (" " if word["word"][-1] != " " else "") else: break target_transcript += f" {sentence}" for word in transcribe_state["words_info"]: if word["end"] > edit_end_time: target_transcript += word["word"] + (" " if word["word"][-1] != " " else "") else: target_transcript = sentence inference_transcript += target_transcript + "\n" target_transcript = re.sub(_whitespace_re, " ", target_transcript) morphed_span = (max(edit_start_time - left_margin, 1 / codec_sr), min(edit_end_time + right_margin, audio_dur)) mask_interval = [[round(morphed_span[0]*codec_sr), round(morphed_span[1]*codec_sr)]] mask_interval = torch.LongTensor(mask_interval) _, gen_audio = inference_one_sample(voicecraft_model["model"], voicecraft_model["config"], voicecraft_model["phn2num"], voicecraft_model["text_tokenizer"], voicecraft_model["audio_tokenizer"], audio_path, target_transcript, mask_interval, device, decode_config) gen_audio = gen_audio[0].cpu() audio_tensors.append(gen_audio) if mode != "Rerun": 
output_audio = get_output_audio(audio_tensors, codec_audio_sr) sentences = [f"{idx}: {text}" for idx, text in enumerate(sentences)] component = gr.Dropdown(choices=sentences, value=sentences[0]) return output_audio, inference_transcript, component, audio_tensors else: previous_audio_tensors[selected_sentence_idx] = audio_tensors[0] output_audio = get_output_audio(previous_audio_tensors, codec_audio_sr) sentence_audio = get_output_audio(audio_tensors, codec_audio_sr) return output_audio, inference_transcript, sentence_audio, previous_audio_tensors def update_input_audio(audio_path): if audio_path is None: return 0, 0, 0 info = torchaudio.info(audio_path) max_time = round(info.num_frames / info.sample_rate, 2) return [ gr.Slider(maximum=max_time, value=max_time), gr.Slider(maximum=max_time, value=0), gr.Slider(maximum=max_time, value=max_time), ] def change_mode(mode): # tts_mode_controls, edit_mode_controls, edit_word_mode, split_text, long_tts_sentence_editor return [ gr.Group(visible=mode != "Edit"), gr.Group(visible=mode == "Edit"), gr.Radio(visible=mode == "Edit"), gr.Radio(visible=mode == "Long TTS"), gr.Group(visible=mode == "Long TTS"), ] def load_sentence(selected_sentence, codec_audio_sr, audio_tensors): if selected_sentence is None: return None colon_position = selected_sentence.find(':') selected_sentence_idx = int(selected_sentence[:colon_position]) return get_output_audio([audio_tensors[selected_sentence_idx]], codec_audio_sr) def update_bound_word(is_first_word, selected_word, edit_word_mode): if selected_word is None: return None word_start_time = float(selected_word.split(' ')[0]) word_end_time = float(selected_word.split(' ')[-1]) if edit_word_mode == "Replace half": bound_time = (word_start_time + word_end_time) / 2 elif is_first_word: bound_time = word_start_time else: bound_time = word_end_time return bound_time def update_bound_words(from_selected_word, to_selected_word, edit_word_mode): return [ update_bound_word(True, from_selected_word, 
edit_word_mode), update_bound_word(False, to_selected_word, edit_word_mode), ] smart_transcript_info = """ If enabled, the target transcript will be constructed for you:
- In TTS and Long TTS mode just write the text you want to synthesize.
- In Edit mode just write the text to replace selected editing segment.
If disabled, you should write the target transcript yourself:
- In TTS mode write prompt transcript followed by generation transcript.
- In Long TTS select split by newline (SENTENCE SPLIT WON'T WORK) and start each line with a prompt transcript.
- In Edit mode write full prompt
""" demo_original_transcript = "Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather." demo_text = { "TTS": { "smart": "I cannot believe that the same model can also do text to speech synthesis too!", "regular": "Gwynplaine had, besides, for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!" }, "Edit": { "smart": "take over the stage for half an hour,", "regular": "Gwynplaine had, besides, for his work and for his feats of strength, take over the stage for half an hour, an esclavine of leather." }, "Long TTS": { "smart": "You can run the model on a big text!\n" "Just write it line-by-line. Or sentence-by-sentence.\n" "If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!", "regular": "Gwynplaine had, besides, for his work and for his feats of strength, You can run the model on a big text!\n" "Gwynplaine had, besides, for his work and for his feats of strength, Just write it line-by-line. Or sentence-by-sentence.\n" "Gwynplaine had, besides, for his work and for his feats of strength, If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!" } } all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()} demo_words = ['0.069 Gwynplain 0.611', '0.671 had, 0.912', '0.952 besides, 1.414', '1.494 for 1.634', '1.695 his 1.835', '1.915 work 2.136', '2.196 and 2.297', '2.337 for 2.517', '2.557 his 2.678', '2.758 feats 3.019', '3.079 of 3.139', '3.2 strength, 3.561', '4.022 round 4.263', '4.303 his 4.444', '4.524 neck 4.705', '4.745 and 4.825', '4.905 over 5.086', '5.146 his 5.266', '5.307 shoulders, 5.768', '6.23 an 6.33', '6.531 esclavine 7.133', '7.213 of 7.293', '7.353 leather. 
def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word):
    """Keep the built-in demo text and demo edit words in sync with the UI switches.

    Nothing is touched once the user has typed their own text: the inputs are
    returned unchanged unless ``transcript`` is one of the bundled demo texts.
    Otherwise the demo text for ``mode`` / ``smart_transcript`` is returned, and
    an edit word is swapped only if it currently holds one of the two demo
    candidates for its position.

    Returns:
        (transcript, edit_from_word, edit_to_word) values for the UI.
    """
    # A transcript the user wrote themselves is never overwritten.
    if transcript not in all_demo_texts:
        return transcript, edit_from_word, edit_to_word

    # Demo candidates: words 2/3 for the first edited word, 11/12 for the last.
    if edit_word_mode == "Replace half":
        from_default, to_default = demo_words[2], demo_words[12]
    else:
        from_default, to_default = demo_words[3], demo_words[11]

    new_from = edit_from_word
    if edit_from_word in (demo_words[2], demo_words[3]):
        new_from = from_default

    new_to = edit_to_word
    if edit_to_word in (demo_words[11], demo_words[12]):
        new_to = to_default

    variant = "smart" if smart_transcript else "regular"
    return [demo_text[mode][variant], new_from, new_to]
Fix and align it if necessary.") with gr.Accordion("Word start time", open=False): transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word") with gr.Accordion("Word end time", open=False): transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word") transcribe_btn = gr.Button(value="Transcribe") align_btn = gr.Button(value="Align") with gr.Column(scale=3): with gr.Group(): transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"]) with gr.Row(): smart_transcript = gr.Checkbox(label="Smart transcript", value=True) with gr.Accordion(label="?", open=False): info = gr.Markdown(value=smart_transcript_info) with gr.Row(): mode = gr.Radio(label="Mode", choices=["TTS", "Edit", "Long TTS"], value="TTS") split_text = gr.Radio(label="Split text", choices=["Newline", "Sentence"], value="Newline", info="Split text into parts and run TTS for each part.", visible=False) edit_word_mode = gr.Radio(label="Edit word mode", choices=["Replace half", "Replace all"], value="Replace all", info="What to do with first and last word", visible=False) with gr.Group() as tts_mode_controls: prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[11], interactive=True) prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.614, step=0.001, value=3.600) with gr.Group(visible=False) as edit_mode_controls: with gr.Row(): edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[12], interactive=True) edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[18], interactive=True) with gr.Row(): edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.614, step=0.001, value=4.022) edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.614, step=0.001, value=5.768) run_btn = gr.Button(value="Run") with 
gr.Column(scale=2): output_audio = gr.Audio(label="Output Audio") with gr.Accordion("Inference transcript", open=False): inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False, info="Inference was performed on this transcript.") with gr.Group(visible=False) as long_tts_sentence_editor: sentence_selector = gr.Dropdown(label="Sentence", value=None, info="Select sentence you want to regenerate") sentence_audio = gr.Audio(label="Sentence Audio", scale=2) rerun_btn = gr.Button(value="Rerun") with gr.Row(): with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False): stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=3, info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled") sample_batch_size = gr.Number(label="speech rate", value=3, precision=0, info="The higher the number, the faster the output will be. " "Under the hood, the model will generate this many samples and choose the shortest one. 
" "For giga330M_TTSEnhanced, 1 or 2 should be fine since the model is trained to do TTS.") seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)") kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1, info="set to 0 to use less VRAM, but with slower inference") left_margin = gr.Number(label="left_margin", value=0.08, info="margin to the left of the editing segment") right_margin = gr.Number(label="right_margin", value=0.08, info="margin to the right of the editing segment") top_p = gr.Number(label="top_p", value=1, info="do not do topp sampling therefore set it to 1") temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change") top_k = gr.Number(label="top_k", value=40, info="40 is a good default, can also try 20, 30") codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change') codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change') silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change") audio_tensors = gr.State() transcribe_state = gr.State(value={"words_info": demo_words_info}) mode.change(fn=update_demo, inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word], outputs=[transcript, edit_from_word, edit_to_word]) edit_word_mode.change(fn=update_demo, inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word], outputs=[transcript, edit_from_word, edit_to_word]) smart_transcript.change(fn=update_demo, inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word], outputs=[transcript, edit_from_word, edit_to_word]) load_models_btn.click(fn=load_models, inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, voicecraft_model_choice], outputs=[models_selector]) input_audio.upload(fn=update_input_audio, inputs=[input_audio], 
outputs=[prompt_end_time, edit_start_time, edit_end_time]) transcribe_btn.click(fn=transcribe, inputs=[seed, input_audio], outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, prompt_to_word, edit_from_word, edit_to_word, transcribe_state]) align_btn.click(fn=align, inputs=[seed, original_transcript, input_audio], outputs=[transcript_with_start_time, transcript_with_end_time, prompt_to_word, edit_from_word, edit_to_word, transcribe_state]) mode.change(fn=change_mode, inputs=[mode], outputs=[tts_mode_controls, edit_mode_controls, edit_word_mode, split_text, long_tts_sentence_editor]) run_btn.click(fn=run, inputs=[ seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p, temperature, stop_repetition, sample_batch_size, kvcache, silence_tokens, input_audio, transcribe_state, transcript, smart_transcript, mode, prompt_end_time, edit_start_time, edit_end_time, split_text, sentence_selector, audio_tensors ], outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors]) sentence_selector.change(fn=load_sentence, inputs=[sentence_selector, codec_audio_sr, audio_tensors], outputs=[sentence_audio]) rerun_btn.click(fn=run, inputs=[ seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p, temperature, stop_repetition, sample_batch_size, kvcache, silence_tokens, input_audio, transcribe_state, transcript, smart_transcript, gr.State(value="Rerun"), prompt_end_time, edit_start_time, edit_end_time, split_text, sentence_selector, audio_tensors ], outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors]) prompt_to_word.change(fn=update_bound_word, inputs=[gr.State(False), prompt_to_word, gr.State("Replace all")], outputs=[prompt_end_time]) edit_from_word.change(fn=update_bound_word, inputs=[gr.State(True), edit_from_word, edit_word_mode], outputs=[edit_start_time]) edit_to_word.change(fn=update_bound_word, inputs=[gr.State(False), edit_to_word, edit_word_mode], outputs=[edit_end_time]) 
if __name__ == "__main__":
    # Command-line entry point: parse options, publish the paths as module
    # globals read by the rest of the app, then build and launch the UI.
    import argparse

    parser = argparse.ArgumentParser(description="VoiceCraft gradio app.")

    parser.add_argument("--demo-path", default="./demo", help="Path to demo directory")
    parser.add_argument("--tmp-path", default="./demo/temp", help="Path to tmp directory")
    parser.add_argument("--models-path", default="./pretrained_models", help="Path to voicecraft models directory")
    parser.add_argument("--port", default=7860, type=int, help="App port")
    parser.add_argument("--share", action="store_true", help="Launch with public url")
    # "--server-name" is accepted as an alias so the flag matches the hyphenated
    # style of the other options; the original "--server_name" spelling keeps
    # working and the destination is still ``args.server_name``.
    parser.add_argument("--server_name", "--server-name", default="127.0.0.1", type=str, help="Server name for launching the app. 127.0.0.1 for localhost; 0.0.0.0 to allow access from other machines in the local network. Might also give access to external users depends on the firewall settings.")

    # Give USER a fallback value when the environment does not define it
    # (presumably read by a dependency — the notebooks set it as well).
    os.environ["USER"] = os.getenv("USER", "user")
    args = parser.parse_args()
    DEMO_PATH = args.demo_path
    TMP_PATH = args.tmp_path
    MODELS_PATH = args.models_path

    # align_segments() writes its temporary transcript / sync-map files into
    # TMP_PATH, so make sure a custom --tmp-path exists before the app starts.
    os.makedirs(TMP_PATH, exist_ok=True)

    app = get_app()
    app.queue().launch(share=args.share, server_name=args.server_name, server_port=args.port)
"outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/pyp/miniconda3/envs/voicecraft/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "# import libs\n", "import torch\n", "import torchaudio\n", "import numpy as np\n", "import random\n", "from argparse import Namespace\n", "\n", "from data.tokenizer import (\n", " AudioTokenizer,\n", " TextTokenizer,\n", ")\n", "\n", "from models import voicecraft" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# install MFA models and dictionaries if you haven't done so already\n", "!source ~/.bashrc && \\\n", " conda activate voicecraft && \\\n", " mfa model download dictionary english_us_arpa && \\\n", " mfa model download acoustic english_us_arpa" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# hyperparameters for inference\n", "left_margin = 0.08\n", "right_margin = 0.08\n", "codec_audio_sr = 16000\n", "codec_sr = 50\n", "top_k = 40\n", "top_p = 1\n", "temperature = 1\n", "kvcache = 0\n", "# adjust the below three arguments if the generation is not as good\n", "seed = 1 # random seed magic\n", "silence_tokens = [1388,1898,131] # if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1\n", "stop_repetition = -1 # -1 means do not adjust prob of silence tokens. 
if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4\n", "# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest\n", "def seed_everything(seed):\n", " os.environ['PYTHONHASHSEED'] = str(seed)\n", " random.seed(seed)\n", " np.random.seed(seed)\n", " torch.manual_seed(seed)\n", " torch.cuda.manual_seed(seed)\n", " torch.backends.cudnn.benchmark = False\n", " torch.backends.cudnn.deterministic = True\n", "seed_everything(seed)\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "# load model, tokenizer, and other necessary files\n", "voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n", "\n", "# the new way of loading the model, with huggingface, recommended\n", "from models import voicecraft\n", "model = voicecraft.VoiceCraft.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n", "phn2num = model.args.phn2num\n", "config = vars(model.args)\n", "model.to(device)\n", "\n", "# # the old way of loading the model\n", "# from models import voicecraft\n", "# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n", "# ckpt = torch.load(filepath, map_location=\"cpu\")\n", "# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n", "# model.load_state_dict(ckpt[\"model\"])\n", "# config = vars(model.args)\n", "# phn2num = ckpt[\"phn2num\"]\n", "# model.to(device)\n", "# model.eval()\n", "\n", "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n", "if not os.path.exists(encodec_fn):\n", " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n", " os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n", "audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n", "\n", "text_tokenizer 
= TextTokenizer(backend=\"espeak\")\n", "\n", "# point to the original file or record the file\n", "# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n", "orig_audio = \"./demo/84_121550_000074_000000.wav\"\n", "orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n", "# move the audio and transcript to temp folder\n", "temp_folder = \"./demo/temp\"\n", "os.makedirs(temp_folder, exist_ok=True)\n", "os.system(f\"cp {orig_audio} {temp_folder}\")\n", "filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n", "with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n", " f.write(orig_transcript)\n", "# run MFA to get the alignment\n", "align_temp = f\"{temp_folder}/mfa_alignments\"\n", "os.makedirs(align_temp, exist_ok=True)\n", "os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n", "# if it fail, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n", "# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n", "audio_fn = f\"{temp_folder}/{filename}.wav\"\n", "transcript_fn = f\"{temp_folder}/{filename}.txt\"\n", "align_fn = f\"{align_temp}/{filename}.csv\"\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "original:\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "edited:\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "editTypes_set = 
set(['substitution', 'insertion', 'deletion'])\n", "# propose what do you want the target modified transcript to be\n", "target_transcript = \"But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,\"\n", "edit_type = \"substitution\"\n", "assert edit_type in editTypes_set, f\"Invalid edit type {edit_type}. Must be one of {editTypes_set}.\"\n", "\n", "# if you want to do a second modification on top of the first one, write down the second modification (target_transcript2, type_of_modification2)\n", "# make sure the two modification do not overlap, if they do, you need to combine them into one modification\n", "\n", "# run the script to turn user input to the format that the model can take\n", "from edit_utils import get_span\n", "orig_span, new_span = get_span(orig_transcript, target_transcript, edit_type)\n", "if orig_span[0] > orig_span[1]:\n", " RuntimeError(f\"example {audio_fn} failed\")\n", "if orig_span[0] == orig_span[1]:\n", " orig_span_save = [orig_span[0]]\n", "else:\n", " orig_span_save = orig_span\n", "if new_span[0] == new_span[1]:\n", " new_span_save = [new_span[0]]\n", "else:\n", " new_span_save = new_span\n", "\n", "orig_span_save = \",\".join([str(item) for item in orig_span_save])\n", "new_span_save = \",\".join([str(item) for item in new_span_save])\n", "from inference_speech_editing_scale import get_mask_interval\n", "\n", "start, end = get_mask_interval(align_fn, orig_span_save, edit_type)\n", "info = torchaudio.info(audio_fn)\n", "audio_dur = info.num_frames / info.sample_rate\n", "morphed_span = (max(start - left_margin, 1/codec_sr), min(end + right_margin, audio_dur)) # in seconds\n", "\n", "# span in codec frames\n", "mask_interval = [[round(morphed_span[0]*codec_sr), round(morphed_span[1]*codec_sr)]]\n", "mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now\n", "\n", "\n", "\n", "# run the model to get the output\n", "from inference_speech_editing_scale 
import inference_one_sample\n", "\n", "decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens}\n", "orig_audio, new_audio = inference_one_sample(model, Namespace(**config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config)\n", " \n", "# save segments for comparison\n", "orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu()\n", "# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n", "\n", "# display the audio\n", "from IPython.display import Audio\n", "print(\"original:\")\n", "display(Audio(orig_audio, rate=codec_audio_sr))\n", "\n", "print(\"edited:\")\n", "display(Audio(new_audio, rate=codec_audio_sr))\n", "\n", "# # save the audio\n", "# # output_dir\n", "# output_dir = \"./demo/generated_se\"\n", "# os.makedirs(output_dir, exist_ok=True)\n", "\n", "# save_fn_new = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{seed}.wav\"\n", "\n", "# torchaudio.save(save_fn_new, new_audio, codec_audio_sr)\n", "\n", "# save_fn_orig = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav\"\n", "# if not os.path.isfile(save_fn_orig):\n", "# orig_audio, orig_sr = torchaudio.load(audio_fn)\n", "# if orig_sr != codec_audio_sr:\n", "# orig_audio = torchaudio.transforms.Resample(orig_sr, codec_audio_sr)(orig_audio)\n", "# torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr)\n", "\n", "# # if you get error importing T5 in transformers\n", "# # try \n", "# # pip uninstall Pillow\n", "# # pip install Pillow\n", "# # you are likely to get warning looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": 
def get_args():
    """Parse the command-line options of the speech-editing inference script.

    Returns:
        argparse.Namespace: the parsed options. Besides the paths
        (``manifest_fn``, ``audio_root``, ``exp_dir``, ``output_dir``,
        ``signature``) it carries the margins added around the edited span,
        the codec sample rates and the sampling hyper-parameters.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input / output locations
    parser.add_argument("--manifest_fn", type=str, default="path/to/eval_metadata_file")
    parser.add_argument("--audio_root", type=str, default="path/to/audio_folder")
    parser.add_argument("--exp_dir", type=str, default="path/to/model_folder")
    # extra context (in seconds) added on each side of the span to be edited
    parser.add_argument("--left_margin", type=float, default=0.08, help="extra space on the left to the word boundary")
    parser.add_argument("--right_margin", type=float, default=0.08, help="extra space on the right to the word boundary")
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--codec_audio_sr", type=int, default=16000, help='the sample rate of audio that the codec is trained for')
    parser.add_argument("--codec_sr", type=int, default=50, help='the sample rate of the codec codes')
    # sampling hyper-parameters
    parser.add_argument("--top_k", type=int, default=-1, help="sampling param")
    parser.add_argument("--top_p", type=float, default=0.8, help="sampling param")
    parser.add_argument("--temperature", type=float, default=1.0, help="sampling param")
    parser.add_argument("--output_dir", type=str, default=None)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--signature", type=str, default=None, help="path to the encodec model")
    # The script's __main__ section reads args.crop_concat; declare it here
    # (same definition as in inference_tts_scale.py) so that access cannot
    # fail with AttributeError.
    parser.add_argument("--crop_concat", type=int, default=0)
    parser.add_argument("--stop_repetition", type=int, default=2, help="used for inference, when the number of consecutive repetition of a token is bigger than this, stop it")
    parser.add_argument("--kvcache", type=int, default=1, help='if true, use kv cache, which is 4-8x faster than without')
    parser.add_argument("--silence_tokens", type=str, default="[1388,1898,131]", help="note that if you are not using the pretrained encodec 6f79c6a8, make sure you specified it yourself, rather than using the default")
    return parser.parse_args()
def get_model(exp_dir, device=None):
    """Load a trained VoiceCraft model and its phoneme vocabulary from ``exp_dir``.

    Args:
        exp_dir: folder containing ``args.pkl`` (pickled model arguments) and
            ``best_bundle.pth`` (a dict with the keys ``'model'`` and ``'phn2num'``).
        device: device to move the model to. When ``None``, ``cuda:0`` is used
            if CUDA is available and the CPU otherwise.

    Returns:
        tuple: ``(model, model_args, phn2num)``; the model is in eval mode.
    """
    # NOTE(review): pickle.load can execute arbitrary code - only point this at
    # experiment folders you trust.
    with open(os.path.join(exp_dir, "args.pkl"), "rb") as f:
        model_args = pickle.load(f)

    logging.info("load model weights...")
    model = voicecraft.VoiceCraft(model_args)
    # Deserialise the bundle once; both the weights and phn2num live in it.
    bundle = torch.load(os.path.join(exp_dir, "best_bundle.pth"), map_location='cpu')
    phn2num = bundle['phn2num']
    model.load_state_dict(bundle['model'])
    del bundle
    logging.info("done loading weights...")
    if device is None:
        device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()
    return model, model_args, phn2num


def get_mask_interval(ali_fn, word_span_ind, editType):
    """Translate a word-index span into a (start, end) time interval.

    The alignment file is read as comma-separated rows; the first row is a
    header and is dropped. In each remaining row column 0 and column 1 are
    parsed as the begin and end time and column 3 is the row type. Row
    indices in ``word_span_ind`` count from 0 after the header.

    Args:
        ali_fn: path of the alignment CSV.
        word_span_ind: ``"s,e"`` (or just ``"s"``, meaning ``e == s``).
        editType: for ``'insertion'`` the interval is the gap between the two
            rows (end of row ``s`` to begin of row ``e``); for every other value
            it runs from the begin of row ``s`` to the end of row ``e``.

    Returns:
        tuple: ``(start, end)`` as floats.

    Raises:
        ValueError: if row ``s`` or row ``e`` is missing or is not a ``"words"`` row.
    """
    with open(ali_fn, "r") as rf:
        rows = [line.strip().split(",") for line in rf.readlines()][1:]
    bounds = word_span_ind.split(",")
    s, e = int(bounds[0]), int(bounds[-1])
    is_insertion = editType == 'insertion'
    start = end = None
    for j, item in enumerate(rows):
        if j == s and item[3] == "words":
            start = float(item[1]) if is_insertion else float(item[0])
        if j == e and item[3] == "words":
            end = float(item[0]) if is_insertion else float(item[1])
            break
    # Previously a missing end row surfaced as an UnboundLocalError on return.
    if start is None or end is None:
        raise ValueError(f"word span {word_span_ind!r} not found among the word rows of {ali_fn}")
    return (start, end)


if __name__ == "__main__":
    def seed_everything(seed):
        """Seed the Python, NumPy and torch RNGs and make cuDNN deterministic."""
        os.environ['PYTHONHASHSEED'] = str(seed)
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    formatter = (
        "%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d || %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    args = get_args()
    # args.device = 'cpu'
    # (the former `eval(args.allowed_repeat_tokens)` was removed: the parser
    #  defines no such option, so the script crashed here; the silence tokens
    #  are parsed inside inference_one_sample)
    if args.output_dir is None:
        raise ValueError("--output_dir must be provided")
    seed_everything(args.seed)

    # load model on the device that inference will use
    stime = time.time()
    logging.info(f"loading model from {args.exp_dir}")
    model, model_args, phn2num = get_model(args.exp_dir, args.device)
    if not os.path.isfile(model_args.exp_dir):
        model_args.exp_dir = args.exp_dir
    logging.info(f"loading model done, took {time.time() - stime:.4f} sec")

    # setup text and audio tokenizer
    text_tokenizer = TextTokenizer(backend="espeak")
    audio_tokenizer = AudioTokenizer(signature=args.signature) # will also put the neural codec model on gpu

    # tab-separated manifest with a header row; columns used below:
    # 0 = wav file name, 2 = "|"-separated transcripts (last one is the target),
    # 3 = "|"-separated word spans in the original, 5 = "|"-separated edit types
    with open(args.manifest_fn, "r") as rf:
        manifest = [l.strip().split("\t") for l in rf.readlines()]
    manifest = manifest[1:]

    audio_fns = []
    target_texts = []
    mask_intervals = []
    os.makedirs(args.output_dir, exist_ok=True)
    # the option may be absent from the parser, hence getattr
    if getattr(args, "crop_concat", 0):
        mfa_temp = f"{args.output_dir}/mfa_temp"
        os.makedirs(mfa_temp, exist_ok=True)
    for item in manifest:
        audio_fn = os.path.join(args.audio_root, item[0])
        temp = torchaudio.info(audio_fn)
        audio_dur = temp.num_frames/temp.sample_rate
        audio_fns.append(audio_fn)
        target_texts.append(item[2].split("|")[-1]) # the last transcript is the target

        # alignment lives in <audio_root>/aligned, with <audio_root>/aligned_csv as fallback
        alignment_fn = os.path.join(args.audio_root, "aligned", item[0].replace(".wav", ".csv"))
        if not os.path.isfile(alignment_fn):
            alignment_fn = alignment_fn.replace("/aligned/", "/aligned_csv/")
            assert os.path.isfile(alignment_fn), alignment_fn

        # one mask interval (in seconds) per edit, widened by the margins and
        # clipped to [1 codec frame, audio duration]
        mis = []
        for ind_inter, editType in zip(item[3].split("|"), item[5].split("|")):
            mi = get_mask_interval(alignment_fn, ind_inter, editType)
            mi = (max(mi[0] - args.left_margin, 1/args.codec_sr), min(mi[1] + args.right_margin, audio_dur)) # in seconds
            mis.append(mi)
        # order the intervals by their start time
        order = np.argsort([mi[0] for mi in mis])
        mask_intervals.append([mis[k] for k in order])

    for i, (audio_fn, target_text, mask_interval) in enumerate(tqdm.tqdm(zip(audio_fns, target_texts, mask_intervals))):
        # seconds -> codec frames
        mask_interval = [[round(cmi[0]*args.codec_sr), round(cmi[1]*args.codec_sr)] for cmi in mask_interval]
        # logging.info(f"i: {i}, mask_interval: {mask_interval}")
        mask_interval = torch.LongTensor(mask_interval) # [M,2]
        orig_audio, new_audio = inference_one_sample(model, model_args, phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_text, mask_interval, args.device, vars(args))

        # save segments for comparison
        orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu()
        # logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
        save_fn_new = f"{args.output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{args.seed}.wav"
        torchaudio.save(save_fn_new, new_audio, args.codec_audio_sr)

        # the reference is the original recording resampled to the codec rate;
        # it is written only once per input file
        save_fn_orig = f"{args.output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav"
        if not os.path.isfile(save_fn_orig):
            orig_audio, orig_sr = torchaudio.load(audio_fn)
            if orig_sr != args.codec_audio_sr:
                orig_audio = torchaudio.transforms.Resample(orig_sr, args.codec_audio_sr)(orig_audio)
            torchaudio.save(save_fn_orig, orig_audio, args.codec_audio_sr)
Select 'voicecraft' as the kernel" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# import libs\n", "# if this throws an error, something went wrong installing dependencies or changing the kernel above!\n", "import os\n", "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n", "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", "os.environ[\"USER\"] = \"me\" # TODO change this to your username\n", "\n", "import torch\n", "import torchaudio\n", "import numpy as np\n", "import random\n", "from argparse import Namespace\n", "\n", "from data.tokenizer import (\n", " AudioTokenizer,\n", " TextTokenizer,\n", ")\n", "from huggingface_hub import hf_hub_download" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# # install MFA models and dictionaries if you haven't done so already, already done in the dockerfile or envrionment setup\n", "# !source ~/.bashrc && \\\n", "# conda activate voicecraft && \\\n", "# mfa model download dictionary english_us_arpa && \\\n", "# mfa model download acoustic english_us_arpa" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Dora directory: /tmp/audiocraft_me\n" ] } ], "source": [ "# load model, encodec, and phn2num\n", "# # load model, tokenizer, and other necessary files\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "voicecraft_name=\"830M_TTSEnhanced.pth\" # or giga330M.pth, 330M_TTSEnhanced.pth, giga830M.pth\n", "\n", "# the new way of loading the model, with huggingface, recommended\n", "from models import voicecraft\n", "model = voicecraft.VoiceCraft.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n", "phn2num = model.args.phn2num\n", "config = vars(model.args)\n", "model.to(device)\n", "\n", "\n", "# # the old way of loading the model\n", "# from models import voicecraft\n", "# filepath = 
hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n", "# ckpt = torch.load(filepath, map_location=\"cpu\")\n", "# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n", "# model.load_state_dict(ckpt[\"model\"])\n", "# config = vars(model.args)\n", "# phn2num = ckpt[\"phn2num\"]\n", "# model.to(device)\n", "# model.eval()\n", "\n", "\n", "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n", "if not os.path.exists(encodec_fn):\n", " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n", " os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n", "audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device) # will also put the neural codec model on gpu\n", "\n", "text_tokenizer = TextTokenizer(backend=\"espeak\")\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Prepare your audio\n", "# point to the original audio whose speech you want to clone\n", "# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n", "orig_audio = \"./demo/5895_34622_000026_000002.wav\"\n", "orig_transcript = \"Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather.\"\n", "\n", "# move the audio and transcript to temp folder\n", "temp_folder = \"./demo/temp\"\n", "os.makedirs(temp_folder, exist_ok=True)\n", "os.system(f\"cp {orig_audio} {temp_folder}\")\n", "filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n", "with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n", " f.write(orig_transcript)\n", "# run MFA to get the alignment\n", "align_temp = f\"{temp_folder}/mfa_alignments\"\n", "!source ~/.bashrc && \\\n", " conda activate voicecraft && \\\n", " mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n", " 
english_us_arpa english_us_arpa {align_temp}\n", "\n", "# # if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n", "# !source ~/.bashrc && \\\n", "# conda activate voicecraft && \\\n", "# mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n", "# english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n", "cut_off_sec = 3.6 # NOTE: according to forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word \"strength\" stop as 3.561 sec, so we use first 3.6 sec as the prompt. this should be different for different audio\n", "target_transcript = \"Gwynplaine had, besides, for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!\"\n", "# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 
3~6 sec.\n", "audio_fn = f\"{temp_folder}/{filename}.wav\"\n", "info = torchaudio.info(audio_fn)\n", "audio_dur = info.num_frames / info.sample_rate\n", "\n", "assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n", "prompt_end_frame = int(cut_off_sec * info.sample_rate)\n", "\n", "# run the model to get the output\n", "# hyperparameters for inference\n", "codec_audio_sr = 16000\n", "codec_sr = 50\n", "top_k = 40 # can also try 20, 30, 50\n", "top_p = 1 # 1 means do not do top-p sampling\n", "temperature = 1\n", "silence_tokens=[1388,1898,131]\n", "kvcache = 1 # NOTE if OOM, change this to 0, or try the 330M model\n", "\n", "# NOTE adjust the below three arguments if the generation is not as good\n", "stop_repetition = 3 # NOTE if the model generate long silence, reduce the stop_repetition to 3, 2 or even 1\n", "sample_batch_size = 3 # NOTE: if the if there are long silence or unnaturally strecthed words, increase sample_batch_size to 4 or higher. What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. 
So if the speech rate of the generated is too fast change it to a smaller number.\n", "seed = 1 # change seed if you are still unhappy with the result\n", "\n", "def seed_everything(seed):\n", " os.environ['PYTHONHASHSEED'] = str(seed)\n", " random.seed(seed)\n", " np.random.seed(seed)\n", " torch.manual_seed(seed)\n", " torch.cuda.manual_seed(seed)\n", " torch.backends.cudnn.benchmark = False\n", " torch.backends.cudnn.deterministic = True\n", "seed_everything(seed)\n", "\n", "decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens, \"sample_batch_size\": sample_batch_size}\n", "from inference_tts_scale import inference_one_sample\n", "concated_audio, gen_audio = inference_one_sample(model, Namespace(**config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n", " \n", "# save segments for comparison\n", "concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()\n", "# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n", "\n", "\n", "# display the audio\n", "from IPython.display import Audio\n", "print(\"concatenate prompt and generated:\")\n", "display(Audio(concated_audio, rate=codec_audio_sr))\n", "\n", "print(\"generated:\")\n", "display(Audio(gen_audio, rate=codec_audio_sr))\n", "\n", "# # save the audio\n", "# # output_dir\n", "# output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n", "# os.makedirs(output_dir, exist_ok=True)\n", "# seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n", "# seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\" \n", "\n", "# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n", "# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n", "\n", "# 
def get_args():
    """Build the CLI parser of the TTS inference script and parse ``sys.argv``.

    The options are declared in a table (flag, keyword arguments) and
    registered in that order, so ``--help`` lists them as before.

    Returns:
        argparse.Namespace: the parsed command-line options.
    """
    option_table = [
        ("--manifest_fn", dict(type=str, default="path/to/eval_metadata_file")),
        ("--audio_root", dict(type=str, default="path/to/audio_folder")),
        ("--exp_dir", dict(type=str, default="path/to/model_folder")),
        ("--seed", dict(type=int, default=1)),
        ("--codec_audio_sr", dict(type=int, default=16000, help='the sample rate of audio that the codec is trained for')),
        ("--codec_sr", dict(type=int, default=50, help='the sample rate of the codec codes')),
        ("--top_k", dict(type=int, default=40, help="sampling param")),
        ("--top_p", dict(type=float, default=1, help="sampling param")),
        ("--temperature", dict(type=float, default=1.0, help="sampling param")),
        ("--output_dir", dict(type=str, default=None)),
        ("--device", dict(type=str, default="cuda")),
        ("--signature", dict(type=str, default=None, help="path to the encodec model")),
        ("--crop_concat", dict(type=int, default=0)),
        ("--stop_repetition", dict(type=int, default=-1, help="used for inference, when the number of consecutive repetition of a token is bigger than this, stop it")),
        ("--kvcache", dict(type=int, default=1, help='if true, use kv cache, which is 4-8x faster than without')),
        ("--sample_batch_size", dict(type=int, default=1, help="batch size for sampling, NOTE that it's not running inference for several samples, but duplicate one input sample batch_size times, and during inference, we only return the shortest generation")),
        ("--silence_tokens", dict(type=str, default="[1388,1898,131]", help="note that if you are not using the pretrained encodec 6f79c6a8, make sure you specified it yourself, rather than using the default")),
    ]

    cli = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
def get_model(exp_dir, device=None):
    """Load a trained VoiceCraft model and its phoneme vocabulary from ``exp_dir``.

    Args:
        exp_dir: folder containing ``args.pkl`` (pickled model arguments) and
            ``best_bundle.pth`` (a dict with the keys ``'model'`` and ``'phn2num'``).
        device: device to move the model to. When ``None``, ``cuda:0`` is used
            if CUDA is available and the CPU otherwise.

    Returns:
        tuple: ``(model, model_args, phn2num)``; the model is in eval mode.
    """
    # NOTE(review): pickle.load can execute arbitrary code - only point this at
    # experiment folders you trust.
    with open(os.path.join(exp_dir, "args.pkl"), "rb") as f:
        model_args = pickle.load(f)

    logging.info("load model weights...")
    model = voicecraft.VoiceCraft(model_args)
    # Read the checkpoint bundle from disk a single time; it holds both the
    # state dict and the phoneme-to-index mapping.
    bundle = torch.load(os.path.join(exp_dir, "best_bundle.pth"), map_location='cpu')
    phn2num = bundle['phn2num']
    model.load_state_dict(bundle['model'])
    del bundle
    logging.info("done loading weights...")
    if device is None:
        device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()
    return model, model_args, phn2num
if __name__ == "__main__":
    # Training entry point: parse the config, persist or restore the run
    # arguments in <exp_dir>/args.pkl, join the NCCL process group and train.
    formatter = (
        "%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d || %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)

    torch.cuda.empty_cache()
    args = MyParser().parse_args()
    logging.info(args)
    exp_dir = Path(args.exp_dir)
    exp_dir.mkdir(exist_ok=True, parents=True)
    logging.info(f"exp_dir: {str(exp_dir)}")

    if args.resume:
        # Resuming: start from the arguments saved by the original run and let
        # every value given for this invocation override the stored one.
        resume = args.resume
        assert(bool(args.exp_dir))
        with open("%s/args.pkl" % args.exp_dir, "rb") as f:
            old_args = vars(pickle.load(f))
        old_args.update(vars(args))
        args = argparse.Namespace(**old_args)
        args.resume = resume
    else:
        # Fresh run: remember the arguments so the run can be resumed later.
        with open("%s/args.pkl" % args.exp_dir, "wb") as f:
            pickle.dump(args, f)

    dist.init_process_group(backend='nccl', init_method='env://')
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    # `rank` is the global rank; on a multi-node job it can exceed the number
    # of GPUs on this machine, so map it onto a local device index. On a
    # single node this is the same device as before.
    torch.cuda.set_device(rank % torch.cuda.device_count())
    my_trainer = trainer.Trainer(args, world_size, rank)
    my_trainer.train()
The pattern provides convenient methods to build and revert interleaved sequences from it: ``build_pattern_sequence`` maps a given a dense input tensor of multi-codebook sequence from [B, K, T] to the interleaved sequence of shape [B, K, S] applying the pattern, with S being the batch size, K being the number of codebooks, T the number of original timesteps and S the number of sequence steps for the output sequence. The unfilled positions are replaced with a special token and the built sequence is returned along with a mask indicating valid tokens. ``revert_pattern_sequence`` maps back an interleaved sequence of shape [B, K, S] to the original alignment of codebooks across timesteps to an output tensor of shape [B, K, T], using again a special token and a mask to fill and specify invalid positions if needed. See the dedicated methods for more details. """ # Pattern layout, for each sequence step, we have a list of coordinates # corresponding to the original codebook timestep and position. # The first list is always an empty list in order to properly insert # a special token to start with. layout: PatternLayout timesteps: int n_q: int def __post_init__(self): assert len(self.layout) > 0 assert self.layout[0] == [] self._validate_layout() self._build_reverted_sequence_scatter_indexes = lru_cache(100)(self._build_reverted_sequence_scatter_indexes) self._build_pattern_sequence_scatter_indexes = lru_cache(100)(self._build_pattern_sequence_scatter_indexes) # logging.info("New pattern, time steps: %d, sequence steps: %d", self.timesteps, len(self.layout)) def _validate_layout(self): """Runs checks on the layout to ensure a valid pattern is defined. A pattern is considered invalid if: - Multiple timesteps for a same codebook are defined in the same sequence step - The timesteps for a given codebook are not in ascending order as we advance in the sequence (this would mean that we have future timesteps before past timesteps). 
""" q_timesteps = {q: 0 for q in range(self.n_q)} for s, seq_coords in enumerate(self.layout): if len(seq_coords) > 0: qs = set() for coord in seq_coords: qs.add(coord.q) last_q_timestep = q_timesteps[coord.q] assert coord.t >= last_q_timestep, \ f"Past timesteps are found in the sequence for codebook = {coord.q} at step {s}" q_timesteps[coord.q] = coord.t # each sequence step contains at max 1 coordinate per codebook assert len(qs) == len(seq_coords), \ f"Multiple entries for a same codebook are found at step {s}" @property def num_sequence_steps(self): return len(self.layout) - 1 @property def max_delay(self): max_t_in_seq_coords = 0 for seq_coords in self.layout[1:]: for coords in seq_coords: max_t_in_seq_coords = max(max_t_in_seq_coords, coords.t + 1) return max_t_in_seq_coords - self.timesteps @property def valid_layout(self): valid_step = len(self.layout) - self.max_delay return self.layout[:valid_step] def get_sequence_coords_with_timestep(self, t: int, q: tp.Optional[int] = None): """Get codebook coordinates in the layout that corresponds to the specified timestep t and optionally to the codebook q. Coordinates are returned as a tuple with the sequence step and the actual codebook coordinates. 
""" assert t <= self.timesteps, "provided timesteps is greater than the pattern's number of timesteps" if q is not None: assert q <= self.n_q, "provided number of codebooks is greater than the pattern's number of codebooks" coords = [] for s, seq_codes in enumerate(self.layout): for code in seq_codes: if code.t == t and (q is None or code.q == q): coords.append((s, code)) return coords def get_steps_with_timestep(self, t: int, q: tp.Optional[int] = None) -> tp.List[int]: return [step for step, coords in self.get_sequence_coords_with_timestep(t, q)] def get_first_step_with_timesteps(self, t: int, q: tp.Optional[int] = None) -> tp.Optional[int]: steps_with_timesteps = self.get_steps_with_timestep(t, q) return steps_with_timesteps[0] if len(steps_with_timesteps) > 0 else None def _build_pattern_sequence_scatter_indexes(self, timesteps: int, n_q: int, keep_only_valid_steps: bool, device: tp.Union[torch.device, str] = 'cpu'): """Build scatter indexes corresponding to the pattern, up to the provided sequence_steps. Args: timesteps (int): Maximum number of timesteps steps to consider. keep_only_valid_steps (bool): Restrict the pattern layout to match only valid steps. device (Union[torch.device, str]): Device for created tensors. Returns: indexes (torch.Tensor): Indexes corresponding to the sequence, of shape [K, S]. mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes, of shape [K, S]. """ assert n_q == self.n_q, f"invalid number of codebooks for the sequence and the pattern: {n_q} != {self.n_q}" assert timesteps <= self.timesteps, "invalid number of timesteps used to build the sequence from the pattern" # use the proper layout based on whether we limit ourselves to valid steps only or not, # note that using the valid_layout will result in a truncated sequence up to the valid steps ref_layout = self.valid_layout if keep_only_valid_steps else self.layout # single item indexing being super slow with pytorch vs. 
numpy, so we use numpy here indexes = torch.zeros(n_q, len(ref_layout), dtype=torch.long).numpy() mask = torch.zeros(n_q, len(ref_layout), dtype=torch.bool).numpy() # fill indexes with last sequence step value that will correspond to our special token # the last value is n_q * timesteps as we have flattened z and append special token as the last token # which will correspond to the index: n_q * timesteps indexes[:] = n_q * timesteps # iterate over the pattern and fill scattered indexes and mask for s, sequence_coords in enumerate(ref_layout): for coords in sequence_coords: if coords.t < timesteps: indexes[coords.q, s] = coords.t + coords.q * timesteps mask[coords.q, s] = 1 indexes = torch.from_numpy(indexes).to(device) mask = torch.from_numpy(mask).to(device) return indexes, mask def build_pattern_sequence(self, z: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False): """Build sequence corresponding to the pattern from the input tensor z. The sequence is built using up to sequence_steps if specified, and non-pattern coordinates are filled with the special token. Args: z (torch.Tensor): Input tensor of multi-codebooks sequence, of shape [B, K, T]. special_token (int): Special token used to fill non-pattern coordinates in the new sequence. keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps. Steps that are beyond valid steps will be replaced by the special_token in that case. Returns: values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, S] with S corresponding either to the sequence_steps if provided, otherwise to the length of the pattern. indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, S]. mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, S]. 
""" B, K, T = z.shape indexes, mask = self._build_pattern_sequence_scatter_indexes( T, K, keep_only_valid_steps=keep_only_valid_steps, device=str(z.device) ) z = z.view(B, -1) # we append the special token as the last index of our flattened z tensor z = torch.cat([z, torch.zeros_like(z[:, :1]) + special_token], dim=1) values = z[:, indexes.view(-1)] values = values.view(B, K, indexes.shape[-1]) return values, indexes, mask def _build_reverted_sequence_scatter_indexes(self, sequence_steps: int, n_q: int, keep_only_valid_steps: bool = False, is_model_output: bool = False, device: tp.Union[torch.device, str] = 'cpu'): """Builds scatter indexes required to retrieve the original multi-codebook sequence from interleaving pattern. Args: sequence_steps (int): Sequence steps. n_q (int): Number of codebooks. keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps. Steps that are beyond valid steps will be replaced by the special_token in that case. is_model_output (bool): Whether to keep the sequence item corresponding to initial special token or not. device (Union[torch.device, str]): Device for created tensors. Returns: torch.Tensor: Indexes for reconstructing the output, of shape [K, T]. mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T]. """ ref_layout = self.valid_layout if keep_only_valid_steps else self.layout # TODO(jade): Do we want to further truncate to only valid timesteps here as well? timesteps = self.timesteps assert n_q == self.n_q, f"invalid number of codebooks for the sequence and the pattern: {n_q} != {self.n_q}" assert sequence_steps <= len(ref_layout), \ f"sequence to revert is longer than the defined pattern: {sequence_steps} > {len(ref_layout)}" # ensure we take the appropriate indexes to keep the model output from the first special token as well if is_model_output: ref_layout = ref_layout[1:] # single item indexing being super slow with pytorch vs. 
numpy, so we use numpy here indexes = torch.zeros(n_q, timesteps, dtype=torch.long).numpy() mask = torch.zeros(n_q, timesteps, dtype=torch.bool).numpy() # fill indexes with last sequence step value that will correspond to our special token indexes[:] = n_q * sequence_steps for s, sequence_codes in enumerate(ref_layout): if s < sequence_steps: for code in sequence_codes: if code.t < timesteps: indexes[code.q, code.t] = s + code.q * sequence_steps mask[code.q, code.t] = 1 indexes = torch.from_numpy(indexes).to(device) mask = torch.from_numpy(mask).to(device) return indexes, mask def revert_pattern_sequence(self, s: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False): """Revert a sequence built from the pattern back to the original multi-codebook sequence without interleaving. The sequence is reverted using up to timesteps if specified, and non-pattern coordinates are filled with the special token. Args: s (torch.Tensor): Interleaved sequence tensor obtained from the pattern, of shape [B, K, S]. special_token (int or float): Special token used to fill non-pattern coordinates in the new sequence. Returns: values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, T] with T corresponding either to the timesteps if provided, or the total timesteps in pattern otherwise. indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, T]. mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T]. 
""" B, K, S = s.shape indexes, mask = self._build_reverted_sequence_scatter_indexes( S, K, keep_only_valid_steps, is_model_output=False, device=str(s.device) ) s = s.view(B, -1) # we append the special token as the last index of our flattened z tensor s = torch.cat([s, torch.zeros_like(s[:, :1]) + special_token], dim=1) values = s[:, indexes.view(-1)] values = values.view(B, K, indexes.shape[-1]) return values, indexes, mask def revert_pattern_logits(self, logits: torch.Tensor, special_token: float, keep_only_valid_steps: bool = False): """Revert model logits obtained on a sequence built from the pattern back to a tensor matching the original sequence. This method is similar to ``revert_pattern_sequence`` with the following specificities: 1. It is designed to work with the extra cardinality dimension 2. We return the logits for the first sequence item that matches the special_token and which matching target in the original sequence is the first item of the sequence, while we skip the last logits as there is no matching target """ B, card, K, S = logits.shape indexes, mask = self._build_reverted_sequence_scatter_indexes( S, K, keep_only_valid_steps, is_model_output=True, device=logits.device ) logits = logits.reshape(B, card, -1) # we append the special token as the last index of our flattened z tensor logits = torch.cat([logits, torch.zeros_like(logits[:, :, :1]) + special_token], dim=-1) # [B, card, K x S] values = logits[:, :, indexes.view(-1)] values = values.view(B, card, K, indexes.shape[-1]) return values, indexes, mask class CodebooksPatternProvider(ABC): """Abstraction around providing pattern for interleaving codebooks. The CodebooksPatternProvider abstraction allows to implement various strategies to define interleaving pattern of sequences composed of multiple codebooks. For a given number of codebooks `n_q`, the pattern provider can generate a specified pattern corresponding to a sequence of `T` timesteps with `n_q` parallel codebooks. 
class DelayedPatternProvider(CodebooksPatternProvider):
    """Pattern provider that shifts each codebook by its own delay.

    Because every codebook can start later than the previous one, a single
    sequence step mixes codebooks taken from different timesteps.

    Example:
        Taking timesteps=4 and n_q=3, delays=None, the multi-codebook sequence:
        [[1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4]]
        The resulting sequence obtained from the returned pattern is:
        [[S, 1, 2, 3, 4],
        [S, S, 1, 2, 3],
        [S, S, S, 1, 2]]
        (with S being a special token)

    Args:
        n_q (int): Number of codebooks.
        delays (Optional[List[int]]): One non-decreasing delay per codebook.
            When omitted, codebook ``q`` is delayed by ``q`` steps.
        flatten_first (int): Number of leading timesteps emitted one codebook per step.
        empty_initial (int): Number of extra empty steps inserted after the initial one.
    """
    def __init__(self, n_q: int, delays: tp.Optional[tp.List[int]] = None,
                 flatten_first: int = 0, empty_initial: int = 0):
        super().__init__(n_q)
        self.delays = list(range(n_q)) if delays is None else delays
        self.flatten_first = flatten_first
        self.empty_initial = empty_initial
        assert len(self.delays) == self.n_q
        assert sorted(self.delays) == self.delays

    def get_pattern(self, timesteps: int) -> Pattern:
        """Build the delayed layout covering ``timesteps`` timesteps."""
        # the layout always opens with the empty step reserved for the special token
        layout: PatternLayout = [[]]
        largest_delay = max(self.delays)
        if self.empty_initial:
            layout.extend([] for _ in range(self.empty_initial))
        if self.flatten_first:
            # the first timesteps are fully flattened: one codebook per sequence step
            flattened_timesteps = min(timesteps, self.flatten_first)
            for t in range(flattened_timesteps):
                layout.extend([LayoutCoord(t, q)] for q in range(self.n_q))
        for t in range(self.flatten_first, timesteps + largest_delay):
            # a codebook only contributes once its delayed timestep is past the flattened prefix
            step_coords = [
                LayoutCoord(t - delay, q)
                for q, delay in enumerate(self.delays)
                if t - delay >= self.flatten_first
            ]
            layout.append(step_coords)
        return Pattern(layout, n_q=self.n_q, timesteps=timesteps)


class ParallelPatternProvider(DelayedPatternProvider):
    """Pattern provider emitting all codebooks of a timestep in the same step.

    It is the delayed pattern with every delay set to zero.

    Args:
        n_q (int): Number of codebooks.
    """
    def __init__(self, n_q: int):
        no_delays = [0 for _ in range(n_q)]
        super().__init__(n_q, no_delays)
The ``flattening`` parameter allows to specify the inner step for each of the codebook, allowing to define which codebook to flatten (or keep in parallel), for example taking n_q = 3, timesteps = 4 and flattening = [0, 1, 1]: [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]] will result into: [[S, 1, S, S, 2, S, S, 3, S, S, 4, S], [S, 1, S, S, 2, S, S, 3, S, S, 4, S], [1, S, S, 2, S, S, 3, S, S, 4, S, S]] 3. Flattening with delay. The ``delay`` parameter allows to further unroll the sequence of codebooks allowing to specify the delay per codebook. Note that the delay between codebooks flattened to the same inner timestep should be coherent. For example, taking n_q = 3, timesteps = 4, flattening = [0, 1, 1] and delays = [0, 3, 3]: [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]] will result into: [[S, S, S, 1, S, 2, S, 3, S, 4], [S, S, S, 1, S, 2, S, 3, S, 4], [1, 2, 3, S, 4, S, 5, S, 6, S]] Args: n_q (int): Number of codebooks. flattening (Optional[List[int]]): Flattening schema over the codebooks. If not defined, the codebooks will be flattened to 1 codebook per step, meaning that the sequence will have n_q extra steps for each timestep. delays (Optional[List[int]]): Delay for each of the codebooks. If not defined, no delay is added and therefore will default to [0] * ``n_q``. Note that two codebooks that will be flattened to the same inner step should have the same delay, otherwise the pattern is considered as invalid. 
""" FlattenedCodebook = namedtuple('FlattenedCodebook', ['codebooks', 'delay']) def __init__(self, n_q: int, flattening: tp.Optional[tp.List[int]] = None, delays: tp.Optional[tp.List[int]] = None): super().__init__(n_q) if flattening is None: flattening = list(range(n_q)) if delays is None: delays = [0] * n_q assert len(flattening) == n_q assert len(delays) == n_q assert sorted(flattening) == flattening assert sorted(delays) == delays self._flattened_codebooks = self._build_flattened_codebooks(delays, flattening) self.max_delay = max(delays) def _build_flattened_codebooks(self, delays: tp.List[int], flattening: tp.List[int]): """Build a flattened codebooks representation as a dictionary of inner step and the actual codebook indices corresponding to the flattened codebook. For convenience, we also store the delay associated to the flattened codebook to avoid maintaining an extra mapping. """ flattened_codebooks: dict = {} for q, (inner_step, delay) in enumerate(zip(flattening, delays)): if inner_step not in flattened_codebooks: flat_codebook = UnrolledPatternProvider.FlattenedCodebook(codebooks=[q], delay=delay) else: flat_codebook = flattened_codebooks[inner_step] assert flat_codebook.delay == delay, ( "Delay and flattening between codebooks is inconsistent: ", "two codebooks flattened to the same position should have the same delay." ) flat_codebook.codebooks.append(q) flattened_codebooks[inner_step] = flat_codebook return flattened_codebooks @property def _num_inner_steps(self): """Number of inner steps to unroll between timesteps in order to flatten the codebooks. """ return max([inner_step for inner_step in self._flattened_codebooks.keys()]) + 1 def num_virtual_steps(self, timesteps: int) -> int: return timesteps * self._num_inner_steps + 1 def get_pattern(self, timesteps: int) -> Pattern: """Builds pattern for delay across codebooks. Args: timesteps (int): Total numer of timesteps. 
class VALLEPattern(CodebooksPatternProvider):
    """Almost VALL-E style pattern.
    The first codebook is emitted on its own, one timestep per sequence step. The remaining
    ``n_q - 1`` codebooks follow, emitted together at each step and optionally delayed.

    Args:
        n_q (int): Number of codebooks.
        delays (Optional[List[int]]): Delay for each codebook other than the first one, i.e.
            ``n_q - 1`` non-decreasing values. If not defined, no delay is applied
            (``[0] * (n_q - 1)``).
    """
    def __init__(self, n_q: int, delays: tp.Optional[tp.List[int]] = None):
        super().__init__(n_q)
        if delays is None:
            delays = [0] * (n_q - 1)
        self.delays = delays
        assert len(self.delays) == self.n_q - 1
        assert sorted(self.delays) == self.delays

    def get_pattern(self, timesteps: int) -> Pattern:
        """Builds the pattern: first codebook alone, then the other codebooks in parallel.

        Args:
            timesteps (int): Total number of timesteps.
        """
        out: PatternLayout = [[]]
        for t in range(timesteps):
            out.append([LayoutCoord(t, 0)])
        # With a single codebook there is nothing left to emit; this also avoids calling
        # max() on an empty list of delays.
        if len(self.delays) == 0:
            return Pattern(out, n_q=self.n_q, timesteps=timesteps)
        max_delay = max(self.delays)
        for t in range(timesteps + max_delay):
            v = []
            for q, delay in enumerate(self.delays):
                t_for_q = t - delay
                if t_for_q >= 0:
                    # delays[q] describes codebook q + 1, codebook 0 was emitted above
                    v.append(LayoutCoord(t_for_q, q + 1))
            out.append(v)
        return Pattern(out, n_q=self.n_q, timesteps=timesteps)


class MusicLMPattern(CodebooksPatternProvider):
    """Almost MusicLM style pattern. This is equivalent to full flattening
    but in a different order.

    Args:
        n_q (int): Number of codebooks.
        group_by (int): Number of codebooks to group together. Must be positive; when ``n_q``
            is not a multiple of ``group_by`` the last group simply holds fewer codebooks.
    """
    def __init__(self, n_q: int, group_by: int = 2):
        super().__init__(n_q)
        assert group_by > 0, "group_by must be a positive number of codebooks"
        self.group_by = group_by

    def get_pattern(self, timesteps: int) -> Pattern:
        """Builds the pattern, flattening group after group of codebooks.

        Args:
            timesteps (int): Total number of timesteps.
        """
        out: PatternLayout = [[]]
        for offset in range(0, self.n_q, self.group_by):
            # clamp the last group so that no codebook index >= n_q is ever emitted
            group_end = min(offset + self.group_by, self.n_q)
            for t in range(timesteps):
                for q in range(offset, group_end):
                    out.append([LayoutCoord(t, q)])
        return Pattern(out, n_q=self.n_q, timesteps=timesteps)
import Callable, List, Optional, Tuple, Union from typing import TYPE_CHECKING if TYPE_CHECKING: from torch.types import _dtype as DType else: # The JIT doesn't understand Union, nor torch.dtype here DType = int def _canonical_mask( mask: Optional[Tensor], mask_name: str, other_type: Optional[DType], other_name: str, target_type: DType, check_other: bool = True, ) -> Optional[Tensor]: if mask is not None: _mask_dtype = mask.dtype _mask_is_float = torch.is_floating_point(mask) if _mask_dtype != torch.bool and not _mask_is_float: raise AssertionError( f"only bool and floating types of {mask_name} are supported") if check_other and other_type is not None: if _mask_dtype != other_type: warnings.warn( f"Support for mismatched {mask_name} and {other_name} " "is deprecated. Use same type for both instead." ) if not _mask_is_float: mask = ( torch.zeros_like(mask, dtype=target_type) .masked_fill_(mask, float("-inf")) ) return mask def _in_projection_packed( q: Tensor, k: Tensor, v: Tensor, w: Tensor, b: Optional[Tensor] = None, ) -> List[Tensor]: r""" Performs the in-projection step of the attention operation, using packed weights. Output is a triple containing projection tensors for query, key and value. Args: q, k, v: query, key and value tensors to be projected. For self-attention, these are typically the same tensor; for encoder-decoder attention, k and v are typically the same tensor. (We take advantage of these identities for performance if they are present.) Regardless, q, k and v must share a common embedding dimension; otherwise their shapes may vary. w: projection weights for q, k and v, packed into a single tensor. Weights are packed along dimension 0, in q, k, v order. b: optional projection biases for q, k and v, packed into a single tensor in q, k, v order. 
Shape: Inputs: - q: :math:`(..., E)` where E is the embedding dimension - k: :math:`(..., E)` where E is the embedding dimension - v: :math:`(..., E)` where E is the embedding dimension - w: :math:`(E * 3, E)` where E is the embedding dimension - b: :math:`E * 3` where E is the embedding dimension Output: - in output list :math:`[q', k', v']`, each output tensor will have the same shape as the corresponding input tensor. """ E = q.size(-1) if k is v: if q is k: # self-attention proj = F.linear(q, w, b) # reshape to 3, E and not E, 3 is deliberate for better memory coalescing and keeping same order as chunk() proj = proj.unflatten(-1, (3, E)).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() return proj[0], proj[1], proj[2] else: # encoder-decoder attention w_q, w_kv = w.split([E, E * 2]) if b is None: b_q = b_kv = None else: b_q, b_kv = b.split([E, E * 2]) q_proj = F.linear(q, w_q, b_q) kv_proj = F.linear(k, w_kv, b_kv) # reshape to 2, E and not E, 2 is deliberate for better memory coalescing and keeping same order as chunk() kv_proj = kv_proj.unflatten(-1, (2, E)).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() return (q_proj, kv_proj[0], kv_proj[1]) else: w_q, w_k, w_v = w.chunk(3) if b is None: b_q = b_k = b_v = None else: b_q, b_k, b_v = b.chunk(3) return F.linear(q, w_q, b_q), F.linear(k, w_k, b_k), F.linear(v, w_v, b_v) def _none_or_dtype(input: Optional[Tensor]) -> Optional[DType]: if input is None: return None elif isinstance(input, torch.Tensor): return input.dtype raise RuntimeError("input to _none_or_dtype() must be None or torch.Tensor") class MultiheadAttention(Module): r"""Allows the model to jointly attend to information from different representation subspaces as described in the paper: `Attention Is All You Need `_. Multi-Head Attention is defined as: .. math:: \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`. 
``forward()`` will use a special optimized implementation if all of the following conditions are met: - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor. This restriction will be loosened in the future.) - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad`` - training is disabled (using ``.eval()``) - dropout is 0 - ``add_bias_kv`` is ``False`` - ``add_zero_attn`` is ``False`` - ``batch_first`` is ``True`` and the input is batched - ``kdim`` and ``vdim`` are equal to ``embed_dim`` - at most one of ``key_padding_mask`` or ``attn_mask`` is passed - if a `NestedTensor `_ is passed, neither ``key_padding_mask`` nor ``attn_mask`` is passed If the optimized implementation is in use, a `NestedTensor `_ can be passed for ``query``/``key``/``value`` to represent padding more efficiently than using a padding mask. In this case, a `NestedTensor `_ will be returned, and an additional speedup proportional to the fraction of the input that is padding can be expected. Args: embed_dim: Total dimension of the model. num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``). dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout). bias: If specified, adds bias to input / output projection layers. Default: ``True``. add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``. add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1. Default: ``False``. kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``). vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``). batch_first: If ``True``, then the input and output tensors are provided as (batch, seq, feature). 
Default: ``False`` (seq, batch, feature). Examples:: >>> # xdoctest: +SKIP >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) >>> attn_output, attn_output_weights = multihead_attn(query, key, value) """ __constants__ = ["batch_first"] bias_k: Optional[torch.Tensor] bias_v: Optional[torch.Tensor] def __init__( self, embed_dim, num_heads, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None, batch_first=False, linear1_cls=Linear, linear2_cls=Linear, device=None, dtype=None, ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super(MultiheadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self._qkv_same_embed_dim = ( self.kdim == embed_dim and self.vdim == embed_dim ) self.num_heads = num_heads self.dropout = dropout self.batch_first = batch_first self.head_dim = embed_dim // num_heads assert ( self.head_dim * num_heads == self.embed_dim ), "embed_dim must be divisible by num_heads" if add_bias_kv: self.bias_k = Parameter( torch.empty((1, 1, embed_dim), **factory_kwargs) ) self.bias_v = Parameter( torch.empty((1, 1, embed_dim), **factory_kwargs) ) else: self.bias_k = self.bias_v = None if linear1_cls == Linear: if not self._qkv_same_embed_dim: self.q_proj_weight = Parameter( torch.empty((embed_dim, embed_dim), **factory_kwargs) ) self.k_proj_weight = Parameter( torch.empty((embed_dim, self.kdim), **factory_kwargs) ) self.v_proj_weight = Parameter( torch.empty((embed_dim, self.vdim), **factory_kwargs) ) self.register_parameter("in_proj_weight", None) else: # go down this route with voicecraft self.in_proj_weight = Parameter( torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) ) self.register_parameter("q_proj_weight", None) self.register_parameter("k_proj_weight", None) self.register_parameter("v_proj_weight", None) if bias: # True by default self.in_proj_bias = Parameter( torch.empty(3 * embed_dim, 
**factory_kwargs) ) else: self.register_parameter("in_proj_bias", None) self.out_proj = NonDynamicallyQuantizableLinear( embed_dim, embed_dim, bias=bias, **factory_kwargs ) self._reset_parameters() else: if not self._qkv_same_embed_dim: raise NotImplementedError else: self.in_proj_linear = linear1_cls( embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs ) self.in_proj_weight = self.in_proj_linear.weight self.register_parameter("q_proj_weight", None) self.register_parameter("k_proj_weight", None) self.register_parameter("v_proj_weight", None) if bias: self.in_proj_bias = self.in_proj_linear.bias else: self.register_parameter("in_proj_bias", None) self.out_proj = linear2_cls( embed_dim, embed_dim, bias=bias, **factory_kwargs ) if self.bias_k is not None: xavier_normal_(self.bias_k) if self.bias_v is not None: xavier_normal_(self.bias_v) self.add_zero_attn = add_zero_attn def _reset_parameters(self): if self._qkv_same_embed_dim: xavier_uniform_(self.in_proj_weight) else: xavier_uniform_(self.q_proj_weight) xavier_uniform_(self.k_proj_weight) xavier_uniform_(self.v_proj_weight) if self.in_proj_bias is not None: constant_(self.in_proj_bias, 0.0) constant_(self.out_proj.bias, 0.0) if self.bias_k is not None: xavier_normal_(self.bias_k) if self.bias_v is not None: xavier_normal_(self.bias_v) def __setstate__(self, state): # Support loading old MultiheadAttention checkpoints generated by v1.1.0 if "_qkv_same_embed_dim" not in state: state["_qkv_same_embed_dim"] = True super(MultiheadAttention, self).__setstate__(state) def forward( self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None, need_weights: bool = True, attn_mask: Optional[Tensor] = None, average_attn_weights: bool = True, past: Optional[Tensor] = None, ) -> Tuple[Tensor, Optional[Tensor]]: r""" Args: query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False`` or :math:`(N, L, E_q)` when ``batch_first=True``, where 
:math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``. Queries are compared against key-value pairs to produce the output. See "Attention Is All You Need" for more details. key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False`` or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length, :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``. See "Attention Is All You Need" for more details. value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``. See "Attention Is All You Need" for more details. key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key`` to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`. Binary and byte masks are supported. For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value. need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``. Default: ``True``. attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch. Binary, byte, and float masks are supported. 
For a binary mask, a ``True`` value indicates that the corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the corresponding position is not allowed to attend. For a float mask, the mask values will be added to the attention weight. average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads) Outputs: - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched, :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``, where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the embedding dimension ``embed_dim``. - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``, returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`. .. note:: `batch_first` argument is ignored for unbatched inputs. 
""" is_batched = query.dim() == 3 if key_padding_mask is not None: _kpm_dtype = key_padding_mask.dtype if _kpm_dtype != torch.bool and not torch.is_floating_point( key_padding_mask ): raise AssertionError( "only bool and floating types of key_padding_mask are supported" ) why_not_fast_path = "" if not is_batched: why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}" elif query is not key or key is not value: # When lifting this restriction, don't forget to either # enforce that the dtypes all match or test cases where # they don't! why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)" elif ( self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype ): why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match" elif ( self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype ): # this case will fail anyway, but at least they'll get a useful error message. 
why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match" elif self.training: why_not_fast_path = "training is enabled" elif not self.batch_first: why_not_fast_path = "batch_first was not True" elif self.bias_k is not None: why_not_fast_path = "self.bias_k was not None" elif self.bias_v is not None: why_not_fast_path = "self.bias_v was not None" elif self.dropout: why_not_fast_path = f"dropout was {self.dropout}, required zero" elif self.add_zero_attn: why_not_fast_path = "add_zero_attn was enabled" elif not self._qkv_same_embed_dim: why_not_fast_path = "_qkv_same_embed_dim was not True" elif attn_mask is not None: why_not_fast_path = "attn_mask was not None" elif query.is_nested and key_padding_mask is not None: why_not_fast_path = ( "key_padding_mask is not supported with NestedTensor input" ) elif self.num_heads % 2 == 1: why_not_fast_path = "num_heads is odd" elif torch.is_autocast_enabled(): why_not_fast_path = "autocast is enabled" if not why_not_fast_path: tensor_args = ( query, key, value, self.in_proj_weight, self.in_proj_bias, self.out_proj.weight, self.out_proj.bias, ) # We have to use list comprehensions below because TorchScript does not support # generator expressions. 
if torch.overrides.has_torch_function(tensor_args): why_not_fast_path = "some Tensor argument has_torch_function" elif not all( [ (x is None or x.is_cuda or "cpu" in str(x.device)) for x in tensor_args ] ): why_not_fast_path = ( "some Tensor argument is neither CUDA nor CPU" ) elif torch.is_grad_enabled() and any( [x is not None and x.requires_grad for x in tensor_args] ): why_not_fast_path = ( "grad is enabled and at least one of query or the " "input/output projection weights or biases requires_grad" ) if not why_not_fast_path: return torch._native_multi_head_attention( query, key, value, self.embed_dim, self.num_heads, self.in_proj_weight, self.in_proj_bias, self.out_proj.weight, self.out_proj.bias, key_padding_mask if key_padding_mask is not None else attn_mask, need_weights, average_attn_weights, 1 if key_padding_mask is not None else 0 if attn_mask is not None else None, ) any_nested = query.is_nested or key.is_nested or value.is_nested assert not any_nested, ( "MultiheadAttention does not support NestedTensor outside of its fast path. 
" + f"The fast path was not hit because {why_not_fast_path}" ) if self.batch_first and is_batched: # make sure that the transpose op does not affect the "is" property if key is value: if query is key: query = key = value = query.transpose(1, 0) else: query, key = [x.transpose(1, 0) for x in (query, key)] value = key else: query, key, value = [ x.transpose(1, 0) for x in (query, key, value) ] if not self._qkv_same_embed_dim: attn_output, attn_output_weights = F.multi_head_attention_forward( query, key, value, self.embed_dim, self.num_heads, self.in_proj_weight, self.in_proj_bias, self.bias_k, self.bias_v, self.add_zero_attn, self.dropout, self.out_proj.weight, self.out_proj.bias, training=self.training, key_padding_mask=key_padding_mask, need_weights=need_weights, attn_mask=attn_mask, use_separate_proj_weight=True, q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight, v_proj_weight=self.v_proj_weight, average_attn_weights=average_attn_weights, ) else: # re-write the self.attention here, to get k, v cache tgt_len, bsz, embed_dim = query.shape src_len, _, _ = key.shape num_heads = self.num_heads key_padding_mask = _canonical_mask( mask=key_padding_mask, mask_name="key_padding_mask", other_type=_none_or_dtype(attn_mask), other_name="attn_mask", target_type=query.dtype ) attn_mask = _canonical_mask( mask=attn_mask, mask_name="attn_mask", other_type=None, other_name="", target_type=query.dtype, check_other=False, ) head_dim = self.embed_dim // self.num_heads assert head_dim * self.num_heads == self.embed_dim, f"embed_dim {self.embed_dim} not divisible by num_heads {self.num_heads}" assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}" q, k, v = _in_projection_packed(query, key, value, self.in_proj_weight, self.in_proj_bias) # k_present, v_present = k, v # # reshape q, k, v for multihead attention and make em batch first # q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) k = k.view(k.shape[0], bsz * 
num_heads, head_dim).transpose(0, 1) v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1) # (bsz * num_heads, src_len, head_dim) src_len = k.size(1) if past is not None and past.ndim > 2: expected_src_len = src_len + past[0].shape[-2] else: expected_src_len = src_len # ensure attn_mask's dim is 3 if attn_mask.dim() == 2: correct_2d_size = (tgt_len, expected_src_len) if attn_mask.shape != correct_2d_size: raise RuntimeError(f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.") attn_mask = attn_mask.unsqueeze(0) elif attn_mask.dim() == 3: correct_3d_size = (bsz * num_heads, tgt_len, expected_src_len) if attn_mask.shape != correct_3d_size: raise RuntimeError(f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}.") else: raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported") if key_padding_mask is not None: assert key_padding_mask.shape == (bsz, expected_src_len), \ f"expecting key_padding_mask shape of {(bsz, expected_src_len)}, but got {key_padding_mask.shape}" key_padding_mask = key_padding_mask.view(bsz, 1, 1, expected_src_len). 
\ expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, expected_src_len) if attn_mask is None: attn_mask = key_padding_mask else: attn_mask = attn_mask + key_padding_mask if not self.training: dropout_p = 0.0 else: dropout_p = self.dropout if need_weights: raise NotImplementedError("need_weights not implemented for voicecraft") # B, Nt, E = q.shape # q_scaled = q / math.sqrt(E) # assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights" # if attn_mask is not None: # attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1)) # else: # attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1)) # attn_output_weights = softmax(attn_output_weights, dim=-1) # if dropout_p > 0.0: # attn_output_weights = dropout(attn_output_weights, p=dropout_p) # attn_output = torch.bmm(attn_output_weights, v) # attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim) # attn_output = linear(attn_output, out_proj_weight, out_proj_bias) # attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) # # optionally average attention weights over heads # attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len) # if average_attn_weights: # attn_output_weights = attn_output_weights.mean(dim=1) # if not is_batched: # # squeeze the output if input was unbatched # attn_output = attn_output.squeeze(1) # attn_output_weights = attn_output_weights.squeeze(0) # return attn_output, attn_output_weights else: # attn_mask can be either (L,S) or (N*num_heads, L, S) # if attn_mask's shape is (1, L, S) we need to unsqueeze to (1, 1, L, S) # in order to match the input for SDPA of (N, num_heads, L, S) if attn_mask is not None: if attn_mask.size(0) == 1 and attn_mask.dim() == 3: attn_mask = attn_mask.unsqueeze(0) else: attn_mask = attn_mask.view(bsz, num_heads, -1, expected_src_len) q = q.view(bsz, num_heads, tgt_len, head_dim) k = k.view(bsz, num_heads, src_len, head_dim) v = v.view(bsz, 
num_heads, src_len, head_dim) # logging.info(f"shape of past: {past.shape}") if past is not None: present = torch.stack([k, v], dim=0) # (2, bsz, num_heads, src_len, head_dim) if past.ndim > 2: # this means we use kvcache, otherwise we just pass in a placeholder, but not actually using kvcache pk, pv = past k = torch.cat([pk, k], dim=-2) v = torch.cat([pv, v], dim=-2) else: present = None attn_output = F.scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal=False) attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim) attn_output = F.linear(attn_output, self.out_proj.weight, self.out_proj.bias) attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) if not is_batched: # squeeze the output if input was unbatched attn_output = attn_output.squeeze(1) # if self.training: # return attn_output, None # else: # return (attn_output, present), None # harded coded, the code do not support returning attn weigths yet attn_output_weights=None if self.batch_first and is_batched: return attn_output.transpose(1, 0), present else: return attn_output, present ================================================ FILE: models/modules/embedding.py ================================================ # cp from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py # Copyright 2023 (authors: Feiteng Li) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import math import torch import torch.nn as nn class TokenEmbedding(nn.Module): def __init__( self, dim_model: int, vocab_size: int, dropout: float = 0.0, ): super().__init__() self.vocab_size = vocab_size self.dim_model = dim_model self.dropout = torch.nn.Dropout(p=dropout) self.word_embeddings = nn.Embedding(self.vocab_size, self.dim_model) @property def weight(self) -> torch.Tensor: return self.word_embeddings.weight def embedding(self, index: int) -> torch.Tensor: return self.word_embeddings.weight[index : index + 1] def forward(self, x: torch.Tensor): X = self.word_embeddings(x) X = self.dropout(X) return X class SinePositionalEmbedding(nn.Module): def __init__( self, dim_model: int, dropout: float = 0.0, scale: bool = False, alpha: bool = False, ): super().__init__() self.dim_model = dim_model self.x_scale = math.sqrt(dim_model) if scale else 1.0 self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) self.dropout = torch.nn.Dropout(p=dropout) self.reverse = False self.pe = None self.extend_pe(torch.tensor(0.0).expand(1, 4000)) def extend_pe(self, x): """Reset the positional encodings.""" if self.pe is not None: if self.pe.size(1) >= x.size(1): if self.pe.dtype != x.dtype or self.pe.device != x.device: self.pe = self.pe.to(dtype=x.dtype, device=x.device) return pe = torch.zeros(x.size(1), self.dim_model) if self.reverse: position = torch.arange( x.size(1) - 1, -1, -1.0, dtype=torch.float32 ).unsqueeze(1) else: position = torch.arange( 0, x.size(1), dtype=torch.float32 ).unsqueeze(1) div_term = torch.exp( torch.arange(0, self.dim_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.dim_model) ) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) pe = pe.unsqueeze(0) self.pe = pe.to(device=x.device, dtype=x.dtype).detach() def forward(self, x: torch.Tensor) -> torch.Tensor: self.extend_pe(x) output = x.unsqueeze(-1) if x.ndim == 2 else x output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] 
return self.dropout(output) ================================================ FILE: models/modules/sampling.py ================================================ import torch import torch.nn.functional as F def top_k_top_p_filtering( logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1 ): """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: logits: logits distribution shape (batch size, vocabulary size) if top_k > 0: keep only top k tokens with highest probability (top-k filtering). if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) Make sure we keep at least min_tokens_to_keep per batch example in the output From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 """ if top_k > 0: top_k = min( max(top_k, min_tokens_to_keep), logits.size(-1) ) # Safety check # Remove all tokens with a probability less than the last token of the top-k indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] logits[indices_to_remove] = filter_value if top_p < 1.0: sorted_logits, sorted_indices = torch.sort(logits, descending=True) cumulative_probs = torch.cumsum( F.softmax(sorted_logits, dim=-1), dim=-1 ) # Remove tokens with cumulative probability above the threshold (token with 0 are kept) sorted_indices_to_remove = cumulative_probs > top_p if min_tokens_to_keep > 1: # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 # Shift the indices to the right to keep also the first token above the threshold sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ ..., :-1 ].clone() sorted_indices_to_remove[..., 0] = 0 # scatter sorted tensors to original indexing indices_to_remove = sorted_indices_to_remove.scatter( 1, sorted_indices, sorted_indices_to_remove ) 
logits[indices_to_remove] = filter_value return logits def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0): # temperature: (`optional`) float # The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. # top_k: (`optional`) int # The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. # top_p: (`optional`) float # The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. # Temperature (higher temperature => more likely to sample low probability tokens) if temperature != 1.0: logits = logits / temperature # Top-p/top-k filtering logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p) # Sample token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1) return token ================================================ FILE: models/modules/scaling.py ================================================ # cp from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/scaling.py # Copyright 2022 Xiaomi Corp. (authors: Daniel Povey) # # See ../../../../LICENSE for clarification regarding multiple authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import collections import logging import random import math from functools import reduce from itertools import repeat from typing import Optional, Tuple, Union import torch import torch.nn as nn import torch.nn.functional as F from torch import Tensor from torch.nn import Embedding as ScaledEmbedding # from valle.utils import Transpose class Transpose(nn.Identity): """(N, T, D) -> (N, D, T)""" def forward(self, input: torch.Tensor) -> torch.Tensor: return input.transpose(1, 2) class ActivationBalancerFunction(torch.autograd.Function): @staticmethod def forward( ctx, x: Tensor, scale_factor: Tensor, sign_factor: Optional[Tensor], channel_dim: int, ) -> Tensor: if channel_dim < 0: channel_dim += x.ndim ctx.channel_dim = channel_dim xgt0 = x > 0 if sign_factor is None: ctx.save_for_backward(xgt0, scale_factor) else: ctx.save_for_backward(xgt0, scale_factor, sign_factor) return x @staticmethod def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]: if len(ctx.saved_tensors) == 3: xgt0, scale_factor, sign_factor = ctx.saved_tensors for _ in range(ctx.channel_dim, x_grad.ndim - 1): scale_factor = scale_factor.unsqueeze(-1) sign_factor = sign_factor.unsqueeze(-1) factor = sign_factor + scale_factor * (xgt0.to(x_grad.dtype) - 0.5) else: xgt0, scale_factor = ctx.saved_tensors for _ in range(ctx.channel_dim, x_grad.ndim - 1): scale_factor = scale_factor.unsqueeze(-1) factor = scale_factor * (xgt0.to(x_grad.dtype) - 0.5) neg_delta_grad = x_grad.abs() * factor return ( x_grad - neg_delta_grad, None, None, None, ) def _compute_scale_factor( x: Tensor, channel_dim: int, min_abs: float, max_abs: float, gain_factor: float, max_factor: float, ) -> Tensor: if channel_dim < 0: channel_dim += x.ndim sum_dims = [d for d in range(x.ndim) if d != channel_dim] x_abs_mean = torch.mean(x.abs(), dim=sum_dims).to(torch.float32) if min_abs == 0.0: below_threshold = 0.0 else: # below_threshold is 0 if x_abs_mean > min_abs, can be at most max_factor if # x_abs)_mean , min_abs. 
below_threshold = ( (min_abs - x_abs_mean) * (gain_factor / min_abs) ).clamp(min=0, max=max_factor) above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp( min=0, max=max_factor ) return below_threshold - above_threshold def _compute_sign_factor( x: Tensor, channel_dim: int, min_positive: float, max_positive: float, gain_factor: float, max_factor: float, ) -> Tensor: if channel_dim < 0: channel_dim += x.ndim sum_dims = [d for d in range(x.ndim) if d != channel_dim] proportion_positive = torch.mean((x > 0).to(torch.float32), dim=sum_dims) if min_positive == 0.0: factor1 = 0.0 else: # 0 if proportion_positive >= min_positive, else can be # as large as max_factor. factor1 = ( (min_positive - proportion_positive) * (gain_factor / min_positive) ).clamp_(min=0, max=max_factor) if max_positive == 1.0: factor2 = 0.0 else: # 0 if self.proportion_positive <= max_positive, else can be # as large as -max_factor. factor2 = ( (proportion_positive - max_positive) * (gain_factor / (1.0 - max_positive)) ).clamp_(min=0, max=max_factor) sign_factor = factor1 - factor2 # require min_positive != 0 or max_positive != 1: assert not isinstance(sign_factor, float) return sign_factor class ActivationScaleBalancerFunction(torch.autograd.Function): """ This object is used in class ActivationBalancer when the user specified min_positive=0, max_positive=1, so there are no constraints on the signs of the activations and only the absolute value has a constraint. 
""" @staticmethod def forward( ctx, x: Tensor, sign_factor: Tensor, scale_factor: Tensor, channel_dim: int, ) -> Tensor: if channel_dim < 0: channel_dim += x.ndim ctx.channel_dim = channel_dim xgt0 = x > 0 ctx.save_for_backward(xgt0, sign_factor, scale_factor) return x @staticmethod def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]: xgt0, sign_factor, scale_factor = ctx.saved_tensors for _ in range(ctx.channel_dim, x_grad.ndim - 1): sign_factor = sign_factor.unsqueeze(-1) scale_factor = scale_factor.unsqueeze(-1) factor = sign_factor + scale_factor * (xgt0.to(x_grad.dtype) - 0.5) neg_delta_grad = x_grad.abs() * factor return ( x_grad - neg_delta_grad, None, None, None, ) class RandomClampFunction(torch.autograd.Function): @staticmethod def forward( ctx, x: Tensor, min: Optional[float], max: Optional[float], prob: float, reflect: float, ) -> Tensor: x_clamped = torch.clamp(x, min=min, max=max) mask = torch.rand_like(x) < prob ans = torch.where(mask, x_clamped, x) if x.requires_grad: ctx.save_for_backward(ans == x) ctx.reflect = reflect if reflect != 0.0: ans = ans * (1.0 + reflect) - (x * reflect) return ans @staticmethod def backward( ctx, ans_grad: Tensor ) -> Tuple[Tensor, None, None, None, None]: (is_same,) = ctx.saved_tensors x_grad = ans_grad * is_same.to(ans_grad.dtype) reflect = ctx.reflect if reflect != 0.0: x_grad = x_grad * (1.0 + reflect) - (ans_grad * reflect) return x_grad, None, None, None, None def random_clamp( x: Tensor, min: Optional[float] = None, max: Optional[float] = None, prob: float = 0.5, reflect: float = 0.0, ): return RandomClampFunction.apply(x, min, max, prob, reflect) def random_cast_to_half(x: Tensor, min_abs: float = 5.0e-06) -> Tensor: """ A randomized way of casting a floating point value to half precision. 
""" if x.dtype == torch.float16: return x x_abs = x.abs() is_too_small = x_abs < min_abs # for elements where is_too_small is true, random_val will contain +-min_abs with # probability (x.abs() / min_abs), and 0.0 otherwise. [so this preserves expectations, # for those elements]. random_val = min_abs * x.sign() * (torch.rand_like(x) * min_abs < x_abs) return torch.where(is_too_small, random_val, x).to(torch.float16) class RandomGradFunction(torch.autograd.Function): """ Does nothing in forward pass; in backward pass, gets rid of very small grads using randomized approach that preserves expectations (intended to reduce roundoff). """ @staticmethod def forward(ctx, x: Tensor, min_abs: float) -> Tensor: ctx.min_abs = min_abs return x @staticmethod def backward(ctx, ans_grad: Tensor) -> Tuple[Tensor, None]: if ans_grad.dtype == torch.float16: return ( random_cast_to_half( ans_grad.to(torch.float32), min_abs=ctx.min_abs ), None, ) else: return ans_grad, None class RandomGrad(torch.nn.Module): """ Gets rid of very small gradients using an expectation-preserving method, intended to increase accuracy of training when using amp (automatic mixed precision) """ def __init__(self, min_abs: float = 5.0e-06): super(RandomGrad, self).__init__() self.min_abs = min_abs def forward(self, x: Tensor): if ( torch.jit.is_scripting() or not self.training or torch.jit.is_tracing() ): return x else: return RandomGradFunction.apply(x, self.min_abs) class SoftmaxFunction(torch.autograd.Function): """ Tries to handle half-precision derivatives in a randomized way that should be more accurate for training than the default behavior. """ @staticmethod def forward(ctx, x: Tensor, dim: int): ans = x.softmax(dim=dim) # if x dtype is float16, x.softmax() returns a float32 because # (presumably) that op does not support float16, and autocast # is enabled. 
if torch.is_autocast_enabled(): ans = ans.to(torch.float16) ctx.save_for_backward(ans) ctx.x_dtype = x.dtype ctx.dim = dim return ans @staticmethod def backward(ctx, ans_grad: Tensor): (ans,) = ctx.saved_tensors with torch.cuda.amp.autocast(enabled=False): ans_grad = ans_grad.to(torch.float32) ans = ans.to(torch.float32) x_grad = ans_grad * ans x_grad = x_grad - ans * x_grad.sum(dim=ctx.dim, keepdim=True) return x_grad, None def softmax(x: Tensor, dim: int): if torch.jit.is_scripting() or torch.jit.is_tracing(): return x.softmax(dim) return SoftmaxFunction.apply(x, dim) class MaxEigLimiterFunction(torch.autograd.Function): @staticmethod def forward( ctx, x: Tensor, coeffs: Tensor, direction: Tensor, channel_dim: int, grad_scale: float, ) -> Tensor: ctx.channel_dim = channel_dim ctx.grad_scale = grad_scale ctx.save_for_backward(x.detach(), coeffs.detach(), direction.detach()) return x @staticmethod def backward(ctx, x_grad, *args): with torch.enable_grad(): (x_orig, coeffs, new_direction) = ctx.saved_tensors x_orig.requires_grad = True num_channels = x_orig.shape[ctx.channel_dim] x = x_orig.transpose(ctx.channel_dim, -1).reshape(-1, num_channels) new_direction.requires_grad = False x = x - x.mean(dim=0) x_var = (x ** 2).mean() x_residual = x - coeffs * new_direction x_residual_var = (x_residual ** 2).mean() # `variance_proportion` is the proportion of the variance accounted for # by the top eigen-direction. This is to be minimized. variance_proportion = (x_var - x_residual_var) / (x_var + 1.0e-20) variance_proportion.backward() x_orig_grad = x_orig.grad x_extra_grad = ( x_orig.grad * ctx.grad_scale * x_grad.norm() / (x_orig_grad.norm() + 1.0e-20) ) return x_grad + x_extra_grad.detach(), None, None, None, None class BasicNorm(torch.nn.Module): """ This is intended to be a simpler, and hopefully cheaper, replacement for LayerNorm. 
The observation this is based on, is that Transformer-type networks, especially with pre-norm, sometimes seem to set one of the feature dimensions to a large constant value (e.g. 50), which "defeats" the LayerNorm because the output magnitude is then not strongly dependent on the other (useful) features. Presumably the weight and bias of the LayerNorm are required to allow it to do this. So the idea is to introduce this large constant value as an explicit parameter, that takes the role of the "eps" in LayerNorm, so the network doesn't have to do this trick. We make the "eps" learnable. Args: num_channels: the number of channels, e.g. 512. channel_dim: the axis/dimension corresponding to the channel, interprted as an offset from the input's ndim if negative. shis is NOT the num_channels; it should typically be one of {-2, -1, 0, 1, 2, 3}. eps: the initial "epsilon" that we add as ballast in: scale = ((input_vec**2).mean() + epsilon)**-0.5 Note: our epsilon is actually large, but we keep the name to indicate the connection with conventional LayerNorm. learn_eps: if true, we learn epsilon; if false, we keep it at the initial value. eps_min: float eps_max: float """ def __init__( self, num_channels: int, channel_dim: int = -1, # CAUTION: see documentation. 
eps: float = 0.25, learn_eps: bool = True, eps_min: float = -3.0, eps_max: float = 3.0, ) -> None: super(BasicNorm, self).__init__() self.num_channels = num_channels self.channel_dim = channel_dim if learn_eps: self.eps = nn.Parameter(torch.tensor(eps).log().detach()) else: self.register_buffer("eps", torch.tensor(eps).log().detach()) self.eps_min = eps_min self.eps_max = eps_max def forward(self, x: Tensor) -> Tensor: assert x.shape[self.channel_dim] == self.num_channels eps = self.eps if self.training and random.random() < 0.25: # with probability 0.25, in training mode, clamp eps between the min # and max; this will encourage it to learn parameters within the # allowed range by making parameters that are outside the allowed # range noisy. # gradients to allow the parameter to get back into the allowed region if it happens to exit it. eps = eps.clamp(min=self.eps_min, max=self.eps_max) scales = ( torch.mean(x ** 2, dim=self.channel_dim, keepdim=True) + eps.exp() ) ** -0.5 return x * scales def ScaledLinear(*args, initial_scale: float = 1.0, **kwargs) -> nn.Linear: """ Behaves like a constructor of a modified version of nn.Linear that gives an easy way to set the default initial parameter scale. Args: Accepts the standard args and kwargs that nn.Linear accepts e.g. in_features, out_features, bias=False. initial_scale: you can override this if you want to increase or decrease the initial magnitude of the module's output (affects the initialization of weight_scale and bias_scale). Another option, if you want to do something like this, is to re-initialize the parameters. 
""" ans = nn.Linear(*args, **kwargs) with torch.no_grad(): ans.weight[:] *= initial_scale if ans.bias is not None: torch.nn.init.uniform_( ans.bias, -0.1 * initial_scale, 0.1 * initial_scale ) return ans def ScaledConv1d( *args, initial_scale: float = 1.0, kernel_size: int = 3, padding: str = "same", **kwargs, ) -> nn.Conv1d: """ Behaves like a constructor of a modified version of nn.Conv1d that gives an easy way to set the default initial parameter scale. Args: Accepts the standard args and kwargs that nn.Linear accepts e.g. in_features, out_features, bias=False. initial_scale: you can override this if you want to increase or decrease the initial magnitude of the module's output (affects the initialization of weight_scale and bias_scale). Another option, if you want to do something like this, is to re-initialize the parameters. """ ans = nn.Conv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs) with torch.no_grad(): ans.weight[:] *= initial_scale if ans.bias is not None: torch.nn.init.uniform_( ans.bias, -0.1 * initial_scale, 0.1 * initial_scale ) return ans def TransposeScaledConv1d( *args, initial_scale: float = 1.0, kernel_size: int = 3, padding: str = "same", **kwargs, ) -> nn.Sequential: """ Transpose -> ScaledConv1d """ return nn.Sequential( Transpose(), ScaledConv1d( *args, initial_scale=initial_scale, kernel_size=kernel_size, padding=padding, **kwargs, ), ) def ScaledConv1dTranspose( *args, initial_scale: float = 1.0, kernel_size: int = 3, padding: str = "same", **kwargs, ) -> nn.Sequential: """ Transpose -> ScaledConv1d """ return nn.Sequential( ScaledConv1d( *args, initial_scale=initial_scale, kernel_size=kernel_size, padding=padding, **kwargs, ), Transpose(), ) def TransposeConv1d( *args, kernel_size: int = 3, padding: str = "same", **kwargs ) -> nn.Sequential: """ Transpose -> Conv1d """ return nn.Sequential( Transpose(), nn.Conv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs), ) def Conv1dTranspose( *args, kernel_size: int = 3, 
padding: str = "same", **kwargs ) -> nn.Sequential: """ ScaledConv1d -> Transpose """ return nn.Sequential( nn.Conv1d(*args, kernel_size=kernel_size, padding=padding, **kwargs), Transpose(), ) class SRLinear(nn.Linear): """https://arxiv.org/abs/2303.06296 Stabilizing Transformer Training by Preventing Attention Entropy Collapse """ def __init__(self, in_features, out_features, bias=True, **kwargs): super().__init__(in_features, out_features, bias=bias, **kwargs) self.register_buffer( "u", nn.functional.normalize(torch.randn(in_features), dim=0) ) with torch.no_grad(): sigma = self.get_sigma() self.register_buffer("spectral_norm", sigma) self.sigma = nn.Parameter(torch.ones(1)) def get_sigma(self): with torch.no_grad(): u = self.u v = self.weight.mv(u) v = nn.functional.normalize(v, dim=0) u = self.weight.T.mv(v) u = nn.functional.normalize(u, dim=0) self.u.data.copy_(u) return torch.einsum("c,cd,d->", v, self.weight, u) def get_weight(self): sigma = self.get_sigma() if self.training: self.spectral_norm.data.copy_(sigma) weight = (self.sigma / sigma) * self.weight return weight def forward(self, x): return nn.functional.linear(x, self.get_weight(), self.bias) class SRConv1d(SRLinear): def __init__( self, in_features, out_features, kernel_size, stride: int = 1, padding: str = "same", bias: bool = True, **kwargs, ): in_features = in_features * kernel_size super().__init__(in_features, out_features, bias=bias, **kwargs) nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) self.kernel_size = kernel_size self.stride = stride self.padding = padding def forward(self, x): in_features = self.in_features // self.kernel_size weight = self.get_weight().view( self.out_features, in_features, self.kernel_size ) return nn.functional.conv1d( x, weight, bias=self.bias, stride=self.stride, padding=self.padding ) def TransposeSRConv1d( *args, kernel_size: int = 3, padding: str = "same", **kwargs ) -> nn.Sequential: """ Transpose -> SRConv1d """ return nn.Sequential( Transpose(), 
class ActivationBalancer(torch.nn.Module):
    """
    Modifies the backpropped derivatives of a function to try to encourage, for
    each channel, that it is positive at least a proportion `threshold` of the
    time.  It does this by multiplying negative derivative values by up to
    (1+max_factor), and positive derivative values by up to (1-max_factor),
    interpolated from 1 at the threshold to those extremal values when none
    of the inputs are positive.

    Args:
           num_channels: the number of channels
           channel_dim: the dimension/axis corresponding to the channel, e.g.
               -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative.
           min_positive: the minimum, per channel, of the proportion of the time
               that (x > 0), below which we start to modify the derivatives.
           max_positive: the maximum, per channel, of the proportion of the time
               that (x > 0), above which we start to modify the derivatives.
           max_factor: the maximum factor by which we modify the derivatives for
              either the sign constraint or the magnitude constraint;
              e.g. with max_factor=0.02, the derivatives would be multiplied by
              values in the range [0.98..1.02].
           sign_gain_factor: determines the 'gain' with which we increase the
              change in gradient once the constraints on min_positive and max_positive
              are violated.
           scale_gain_factor: determines the 'gain' with which we increase the
              change in gradient once the constraints on min_abs and max_abs
              are violated.
           min_abs:  the minimum average-absolute-value difference from the mean
               value per channel, which we allow, before we start to modify
               the derivatives to prevent this.
           max_abs:  the maximum average-absolute-value difference from the mean
               value per channel, which we allow, before we start to modify
               the derivatives to prevent this.
          min_prob: determines the minimum probability with which we modify the
             gradients for the {min,max}_positive and {min,max}_abs constraints,
             on each forward().  This is done randomly to prevent all layers
             from doing it at the same time.  Early in training we may use
             higher probabilities than this; it will decay to this value.
    """

    def __init__(
        self,
        num_channels: int,
        channel_dim: int,
        min_positive: float = 0.05,
        max_positive: float = 0.95,
        max_factor: float = 0.04,
        sign_gain_factor: float = 0.01,
        scale_gain_factor: float = 0.02,
        min_abs: float = 0.2,
        max_abs: float = 100.0,
        min_prob: float = 0.1,
    ):
        super(ActivationBalancer, self).__init__()
        self.num_channels = num_channels
        self.channel_dim = channel_dim
        self.min_positive = min_positive
        self.max_positive = max_positive
        self.max_factor = max_factor
        self.min_abs = min_abs
        self.max_abs = max_abs
        self.min_prob = min_prob
        self.sign_gain_factor = sign_gain_factor
        self.scale_gain_factor = scale_gain_factor

        # count measures how many times the forward() function has been called.
        # We occasionally sync this to a tensor called `count`, that exists to
        # make sure it is synced to disk when we load and save the model.
        self.cpu_count = 0
        self.register_buffer("count", torch.tensor(0, dtype=torch.int64))

    def forward(self, x: Tensor) -> Tensor:
        """
        Return `x` unchanged in value; with some probability route it through
        ActivationBalancerFunction so that its gradient is adjusted in backprop.

        Inputs that do not require grad, and scripting/tracing runs, always take
        the `_no_op` path.
        """
        if (
            torch.jit.is_scripting()
            or not x.requires_grad
            or torch.jit.is_tracing()
        ):
            return _no_op(x)

        # `count` is the number of calls made before this one.
        count = self.cpu_count
        self.cpu_count += 1

        if random.random() < 0.01:
            # Occasionally sync self.cpu_count with self.count.
            # count affects the decay of 'prob'.  don't do this on every iter,
            # because syncing with the GPU is slow.
            self.cpu_count = max(self.cpu_count, self.count.item())
            self.count.fill_(self.cpu_count)

        # the prob of doing some work exponentially decreases from 0.5 till it hits
        # a floor at min_prob (==0.1, by default)
        prob = max(self.min_prob, 0.5 ** (1 + (count / 4000.0)))

        if random.random() < prob:
            # Gains are divided by prob so that the expected modification does
            # not shrink as the balancer is applied less often.
            if self.min_positive != 0.0 or self.max_positive != 1.0:
                sign_factor = _compute_sign_factor(
                    x,
                    self.channel_dim,
                    self.min_positive,
                    self.max_positive,
                    gain_factor=self.sign_gain_factor / prob,
                    max_factor=self.max_factor,
                )
            else:
                # The sign constraint is disabled by these limits.
                sign_factor = None

            scale_factor = _compute_scale_factor(
                x.detach(),
                self.channel_dim,
                min_abs=self.min_abs,
                max_abs=self.max_abs,
                gain_factor=self.scale_gain_factor / prob,
                max_factor=self.max_factor,
            )
            return ActivationBalancerFunction.apply(
                x,
                scale_factor,
                sign_factor,
                self.channel_dim,
            )
        else:
            return _no_op(x)
aux_loss = penalty * ((x_sign * over_limit).to(torch.int8) * x) # note: we don't do sum() here on aux)_loss, but it's as if we had done # sum() due to how with_loss() works. x = with_loss(x, aux_loss) # you must use x for something, or this will be ineffective. return x def _diag(x: Tensor): # like .diag(), but works for tensors with 3 dims. if x.ndim == 2: return x.diag() else: (batch, dim, dim) = x.shape x = x.reshape(batch, dim * dim) x = x[:, :: dim + 1] assert x.shape == (batch, dim) return x def _whitening_metric(x: Tensor, num_groups: int): """ Computes the "whitening metric", a value which will be 1.0 if all the eigenvalues of of the centered feature covariance are the same within each group's covariance matrix and also between groups. Args: x: a Tensor of shape (*, num_channels) num_groups: the number of groups of channels, a number >=1 that divides num_channels Returns: Returns a scalar Tensor that will be 1.0 if the data is "perfectly white" and greater than 1.0 otherwise. """ assert x.dtype != torch.float16 x = x.reshape(-1, x.shape[-1]) (num_frames, num_channels) = x.shape assert num_channels % num_groups == 0 channels_per_group = num_channels // num_groups x = x.reshape(num_frames, num_groups, channels_per_group).transpose(0, 1) # x now has shape (num_groups, num_frames, channels_per_group) # subtract the mean so we use the centered, not uncentered, covariance. # My experience has been that when we "mess with the gradients" like this, # it's better not do anything that tries to move the mean around, because # that can easily cause instability. x = x - x.mean(dim=1, keepdim=True) # x_covar: (num_groups, channels_per_group, channels_per_group) x_covar = torch.matmul(x.transpose(1, 2), x) x_covar_mean_diag = _diag(x_covar).mean() # the following expression is what we'd get if we took the matrix product # of each covariance and measured the mean of its trace, i.e. # the same as _diag(torch.matmul(x_covar, x_covar)).mean(). 
x_covarsq_mean_diag = (x_covar ** 2).sum() / ( num_groups * channels_per_group ) # this metric will be >= 1.0; the larger it is, the less 'white' the data was. metric = x_covarsq_mean_diag / (x_covar_mean_diag ** 2 + 1.0e-20) return metric class WhiteningPenaltyFunction(torch.autograd.Function): @staticmethod def forward( ctx, x: Tensor, num_groups: int, whitening_limit: float, grad_scale: float, ) -> Tensor: ctx.save_for_backward(x) ctx.num_groups = num_groups ctx.whitening_limit = whitening_limit ctx.grad_scale = grad_scale return x @staticmethod def backward(ctx, x_grad: Tensor): (x_orig,) = ctx.saved_tensors with torch.enable_grad(): with torch.cuda.amp.autocast(enabled=False): x_detached = x_orig.to(torch.float32).detach() x_detached.requires_grad = True metric = _whitening_metric(x_detached, ctx.num_groups) if random.random() < 0.005 or __name__ == "__main__": logging.info( f"Whitening: num_groups={ctx.num_groups}, num_channels={x_orig.shape[-1]}, " f"metric={metric.item():.2f} vs. limit={ctx.whitening_limit}" ) (metric - ctx.whitening_limit).relu().backward() penalty_grad = x_detached.grad scale = ctx.grad_scale * ( x_grad.to(torch.float32).norm() / (penalty_grad.norm() + 1.0e-20) ) penalty_grad = penalty_grad * scale return x_grad + penalty_grad.to(x_grad.dtype), None, None, None class Whiten(nn.Module): def __init__( self, num_groups: int, whitening_limit: float, prob: Union[float, Tuple[float, float]], grad_scale: float, ): """ Args: num_groups: the number of groups to divide the channel dim into before whitening. We will attempt to make the feature covariance within each group, after mean subtraction, as "white" as possible, while having the same trace across all groups. whitening_limit: a value greater than 1.0, that dictates how much freedom we have to violate the constraints. 1.0 would mean perfectly white, with exactly the same trace across groups; larger values give more freedom. E.g. 2.0. 
prob: the probability with which we apply the gradient modification (also affects the grad scale). May be supplied as a float, or as a pair (min_prob, max_prob) grad_scale: determines the scale on the gradient term from this object, relative to the rest of the gradient on the attention weights. E.g. 0.02 (you may want to use smaller values than this if prob is large) """ super(Whiten, self).__init__() assert num_groups >= 1 assert whitening_limit >= 1 assert grad_scale >= 0 self.num_groups = num_groups self.whitening_limit = whitening_limit if isinstance(prob, float): assert 0 < prob <= 1 self.prob = prob else: (self.min_prob, self.max_prob) = prob assert 0 < self.min_prob < self.max_prob <= 1 self.prob = self.max_prob self.grad_scale = grad_scale def forward(self, x: Tensor) -> Tensor: """ In the forward pass, this function just returns the input unmodified. In the backward pass, it will modify the gradients to ensure that the distribution in each group has close to (lambda times I) as the covariance after mean subtraction, with the same lambda across groups. For whitening_limit > 1, there will be more freedom to violate this constraint. Args: x: the input of shape (*, num_channels) Returns: x, unmodified. You should make sure you use the returned value, or the graph will be freed and nothing will happen in backprop. """ if ( not x.requires_grad or random.random() > self.prob or self.grad_scale == 0 ): return _no_op(x) else: if hasattr(self, "min_prob") and random.random() < 0.25: # occasionally switch between min_prob and max_prob, based on whether # we are above or below the threshold. if ( _whitening_metric(x.to(torch.float32), self.num_groups) > self.whitening_limit ): # there would be a change to the grad. 
self.prob = self.max_prob else: self.prob = self.min_prob return WhiteningPenaltyFunction.apply( x, self.num_groups, self.whitening_limit, self.grad_scale ) class WithLoss(torch.autograd.Function): @staticmethod def forward(ctx, x: Tensor, y: Tensor): ctx.y_shape = y.shape return x @staticmethod def backward(ctx, ans_grad: Tensor): return ans_grad, torch.ones( ctx.y_shape, dtype=ans_grad.dtype, device=ans_grad.device ) def with_loss(x, y): if torch.jit.is_scripting() or torch.jit.is_tracing(): return x # returns x but adds y.sum() to the loss function. return WithLoss.apply(x, y) def _no_op(x: Tensor) -> Tensor: if torch.jit.is_scripting() or torch.jit.is_tracing(): return x else: # a no-op function that will have a node in the autograd graph, # to avoid certain bugs relating to backward hooks return x.chunk(1, dim=-1)[0] class Identity(torch.nn.Module): def __init__(self): super(Identity, self).__init__() def forward(self, x): return _no_op(x) class MaxEig(torch.nn.Module): """ Modifies the backpropped derivatives of a function to try to discourage that any given direction in activation space accounts for more than a specified proportion of the covariance (e.g. 0.2). Args: num_channels: the number of channels channel_dim: the dimension/axis corresponding to the channel, e.g. -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative. max_var_per_eig: the maximum proportion of the variance of the features/channels, after mean subtraction, that can come from any given eigenvalue. min_prob: the minimum probability with which we apply this during any invocation of forward(), assuming last time we applied the constraint it was not active; supplied for speed. 
scale: determines the scale with which we modify the gradients, relative to the existing / unmodified gradients """ def __init__( self, num_channels: int, channel_dim: int, max_var_per_eig: float = 0.2, min_prob: float = 0.01, scale: float = 0.01, ): super(MaxEig, self).__init__() self.num_channels = num_channels self.channel_dim = channel_dim self.scale = scale assert max_var_per_eig == 0.0 or max_var_per_eig > 1.0 / num_channels self.max_var_per_eig = max_var_per_eig # we figure out the dominant direction using the power method: starting with # a random vector, keep multiplying by the covariance and renormalizing. with torch.no_grad(): # arbitrary.. would use randn() but want to leave the rest of the model's # random parameters unchanged for comparison direction = torch.arange(num_channels).to(torch.float) direction = direction / direction.norm() self.register_buffer("max_eig_direction", direction) self.min_prob = min_prob # cur_prob is the current probability we'll use to apply the ActivationBalancer. # We'll regress this towards prob, each tiem we try to apply it and it is not # active. self.cur_prob = 1.0 def forward(self, x: Tensor) -> Tensor: if ( torch.jit.is_scripting() or self.max_var_per_eig <= 0 or random.random() > self.cur_prob or torch.jit.is_tracing() ): return _no_op(x) with torch.cuda.amp.autocast(enabled=False): eps = 1.0e-20 orig_x = x x = x.to(torch.float32) with torch.no_grad(): x = x.transpose(self.channel_dim, -1).reshape( -1, self.num_channels ) x = x - x.mean(dim=0) new_direction, coeffs = self._find_direction_coeffs( x, self.max_eig_direction ) x_var = (x ** 2).mean() x_residual = x - coeffs * new_direction x_residual_var = (x_residual ** 2).mean() # `variance_proportion` is the proportion of the variance accounted for # by the top eigen-direction. variance_proportion = (x_var - x_residual_var) / ( x_var + 1.0e-20 ) # ensure new direction is nonzero even if x == 0, by including `direction`. 
self._set_direction( 0.1 * self.max_eig_direction + new_direction ) if random.random() < 0.01 or __name__ == "__main__": logging.info( f"variance_proportion = {variance_proportion.item()}, shape={tuple(orig_x.shape)}, cur_prob={self.cur_prob}" ) if variance_proportion >= self.max_var_per_eig: # The constraint is active. Note, we should quite rarely # reach here, only near the beginning of training if we are # starting to diverge, should this constraint be active. cur_prob = self.cur_prob self.cur_prob = ( 1.0 # next time, do the update with probability 1.0. ) return MaxEigLimiterFunction.apply( orig_x, coeffs, new_direction, self.channel_dim, self.scale ) else: # let self.cur_prob exponentially approach self.min_prob, as # long as the constraint is inactive. self.cur_prob = 0.75 * self.cur_prob + 0.25 * self.min_prob return orig_x def _set_direction(self, direction: Tensor): """ Sets self.max_eig_direction to a normalized version of `direction` """ direction = direction.detach() direction = direction / direction.norm() direction_sum = direction.sum().item() if direction_sum - direction_sum == 0: # no inf/nan self.max_eig_direction[:] = direction else: logging.info( f"Warning: sum of direction in MaxEig is {direction_sum}, " "num_channels={self.num_channels}, channel_dim={self.channel_dim}" ) def _find_direction_coeffs( self, x: Tensor, prev_direction: Tensor ) -> Tuple[Tensor, Tensor, Tensor]: """ Figure out (an approximation to) the proportion of the variance of a set of feature vectors that can be attributed to the top eigen-direction. Args: x: a Tensor of shape (num_frames, num_channels), with num_frames > 1. prev_direction: a Tensor of shape (num_channels,), that is our previous estimate of the top eigen-direction, or a random direction if this is the first iteration. Does not have to be normalized, but should be nonzero. 
Returns: (cur_direction, coeffs), where: cur_direction: a Tensor of shape (num_channels,) that is the current estimate of the top eigen-direction. coeffs: a Tensor of shape (num_frames, 1) that minimizes, or approximately minimizes, (x - coeffs * cur_direction).norm() """ (num_frames, num_channels) = x.shape assert num_channels > 1 and num_frames > 1 assert prev_direction.shape == (num_channels,) # `coeffs` are the coefficients of `prev_direction` in x. # actually represent the coeffs up to a constant positive factor. coeffs = (x * prev_direction).sum(dim=1, keepdim=True) + 1.0e-10 cur_direction = (x * coeffs).sum(dim=0) / ( (coeffs ** 2).sum() + 1.0e-20 ) return cur_direction, coeffs class DoubleSwishFunction(torch.autograd.Function): """ double_swish(x) = x * torch.sigmoid(x-1) This is a definition, originally motivated by its close numerical similarity to swish(swish(x)), where swish(x) = x * sigmoid(x). Memory-efficient derivative computation: double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1) double_swish'(x) = d/dx double_swish(x) = x * s'(x) + x' * s(x) = x * s'(x) + s(x). Now, s'(x) = s(x) * (1-s(x)). double_swish'(x) = x * s'(x) + s(x). = x * s(x) * (1-s(x)) + s(x). = double_swish(x) * (1-s(x)) + s(x) ... so we just need to remember s(x) but not x itself. """ @staticmethod def forward(ctx, x: Tensor) -> Tensor: requires_grad = x.requires_grad x_dtype = x.dtype if x.dtype == torch.float16: x = x.to(torch.float32) s = torch.sigmoid(x - 1.0) y = x * s if requires_grad: deriv = y * (1 - s) + s # notes on derivative of x * sigmoid(x - 1): # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29 # min \simeq -0.043638. Take floor as -0.043637 so it's a lower bund # max \simeq 1.1990. Take ceil to be 1.2 so it's an upper bound. # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which # floors), should be expectation-preserving. 
floor = -0.043637 ceil = 1.2 d_scaled = (deriv - floor) * ( 255.0 / (ceil - floor) ) + torch.rand_like(deriv) if __name__ == "__main__": # for self-testing only. assert d_scaled.min() >= 0.0 assert d_scaled.max() < 256.0 d_int = d_scaled.to(torch.uint8) ctx.save_for_backward(d_int) if x.dtype == torch.float16 or torch.is_autocast_enabled(): y = y.to(torch.float16) return y @staticmethod def backward(ctx, y_grad: Tensor) -> Tensor: (d,) = ctx.saved_tensors # the same constants as used in forward pass. floor = -0.043637 ceil = 1.2 d = d * ((ceil - floor) / 255.0) + floor return y_grad * d class DoubleSwish(torch.nn.Module): def forward(self, x: Tensor) -> Tensor: """Return double-swish activation function which is an approximation to Swish(Swish(x)), that we approximate closely with x * sigmoid(x-1). """ if torch.jit.is_scripting() or torch.jit.is_tracing(): return x * torch.sigmoid(x - 1.0) return DoubleSwishFunction.apply(x) def BalancedDoubleSwish( d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25 ) -> nn.Sequential: """ ActivationBalancer -> DoubleSwish """ balancer = ActivationBalancer( d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob ) return nn.Sequential( balancer, DoubleSwish(), ) def _test_max_eig(): for proportion in [0.1, 0.5, 10.0]: logging.info(f"proportion = {proportion}") x = torch.randn(100, 128) direction = torch.randn(128) coeffs = torch.randn(100, 1) x += proportion * direction * coeffs x.requires_grad = True num_channels = 128 m = MaxEig( num_channels, 1, 0.5, scale=0.1 # channel_dim # max_var_per_eig ) # grad_scale for _ in range(4): y = m(x) y_grad = torch.randn_like(x) y.backward(gradient=y_grad) if proportion < 0.2: assert torch.allclose(x.grad, y_grad, atol=1.0e-02) elif proportion > 1.0: assert not torch.allclose(x.grad, y_grad) def _test_whiten(): for proportion in [0.1, 0.5, 10.0]: logging.info(f"_test_whiten(): proportion = {proportion}") x = torch.randn(100, 128) direction = torch.randn(128) coeffs = 
def _test_activation_balancer_sign():
    """
    Print-only self-test of the ActivationBalancer sign constraint.

    Row i of `x` holds +1 with probability probs[i] and -1 otherwise, so the
    rows sweep the proportion of positive values from 0 to 0.99.  The printed
    `x grad` can then be compared with `y grad`, row by row.
    """
    probs = torch.arange(0, 1, 0.01)
    N = 1000
    x = 1.0 * (
        (2.0 * (torch.rand(probs.numel(), N) < probs.unsqueeze(-1))) - 1.0
    )
    x = x.detach()
    x.requires_grad = True
    m = ActivationBalancer(
        probs.numel(),
        channel_dim=0,
        min_positive=0.05,
        max_positive=0.95,
        max_factor=0.2,
        min_abs=0.0,
        # With the default min_prob the balancer runs with probability 0.5 on
        # its first call, so the test often printed unmodified gradients.
        # Force it to run, as the magnitude test does.
        min_prob=1.0,
    )

    y_grad = torch.sign(torch.randn(probs.numel(), N))

    y = m(x)
    y.backward(gradient=y_grad)
    print("_test_activation_balancer_sign: x = ", x)
    print("_test_activation_balancer_sign: y grad = ", y_grad)
    print("_test_activation_balancer_sign: x grad = ", x.grad)
_shape_t = Union[int, List[int], torch.Size]


class LayerNorm(nn.Module):
    """Layer normalization that also accepts an `(input, embedding)` tuple.

    When called with a tuple, the first element is normalized and the second
    is returned untouched next to it.  A plain tensor input gives a plain
    tensor output.
    """

    __constants__ = ["normalized_shape", "eps", "elementwise_affine"]
    normalized_shape: Tuple[int, ...]
    eps: float
    elementwise_affine: bool

    def __init__(
        self,
        normalized_shape: _shape_t,
        eps: float = 1e-5,
        elementwise_affine: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        super(LayerNorm, self).__init__()
        if isinstance(normalized_shape, numbers.Integral):
            # A single int means "normalize over one trailing dim of this size".
            normalized_shape = (normalized_shape,)  # type: ignore[assignment]
        self.normalized_shape = tuple(normalized_shape)  # type: ignore[arg-type]
        self.eps = eps
        self.elementwise_affine = elementwise_affine

        if not self.elementwise_affine:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)
        else:
            tensor_opts = {"device": device, "dtype": dtype}
            self.weight = nn.Parameter(
                torch.empty(self.normalized_shape, **tensor_opts)
            )
            self.bias = nn.Parameter(
                torch.empty(self.normalized_shape, **tensor_opts)
            )

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Set the affine parameters (if any) to weight = 1, bias = 0."""
        if not self.elementwise_affine:
            return
        nn.init.ones_(self.weight)
        nn.init.zeros_(self.bias)

    def _normalize(self, tensor: Tensor) -> Tensor:
        # Shared by the tuple and plain-tensor paths of forward().
        return F.layer_norm(
            tensor, self.normalized_shape, self.weight, self.bias, self.eps
        )

    def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
        """Normalize `input`; see the class docstring for the tuple form."""
        if isinstance(input, tuple):
            input, embedding = input
            return (self._normalize(input), embedding)

        # A separately supplied embedding is not supported for tensor input.
        assert embedding is None
        return self._normalize(input)

    def extra_repr(self) -> str:
        return (
            f"{self.normalized_shape}, eps={self.eps}, "
            f"elementwise_affine={self.elementwise_affine}"
        )
embedding) weight, bias = torch.split( self.project_layer(embedding), split_size_or_sections=self.d_model, dim=-1, ) return weight * self.norm(input) + bias class BasicNorm(_BasicNorm): def __init__( self, d_model: int, eps: float = 1e-5, device=None, dtype=None, ): super(BasicNorm, self).__init__(d_model, eps=eps) def forward(self, input: Tensor, embedding: Any = None) -> Tensor: if isinstance(input, tuple): input, embedding = input return ( super(BasicNorm, self).forward(input), embedding, ) assert embedding is None return super(BasicNorm, self).forward(input) class BalancedBasicNorm(nn.Module): def __init__( self, d_model: int, eps: float = 1e-5, device=None, dtype=None, ): super(BalancedBasicNorm, self).__init__() self.balancer = ActivationBalancer( d_model, channel_dim=-1, min_positive=0.45, max_positive=0.55, max_abs=6.0, ) self.norm = BasicNorm(d_model, eps, device=device, dtype=dtype) def forward(self, input: Tensor, embedding: Any = None) -> Tensor: if isinstance(input, tuple): input, embedding = input return self.norm((self.balancer(input), embedding)) assert embedding is None return self.norm(self.balancer(input)) class IdentityNorm(nn.Module): def __init__( self, d_model: int, eps: float = 1e-5, device=None, dtype=None, ) -> None: super(IdentityNorm, self).__init__() def forward(self, input: Tensor, embedding: Any = None) -> Tensor: if isinstance(input, tuple): return input assert embedding is None return input class TransformerEncoderLayer(nn.Module): __constants__ = ["batch_first", "norm_first"] def __init__( self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1, activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, batch_first: bool = False, norm_first: bool = False, device=None, dtype=None, linear1_self_attention_cls: nn.Module = nn.Linear, linear2_self_attention_cls: nn.Module = nn.Linear, linear1_feedforward_cls: nn.Module = nn.Linear, linear2_feedforward_cls: nn.Module = nn.Linear, layer_norm_cls: nn.Module = 
LayerNorm, layer_norm_eps: float = 1e-5, adaptive_layer_norm=False, ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super(TransformerEncoderLayer, self).__init__() self.self_attn = MultiheadAttention( d_model, nhead, dropout=dropout, batch_first=batch_first, linear1_cls=linear1_self_attention_cls, linear2_cls=linear2_self_attention_cls, **factory_kwargs, ) # Implementation of Feedforward model self.linear1 = linear1_feedforward_cls( d_model, dim_feedforward, **factory_kwargs ) self.dropout = nn.Dropout(dropout) self.linear2 = linear2_feedforward_cls( dim_feedforward, d_model, **factory_kwargs ) self.norm_first = norm_first self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) # Legacy string support for activation function. if isinstance(activation, str): activation = _get_activation_fn(activation) elif isinstance(activation, partial): activation = activation(d_model) elif activation == BalancedDoubleSwish: activation = BalancedDoubleSwish(d_model) # # We can't test self.activation in forward() in TorchScript, # # so stash some information about it instead. 
# if activation is F.relu or isinstance(activation, torch.nn.ReLU): # self.activation_relu_or_gelu = 1 # elif activation is F.gelu or isinstance(activation, torch.nn.GELU): # self.activation_relu_or_gelu = 2 # else: # self.activation_relu_or_gelu = 0 self.activation = activation norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) if layer_norm_cls == IdentityNorm: norm2 = BalancedBasicNorm( d_model, eps=layer_norm_eps, **factory_kwargs ) else: norm2 = layer_norm_cls( d_model, eps=layer_norm_eps, **factory_kwargs ) if adaptive_layer_norm: self.norm1 = AdaptiveLayerNorm(d_model, norm1) self.norm2 = AdaptiveLayerNorm(d_model, norm2) else: self.norm1 = norm1 self.norm2 = norm2 def __setstate__(self, state): super(TransformerEncoderLayer, self).__setstate__(state) if not hasattr(self, "activation"): self.activation = F.relu def forward( self, src: Tensor, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, need_weights: Optional[bool] = False, past: Optional[Tensor] = None, ) -> Tensor: r"""Pass the input through the encoder layer. Args: src: the sequence to the encoder layer (required). src_mask: the mask for the src sequence (optional). src_key_padding_mask: the mask for the src keys per batch (optional). Shape: see the docs in Transformer class. 
""" x, stage_embedding = src, None is_src_tuple = False if isinstance(src, tuple): x, stage_embedding = src is_src_tuple = True if src_key_padding_mask is not None: _skpm_dtype = src_key_padding_mask.dtype if _skpm_dtype != torch.bool and not torch.is_floating_point( src_key_padding_mask ): raise AssertionError( "only bool and floating types of key_padding_mask are supported" ) if need_weights: if self.norm_first: out, attn = self._sa_block_attn( self.norm1(x, stage_embedding), src_mask, src_key_padding_mask, past ) out, present = out # present is the kvcache of the present timestep x = x + out x = x + self._ff_block(self.norm2(x, stage_embedding)) else: out, attn = self._sa_block_attn(x, src_mask, src_key_padding_mask, past) out, present = out # present is the kvcache of the present timestep x = self.norm1( x + out, stage_embedding, ) x = self.norm2(x + self._ff_block(x), stage_embedding) assert not is_src_tuple # return (x, stage_embedding) return (x, attn) else: if self.norm_first: out = self._sa_block( self.norm1(x, stage_embedding), src_mask, src_key_padding_mask, past ) out, present = out # present is the kvcache of the present timestep x = x + out x = x + self._ff_block(self.norm2(x, stage_embedding)) else: out = self._sa_block(x, src_mask, src_key_padding_mask) out, present = out # present is the kvcache of the present timestep x = self.norm1( x + out, stage_embedding, past ) x = self.norm2(x + self._ff_block(x), stage_embedding) if is_src_tuple: x = (x, stage_embedding) if present != None: x = [x, present] return x # self-attention block def _sa_block( self, x: Tensor, attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor], past: Optional[Tensor] = None, ) -> Tensor: x = self.self_attn( x, x, x, attn_mask=attn_mask, key_padding_mask=key_padding_mask, need_weights=False, past=past ) x, present = x return self.dropout1(x), present # self-attention block, also return attention weights def _sa_block_attn( self, x: Tensor, attn_mask: Optional[Tensor], 
class TransformerEncoder(nn.Module):
    r"""Stack of ``num_layers`` copies of one encoder layer, applied in order.

    Args:
        encoder_layer: layer instance that is cloned ``num_layers`` times with
            ``_get_clones`` (required).
        num_layers: number of clones in the stack (required).
        norm: optional callable applied once to the output of the last layer.

    Examples::
        >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
        >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6)
        >>> src = torch.rand(10, 32, 512)
        >>> out = transformer_encoder(src)
    """

    __constants__ = ["norm"]

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def _apply_norm(self, output):
        """Run the optional final norm; hand ``output`` back untouched when there is none."""
        return output if self.norm is None else self.norm(output)

    def forward(
        self,
        src: Tensor,
        mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        return_layer_states: bool = False,
        need_weights:Optional[bool] = False,
        past: Optional[Tensor] = None,
    ) -> Tensor:
        r"""Feed ``src`` through every layer in turn.

        Args:
            src: input handed to the first layer (required).
            mask: forwarded to each layer as ``src_mask`` (optional).
            src_key_padding_mask: forwarded to each layer unchanged (optional).
            return_layer_states: when true, also collect ``output[0]`` of every
                layer's return value. Must not be combined with ``need_weights``.
            need_weights: when true, call each layer with ``need_weights=True``
                and collect ``output[1]`` of every layer's return value.
            past: in the default mode it is indexed per layer (``past[n]``);
                in the two collecting modes the same object is passed to all
                layers.

        Returns:
            * ``(layer_states, output)`` when ``return_layer_states`` is set,
            * ``(layer_attn, output)`` when ``need_weights`` is set,
            * otherwise ``output``, or ``[output, all_present]`` if at least one
              layer returned a ``list`` (its second item is stacked over layers
              along a new leading dimension).
        """
        hidden = src

        if return_layer_states:
            assert not need_weights
            states = []  # first item of each layer's return value
            for layer in self.layers:
                # NOTE(review): the layer's return value is fed to the next
                # layer as-is, and every layer receives the same `past`.
                hidden = layer(
                    hidden,
                    src_mask=mask,
                    src_key_padding_mask=src_key_padding_mask,
                    past=past,
                )
                states.append(hidden[0])
            return states, self._apply_norm(hidden)

        if need_weights:
            assert not return_layer_states
            attn_per_layer = []  # second item of each layer's return value
            for layer in self.layers:
                hidden = layer(
                    hidden,
                    src_mask=mask,
                    src_key_padding_mask=src_key_padding_mask,
                    need_weights=True,
                    past=past,
                )
                attn_per_layer.append(hidden[1])
            return attn_per_layer, self._apply_norm(hidden)

        caches = []
        for layer_idx, layer in enumerate(self.layers):
            layer_past = past[layer_idx] if past is not None else None
            result = layer(
                hidden,
                src_mask=mask,
                src_key_padding_mask=src_key_padding_mask,
                past=layer_past,
            )
            if isinstance(result, list):
                # a list return carries the layer's cache as its second item
                hidden, present = result
                caches.append(present)
            else:
                hidden = result

        hidden = self._apply_norm(hidden)
        if not caches:
            return hidden
        # (num_layers, 2, batch_size, num_heads, seq_len, head_dim) per the original note
        return [hidden, torch.stack(caches, dim=0)]
def forward(
    self,
    tgt: Tensor,
    memory: Tensor,
    tgt_mask: Optional[Tensor] = None,
    memory_mask: Optional[Tensor] = None,
    tgt_key_padding_mask: Optional[Tensor] = None,
    memory_key_padding_mask: Optional[Tensor] = None,
) -> Tensor:
    r"""Apply self-attention, attention over ``memory`` and the feed-forward block.

    Args:
        tgt: decoder input, either a tensor or a ``(tensor, stage_embedding)``
            tuple; the stage embedding is handed to every norm call.
        memory: sequence attended to by the second attention block (required).
        tgt_mask: attention mask for the self-attention block (optional).
        memory_mask: attention mask for the memory attention block (optional).
        tgt_key_padding_mask: key padding mask for self-attention (optional).
        memory_key_padding_mask: key padding mask for memory attention (optional).

    Returns:
        The transformed tensor, wrapped again as ``(tensor, stage_embedding)``
        when ``tgt`` was given as a tuple.
    """
    came_as_tuple = isinstance(tgt, tuple)
    if came_as_tuple:
        hidden, stage_embedding = tgt
    else:
        hidden, stage_embedding = tgt, None

    if self.norm_first:
        # pre-norm: normalise the input of each sub-block, then add the residual
        normed = self.norm1(hidden, stage_embedding)
        hidden = hidden + self._sa_block(normed, tgt_mask, tgt_key_padding_mask)

        normed = self.norm2(hidden, stage_embedding)
        hidden = hidden + self._mha_block(
            normed, memory, memory_mask, memory_key_padding_mask
        )

        normed = self.norm3(hidden, stage_embedding)
        hidden = hidden + self._ff_block(normed)
    else:
        # post-norm: add the residual first, then normalise the sum
        summed = hidden + self._sa_block(hidden, tgt_mask, tgt_key_padding_mask)
        hidden = self.norm1(summed, stage_embedding)

        summed = hidden + self._mha_block(
            hidden, memory, memory_mask, memory_key_padding_mask
        )
        hidden = self.norm2(summed, stage_embedding)

        summed = hidden + self._ff_block(hidden)
        hidden = self.norm3(summed, stage_embedding)

    return (hidden, stage_embedding) if came_as_tuple else hidden
def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
    """Build a boolean padding mask from per-sequence lengths.

    Args:
      lengths:
        A 1-D tensor containing sentence lengths.
      max_len:
        Minimum width of the mask. The mask is widened to ``lengths.max()``
        when that is larger.
    Returns:
      Return a 2-D bool tensor of shape ``(len(lengths), width)``, where masked
      (padding) positions are filled with `True` and non-masked positions are
      filled with `False`. An empty ``lengths`` yields a ``(0, max_len)`` mask.

    >>> lengths = torch.tensor([1, 3, 2, 5])
    >>> make_pad_mask(lengths)
    tensor([[False,  True,  True,  True,  True],
            [False, False, False,  True,  True],
            [False, False,  True,  True,  True],
            [False, False, False, False, False]])
    """
    assert lengths.ndim == 1, lengths.ndim
    n = lengths.size(0)
    # Work with plain Python ints: `lengths.max()` is a 0-dim tensor and is not
    # defined at all for an empty batch, so only consult it when n > 0.
    max_len = int(max_len)
    if n > 0:
        max_len = max(max_len, int(lengths.max()))
    seq_range = torch.arange(0, max_len, device=lengths.device)
    expanded_range = seq_range.unsqueeze(0).expand(n, max_len)

    # position j of row i is padding iff j >= lengths[i]
    return expanded_range >= lengths.unsqueeze(-1)
def top_k_top_p_filtering(
    logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1
):
    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

    Filtering always acts on the last dimension, so ``logits`` may be
    ``(vocab,)``, ``(batch, vocab)`` or carry further leading dimensions.
    ``logits`` is modified in place and also returned.

    Args:
        logits: logits distribution, vocabulary on the last dimension.
        top_k: if > 0, keep only the top k tokens with highest probability
            (top-k filtering).
        top_p: if < 1.0, keep the top tokens with cumulative probability
            >= top_p (nucleus filtering). Nucleus filtering is described in
            Holtzman et al. (http://arxiv.org/abs/1904.09751)
        filter_value: value written over every removed logit.
        min_tokens_to_keep: lower bound on the number of tokens kept per
            distribution.

    Returns:
        The filtered ``logits`` tensor (the same object that was passed in).

    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    if top_k > 0:
        top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))  # Safety check
        # Remove all tokens with a logit below the k-th best one
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_best] = filter_value

    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        if min_tokens_to_keep > 1:
            # Never drop the leading min_tokens_to_keep entries (one more is
            # kept by the right-shift below)
            sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
        # Shift right so the first token that crosses the threshold is kept too
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # Map the decision back from sorted order to vocabulary order. Use the
        # last dim (not a hard-coded dim 1) so any input rank is supported.
        indices_to_remove = sorted_indices_to_remove.scatter(
            -1, sorted_indices, sorted_indices_to_remove
        )
        logits[indices_to_remove] = filter_value
    return logits
if not getattr(self.args, "special_first", False): self.args.special_first = 0 if not getattr(self.args, "n_special", False): self.args.n_special = 3 self.args.eos = getattr(self.args, "eos", -1) self.eog = nn.Parameter(torch.full((self.args.n_codebooks, 1), self.args.eog, dtype=torch.long), requires_grad=False) # [K 1] if self.args.eos > 0: assert self.args.eos != self.args.audio_pad_token and self.args.eos != self.args.empty_token, self.args.eos self.eos = nn.Parameter(torch.full((self.args.n_codebooks, 1), self.args.eos, dtype=torch.long), requires_grad=False) # [K 1] if isinstance(self.args.audio_vocab_size, str): self.args.audio_vocab_size = eval(self.args.audio_vocab_size) self.n_text_tokens = self.args.text_vocab_size + 1 assert self.args.text_pad_token == self.args.text_vocab_size, f"self.args.text_vocab_size: {self.args.text_vocab_size}, self.args.text_pad_token: {self.args.text_pad_token}" self.n_audio_tokens = [self.args.audio_vocab_size + self.args.n_special] * self.args.n_codebooks # special tokens: empty token, EOG token, audio pad token assert self.args.audio_vocab_size == self.args.empty_token, self.args.empty_token assert self.args.eog == self.args.audio_vocab_size + 1, self.args.eog assert self.args.audio_pad_token == self.args.audio_vocab_size + 2, self.args.audio_pad_token self.text_embedding = TokenEmbedding( dim_model=self.args.d_model, vocab_size=self.n_text_tokens, dropout=self.args.text_embedding_dropout ) self.audio_embedding = nn.ModuleList( [ TokenEmbedding( dim_model=self.args.audio_embedding_dim, vocab_size=self.n_audio_tokens[k], dropout=self.args.audio_embedding_dropout ) for k in range(self.args.n_codebooks) ] ) self.mask_embedding = nn.Parameter(torch.randn(self.args.max_n_spans, self.args.d_model), requires_grad=True) self.text_positional_embedding = SinePositionalEmbedding( self.args.d_model, dropout=self.args.text_positional_embedding_dropout, scale=False, alpha=True, # learnable scaler, scale the volume of positional embedding 
def prepare_mask_intervals(self, y_lens):
    """Sample, for every utterance, the spans to mask and the spans left visible.

    Args:
        y_lens: iterable with one length per utterance.

    Returns:
        ``(mask_intervals, non_mask_intervals)``: two lists with one entry per
        utterance. Each entry is a list of ``(start, end)`` pairs; the
        non-masked list always has one more pair than the masked list, starts
        at 0 and ends at the utterance length.

    Raises:
        ValueError: if ``self.args.mask_sample_dist`` is neither ``"uniform"``
            nor a string containing ``"poisson"``.
    """
    mask_intervals = []
    non_mask_intervals = []
    dist = self.args.mask_sample_dist

    for y_len in y_lens:
        # how many spans to try for this utterance
        if dist == "uniform":
            n_spans = random.choice(range(1, self.args.max_n_spans+1))
        elif "poisson" in dist.lower():
            # the rate follows the word "poisson", e.g. "poisson1"
            param = float(dist[len("poisson"):])
            poisson_sample = torch.poisson(torch.tensor([param]))
            n_spans = int(poisson_sample.clamp(1, self.args.max_n_spans).item())
        else:
            # previously fell through and failed later with an unbound `n_spans`
            raise ValueError(
                f"unsupported mask_sample_dist: {dist!r}, expected 'uniform' or 'poisson<rate>'"
            )

        starts = random.sample(range(1, y_len-1-self.args.mask_len_min), n_spans)
        starts = sorted(starts)

        # walk backwards so deleting an element never shifts an unvisited index
        for j in range(len(starts)-1, 0, -1):
            if starts[j] - starts[j-1] < self.args.min_gap:
                del starts[j] # If elements are too close, delete the later one
        assert len(starts) > 0, f"there is no masked span left, y_len: {y_len}, sampled n_spans: {n_spans}"

        # room available to each span: distance to the next start (or to the end)
        temp_starts = starts + [y_len]
        gaps = [nxt - cur for cur, nxt in zip(temp_starts, temp_starts[1:])]

        ends = []
        for start, gap in zip(starts, gaps):
            mask_len = random.randint(self.args.mask_len_min, self.args.mask_len_max)
            if mask_len > gap - 1: # make sure the masks are not overlapping with each other
                # resample so at least one unmasked position is left before the next start
                mask_len = random.randint(1, gap - 1)
            ends.append(start + mask_len)

        mask_intervals.append(list(zip(starts, ends)))
        non_mask_intervals.append(list(zip([0]+ends, starts+[y_len])))

    return mask_intervals, non_mask_intervals
def cat_y(self, inserted_y, mask_position, y_lens):
    """Concatenate each sample's segments and pad the samples into one batch.

    Args:
        inserted_y: one list per sample of ``[K, S_j]`` tensors (segments and
            mask placeholders), in the order they should be concatenated.
        mask_position: one list per sample with the positions of the inserted
            mask placeholders; only its length is used here.
        y_lens: original length of every sample, used for the sanity check.

    Returns:
        ``(cated_y, new_y_lens)`` where ``cated_y`` has shape ``[K, T, B]``
        (padded with ``self.args.audio_pad_token``) and ``new_y_lens`` is a
        ``LongTensor`` holding each sample's concatenated length.
    """
    reduced_eog = getattr(self.args, "reduced_eog", 0)
    n_codebooks = self.args.n_codebooks
    cated_y = []
    new_y_lens = []
    for i in range(len(inserted_y)):
        cur_cated_y = torch.cat(inserted_y[i], dim=1).transpose(1, 0)  # [K S] -> [S K]
        cur_cated_y_len = cur_cated_y.shape[0]

        n_masks = len(mask_position[i])
        # one run of n_codebooks extra steps per segment -- presumably the
        # steps added by the delay pattern; confirm against `shift`
        n_pattern_steps = (n_masks + 1) * n_codebooks
        if reduced_eog:
            n_eog = n_masks/2 + 1
            assert cur_cated_y_len == y_lens[i] + n_masks + n_pattern_steps + n_eog, f"cur_cated_y_len == {cur_cated_y_len}, but it should be y_lens[i] ({y_lens[i]}) + len(mask_position[i]) ({n_masks}) + (len(mask_position[i]) + 1) * self.args.n_codebooks ({n_pattern_steps}) + (len(mask_position[i])/2 + 1) ({n_eog})={y_lens[i] + n_masks + n_pattern_steps + n_eog}"
        else:
            # the last term represents the inserted eog tokens, one per segment
            n_eog = n_masks + 1
            assert cur_cated_y_len == y_lens[i] + n_masks + n_pattern_steps + n_eog, f"cur_cated_y_len == {cur_cated_y_len}, but it should be y_lens[i] ({y_lens[i]}) + len(mask_position[i]) ({n_masks}) + (len(mask_position[i]) + 1) * self.args.n_codebooks ({n_pattern_steps}) + (len(mask_position[i]) + 1) ({n_eog})"
        new_y_lens.append(cur_cated_y_len)
        cated_y.append(cur_cated_y)

    cated_y = torch.nn.utils.rnn.pad_sequence(cated_y, batch_first=False, padding_value=self.args.audio_pad_token)
    # Build the expected shape once so the failure message prints exactly what
    # was compared (the old message called max() on a list and two ints, which
    # raised TypeError instead of reporting the mismatch).
    expected_shape = torch.Size([max(new_y_lens), len(inserted_y), n_codebooks])
    assert cated_y.shape == expected_shape, f"cated_y.shape: {cated_y.shape}, but it should be {expected_shape}"
    cated_y = cated_y.permute(2,0,1) # [T,B,K]->[K,T,B]
    assert cated_y.shape[0] == n_codebooks, cated_y.shape
    return cated_y, torch.LongTensor(new_y_lens).to(cated_y.device)
embedded_y.shape[0] == self.args.n_codebooks, embedded_y.shape assert embedded_y.shape[-1] == self.args.d_model, embedded_y.shape embedded_y = embedded_y.sum(dim=0) # [K,T,B,D]->[T,B,D] embedded_y = embedded_y.transpose(1,0) # [T,B,D]->[B,T,D] for i in range(len(embedded_y)): if len(mask_position[i]) > 0: embedded_y[i, mask_position[i]] = self.mask_embedding[mask_value[i]] return embedded_y def prepare_input_target(self, y, y_lens): # rearrange y # assume y shape: [B T K], K is n_codebooks assert y.shape[1] == self.args.n_codebooks, y.shape # sample mask_intervals mask_intervals, non_mask_intervals = self.prepare_mask_intervals(y_lens) # need to have EOG in each section (SOG will be generated by the pattern class) # but mask can be inserted later after we have shifted the input # y could be rearranged in this way: # [ # [tensor[4, 12], tensor[4, 45], tensor[4, 102], tensor[4, 32]], tensor[4, 22]], # [tensor[4, 44], tensor[4, 56], tensor[4, 19]], # ... # ] # for the first list of tensors (4 tensors), first 3 tensors are non_masked part, last 2 are masked part. 
# NOTE #non_masked_part = #masked_part + 1 # NOTE *these are also the targets* # added eog at the end of each segment (masked segment and unmasked segment) rearranged_y = self.rearrange(y, non_mask_intervals, mask_intervals) targets = rearranged_y # each element in each sample is of shape [K T] assert targets[0][0].shape[0] == self.args.n_codebooks, targets[0][0].shape # next we need to apply pattern shifting to each tensor, after which, we'll replace the starting tokens of each section with a token that's different from the special padding token # [[5, 1, 2, 3, 4, 5, 5], # [5, 5, 1, 2, 3, 4, 5], # [5, 5, 5, 1, 2, 3, 4]] shifted_y, patterns = self.shift(rearranged_y) # each element [K S] assert shifted_y[0][0].shape[0] == self.args.n_codebooks, shifted_y[0][0].shape[0] # then, insert mask token at the intersection of each tensor (we want to decide the arrangement of the mask (shuffle or not)), we better have a separate nn.embedding for it # we also need to record the position of the inserted mask inserted_y, mask_position, mask_value = self.insert_mask(shifted_y) assert inserted_y[0][0].shape[0] == self.args.n_codebooks, inserted_y[0][0].shape[0] assert inserted_y[0][1].shape == torch.Size((self.args.n_codebooks, 1)), f"this should be a mask, so should have shape {(self.args.n_codebooks, 1)}, but it's {inserted_y[0][1].shape}" # then concat tensors that belong to the same sample (in order) then get the length of each sample, and then stack them in batch dimension, pad them with pad_token cated_y, new_y_lens = self.cat_y(inserted_y, mask_position, y_lens) # KTB assert cated_y.shape == torch.Size((self.args.n_codebooks, cated_y.shape[1], len(inserted_y))) # embed remember to separately embed the mask tokens embedded_y = self.embed_y(cated_y, mask_position, mask_value) #BTD assert embedded_y.shape[1:] == torch.Size((max(new_y_lens), self.args.d_model)), embedded_y.shape # positional embedding y_input = self.audio_positional_embedding(embedded_y) # make attention mask 
def remove_mask(self, logits, mask_position, new_y_lens):
    """Cut each sample's logits into the segments that lie between mask positions.

    Args:
        logits: tensor indexed as ``[B K S card]``.
        mask_position: per sample, the list of step indices occupied by mask
            placeholders.
        new_y_lens: per sample, the number of valid steps; it closes the last
            segment.

    Returns:
        A list with one entry per sample; each entry is the list of
        ``logits[b, :, left:right]`` slices, one per segment, with the mask
        steps themselves left out.
    """
    kept = []
    for sample_idx in range(len(logits)):
        # sentinels: a virtual mask just before step 0 and one at the sample's end
        boundaries = [-1] + mask_position[sample_idx] + [new_y_lens[sample_idx]]
        segments = []
        for prev_mask, next_mask in zip(boundaries[:-1], boundaries[1:]):
            # a segment starts right after one mask and stops before the next
            segments.append(logits[sample_idx, :, prev_mask + 1:next_mask])
        kept.append(segments)

    return kept
def forward(self, batch):
    """Training forward pass: returns the weighted loss and top-10 accuracy.

    Args:
      batch: a mapping with the keys
        "x": 2-D tensor (checked below), fed to the text embedding.
        "x_lens": 1-D tensor; `x` is cropped to `x_lens.max()` columns.
        "y": 3-D tensor whose second dimension must equal
          `self.args.n_codebooks`.
        "y_lens": 1-D tensor; `y` is cropped to `y_lens.max()` along its
          last dimension.

    Returns:
      None when `x` is empty, otherwise a dict with
        "loss": sum over codebooks of mean cross-entropy * number of
          tokens * codebook weight,
        "top10acc": sum over codebooks of accuracy * number of tokens,
        "top10acc_by_codebook": the per-codebook terms of that sum,
        "effective_ntoken": total number of tokens over all codebooks, as a
          tensor on the logits' device.
    """
    x, x_lens, y, y_lens = (batch[key] for key in ("x", "x_lens", "y", "y_lens"))
    if len(x) == 0:
        return None

    # this deals with gradient accumulation, where x_lens.max() might not be
    # longer than the length of the current slice of x
    x = x[:, :x_lens.max()]
    y = y[:, :, :y_lens.max()]
    assert x.ndim == 2, x.shape
    assert x_lens.ndim == 1, x_lens.shape
    assert y.ndim == 3 and y.shape[1] == self.args.n_codebooks, y.shape
    assert y_lens.ndim == 1, y_lens.shape

    # attention mask and padding mask for the text side
    x_padding_mask = make_pad_mask(x_lens).to(x.device)
    text_len = x.shape[1]
    x_attention_mask = torch.triu(torch.ones(text_len, text_len), diagonal=1).bool().to(x_padding_mask.device)
    x_input = self.text_positional_embedding(self.text_embedding(x))

    (
        y_input,
        new_y_lens,
        targets,
        y_padding_mask,
        y_attention_mask,
        mask_position,
        patterns,
    ) = self.prepare_input_target(y, y_lens)

    # index 0 is the decoder output; no kv-caching during training
    y_out = self.dec_forward(
        x_input,
        x_lens,
        x_attention_mask,
        x_padding_mask,
        y_input,
        new_y_lens,
        y_attention_mask,
        y_padding_mask,
    )[0]
    assert y_out.shape == y_input.shape, f"y_out.shape: {y_out.shape}, y_input.shape: {y_input.shape}" # [B S D]

    # one prediction head per codebook -> [B K S card]
    logits = torch.stack(
        [self.predict_layer[head](y_out) for head in range(self.args.n_codebooks)],
        dim=1,
    )
    assert logits.shape[1] == self.args.n_codebooks and logits.shape[3] == self.n_audio_tokens[0], logits.shape

    # take out the mask tokens, then revert the pattern shift of every segment
    logits_use = self.remove_mask(logits, mask_position, new_y_lens)
    logits_final, logit_masks = self.revert_pattern(patterns, logits_use)
    assert logits_final[0][0].shape[0] == self.args.n_codebooks and logits_final[0][0].shape[2] == self.n_audio_tokens[0], f"it is: {logits_final[0][0].shape}, but should be [K, T, card]"

    # sanity check on one sample: every logits segment must line up with its target
    sample_to_test = 0
    checked_logits = logits_final[sample_to_test]
    checked_targets = targets[sample_to_test]
    assert len(checked_logits) == len(checked_targets), f"{len(checked_logits)}, {len(checked_targets)}"
    temp = sum(
        [seg_logits.shape[:-1] != seg_target.shape
         for seg_logits, seg_target in zip(checked_logits, checked_targets)]
    )
    assert temp == 0, f"none equal positions: {temp}, total number of elements: {len(checked_targets)}"

    # no reverted position may be flagged invalid by the pattern mask
    logit_masked = 0
    for cur_mask in logit_masks:
        for item in cur_mask:
            logit_masked = logit_masked + (item == False).any()  # elementwise compare on a tensor
    assert logit_masked == 0, logit_masks

    logits = torch.cat([torch.cat(item, dim=1) for item in logits_final], dim=1) # [K, T1+T2+T3+..., card]
    targets = torch.cat([torch.cat(item, dim=1) for item in targets], dim=1) # [K, T1+T2+T3+...]
    assert targets.shape[0] == logits.shape[0], f"{targets.shape}, {logits.shape}"

    loss, ntokens, top10acc = [], [], []
    for k, (logit, target) in enumerate(zip(logits, targets)):
        loss.append(F.cross_entropy(logit, target, reduction='mean'))
        top10acc.append(self.accuracy_metrics[k](logit.detach(), target))
        ntokens.append(len(logit))
    all_ntokens = sum(ntokens)

    if self.args.codebook_weight is not None:
        # NOTE(review): eval() runs a string taken from the run configuration;
        # only safe while that configuration is trusted.
        codebook_weight = eval(self.args.codebook_weight)
    else:
        codebook_weight = [1.] * self.args.n_codebooks

    # undo the per-codebook mean so codebooks are weighted by their token counts
    loss = sum([cb_loss*nt*cw for cb_loss, nt, cw in zip(loss, ntokens, codebook_weight)])
    top10acc_by_codebook = [acc*nt for acc, nt in zip(top10acc, ntokens)]
    top10acc = sum(top10acc_by_codebook)
    ntokens = torch.tensor(all_ntokens).to(logits.device)

    return {
        "loss": loss,
        "top10acc": top10acc,
        "top10acc_by_codebook": top10acc_by_codebook,
        "effective_ntoken": ntokens,
    }
contains M mask_start and mask_end. list length is actually 1, because we only support single sample inference for now top_k: (`optional`) int The number of highest probability tokens to keep for top-k-filtering. Default to -100. top_p: (`optional`) float For Neucleus sampling temperature: (`optional`) float The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. eog_coef: (`optional`) float if 0, no change to eog token logits, otherwise, will adjust eog token logit based on the difference between acoustic token and phn token length stop_repetition (`optional`) int if not -1, will set the logits of a token that repeated this many times to be -100000, to avoid generating it again. This only apply to tokens from the first codebook allowed_repeat_tokens (`optional`) list of ints by inspecting the validation set, get a few tokens that indeed repeat a significant amount of time, and exclude those tokens from prevent repetition ultimate_stop_repetition (`optional`) int no matter that token it is, stop repetition once after this number """ assert x.ndim == 2, x.shape assert x_lens.ndim == 1, x_lens.shape assert y.ndim == 3, y.shape if self.args.special_first: y = y + int(self.args.n_special) y = y.transpose(2,1) # [1,T,K] -> [1,K,T] assert y.shape[0] == 1 and y.shape[1] == self.args.n_codebooks, y.shape # there is no padding assert mask_interval.shape == torch.Size((1, mask_interval.shape[1], 2)), mask_interval # make x attention mask and x_input x_attention_mask = torch.triu(torch.ones(x.shape[1], x.shape[1]), diagonal=1).bool().to(x.device) # x_attention_mask = torch.zeros(x.shape[1], x.shape[1]).bool().to(x.device) x_input = self.text_embedding(x) x_input = self.text_positional_embedding(x_input) # make initial y_input # make mask_interval and non_mask_interval y_len = y.shape[2] y_lens = torch.LongTensor([y_len]).to(y.device) mask_interval = mask_interval[0] starts = [item[0].item() for item in mask_interval] + [y_len] ends = 
[0] + [item[1].item() for item in mask_interval] mask_intervals = [[ (item[0].item(), item[1].item()) for item in mask_interval ]] # a werid name change, mask_interval is input, now is mask_intervals, with one more dimension non_mask_intervals = [[ (ns, ne) for ns, ne in zip(ends, starts) ]] # rearrange y # will add have EOG in each section (SOG will be generated by the pattern class) # but mask can be inserted later after we have shifted the input # y could be rearranged in this way: # [ # [tensor[4, 12], tensor[4, 45], tensor[4, 102], tensor[4, 32]], tensor[4, 22]], # [tensor[4, 44], tensor[4, 56], tensor[4, 19]], # ... # ] # for the first list of tensors (4 tensors), first 3 tensors are non_masked part, last 2 are masked part. # NOTE #non_masked_part = #masked_part + 1 rearranged_y = self.rearrange(y, non_mask_intervals, mask_intervals) assert rearranged_y[0][0].shape[0] == self.args.n_codebooks, rearranged_y[0][0].shape # shift each element of y # next we need to apply pattern shifting to each tensor, after which, we'll replace the starting tokens of each section with a token that's different from the special padding token # [ # [empty, 1, 2, 3, eog, empty, empty, empty], # [empty, empty, 1, 2, 3, eog, empty, empty], # [empty, empty, empty, 1, 2, 3, eog, empty], # [empty, empty, empty, empty, 1, 2, 3, eog] # ] shifted_y, patterns = self.shift(rearranged_y) # each element [K S], patterns is not used, as we directly use the original input y assert shifted_y[0][0].shape[0] == self.args.n_codebooks, shifted_y[0][0].shape # insert mask token at the intersction of each tensor, but *actually inserted eog as place holder* # the position of inserted mask is also recorded # and the mask_value, the index of the mask emb is recorded inserted_y, mask_position, mask_value = self.insert_mask(shifted_y) assert inserted_y[0][0].shape[0] == self.args.n_codebooks, inserted_y[0][0].shape[0] assert inserted_y[0][1].shape == torch.Size((self.args.n_codebooks, 1)), f"this should be a 
mask, so should have shape {(self.args.n_codebooks, 1)}, but it's {inserted_y[0][1].shape}" # then concat tensors that belong to the same sample (in order) then get the length of each sample, and then stack them in batch dimension, pad them with pad_token cated_y, new_y_lens = self.cat_y(inserted_y, mask_position, y_lens) # KTB assert cated_y.shape == torch.Size((self.args.n_codebooks, cated_y.shape[1], len(inserted_y))) assert not (cated_y == self.args.audio_pad_token).any(), cated_y ### NOTE this is different from forward, as we will remove the masked tokens ### say there are two masked region ### the cated_y should be like ### [empty a a a a mask0 empty b b b mask1 empty c c mask0 empty] ### which means we need to take the part after the last empty out num_mask = len(mask_position[0])//2 assert num_mask == len(mask_position[0])/2, mask_position cated_y = cated_y[:, :mask_position[0][num_mask]+2] # of shape [K,T,B] # logging.info(f"mask_position[0][num_mask]+2: {mask_position[0][num_mask]+2}") more_mask_value = mask_value[0][num_mask+1:] # NOTE this will be used in the generation loop for reference for inserting mask embedding new_y_lens[0] = mask_position[0][num_mask]+2 mask_position[0] = mask_position[0][:num_mask+1] assert mask_position[0][num_mask]+2 == cated_y.shape[1], f"num_mask: {num_mask}, mask_position: {mask_position}, cated_y.shape: {cated_y.shape}" # embed: remember to separately embed the mask tokens embedded_y = self.embed_y(cated_y, mask_position, [mask_value[0][:num_mask+1]]) #BTD # assert embedded_y.shape == torch.Size((y.shape[0], max(new_y_lens), self.args.d_model)), embedded_y.shape # positional embedding y_input = self.audio_positional_embedding(embedded_y) # make attention mask and padding mask y_attention_mask = torch.triu(torch.ones(y_input.shape[1], y_input.shape[1]), diagonal=1).bool().to(y.device) # y_lens = torch.LongTensor([y_input.shape[1]]).to(y.device) x_padding_mask = torch.full((1,x_lens[0]), False).to(x.device) y_padding_mask = 
torch.full((1,new_y_lens[0]), False).to(y.device) codebook_eog = [False] * self.args.n_codebooks generated = [] # doesn't contain any empty_token, contains eog cur_generated = [] # say 0 is empty, 4 is eog # tensor([[ 1, 2, 3, 4, 0, 0], # [ 0, 1, 2, 3, 4, 0], # [ 0, 0, 1, 2, 3, 4]]) num_gen = [] cur_num_gen = 0 ##################### silence repetition handling ##################### ##################### silence repetition handling ##################### logging.info(f"silence tokens: {silence_tokens}, note that if you are not using the pretrained encodec 6f79c6a8, make sure you specified it yourself, rather than using the default") consec_silence_count = 0 prev_token = None ##################### silence repetition handling ##################### ##################### silence repetition handling ##################### # prepare the cache placeholder # n_layers, 2, bsz, num_heads, src_len, head_dim past = torch.ones([self.args.num_decoder_layers, 2, x.shape[0]], device=x.device, dtype=torch.float32) if kvcache else None # handle multi-span kv-cache new_masked_span = False def sample_helper(n_eog, logits, codebook_eog, top_k, top_p, temperature, prev_token, consec_silence_count, stop_repetition, silence_tokens, cur_num_gen): if n_eog == 0: logits_adjust = logits for jj in range(1,self.args.n_codebooks): logits_adjust[jj][self.args.eog] = -10000 logits_adjust[jj][self.args.empty_token] = -10000 ##################### silence repetition handling ##################### if stop_repetition > 0 and prev_token in silence_tokens and consec_silence_count > stop_repetition: if logits_adjust[0, prev_token] < 0: logits_adjust[0, prev_token] = logits_adjust[0, prev_token] * (consec_silence_count - (stop_repetition-1)) else: logits_adjust[0, prev_token] = logits_adjust[0, prev_token] / (consec_silence_count - (stop_repetition-1)) ##################### silence repetition handling ##################### if type(logits_adjust) == list: samples_list= [] for logit in logits_adjust: # 
print(logit) # print(logit.shape) cur_sample = topk_sampling( logit.unsqueeze(0), top_k=top_k, top_p=top_p, temperature=temperature ) # [1, 1] samples_list.append(cur_sample) samples = torch.cat(samples_list, dim=0) # [K, 1] else: samples = topk_sampling( logits_adjust, top_k=top_k, top_p=top_p, temperature=temperature ) # [K, 1] assert samples.shape == torch.Size((self.args.n_codebooks, 1)), f"samples.shape: {samples.shape}" if cur_num_gen < self.args.n_codebooks-1: for jj in range(1, self.args.n_codebooks - cur_num_gen): samples[-jj, 0] = self.args.empty_token if ( samples[0,0] == self.args.eog or torch.argmax(logits[0], dim=-1) == self.args.eog or y_input.shape[1] > x_lens[0] * 10 ): # last one means y is already too long, shouldn't happen, but put it here samples[0,0] = self.args.eog codebook_eog[0] = True ##################### silence repetition handling ##################### ##################### silence repetition handling ##################### if samples[0,0] in silence_tokens and samples[0,0] == prev_token: consec_silence_count += 1 else: consec_silence_count = 0 prev_token = samples[0,0] ##################### silence repetition handling ##################### ##################### silence repetition handling ##################### return samples, codebook_eog, prev_token, consec_silence_count else: assert sum(codebook_eog[i] for i in range(n_eog)) == n_eog, f"codebook_eog: {codebook_eog}, but n_eog: {n_eog}" logits_adjust = logits for jj in range(n_eog+1,self.args.n_codebooks): logits_adjust[jj][self.args.eog] = -10000 logits_adjust[jj][self.args.empty_token] = -10000 if type(logits_adjust) == list: samples_list= [] for logit in logits_adjust: cur_sample = topk_sampling( logit.unsqueeze(0), top_k=top_k, top_p=top_p, temperature=temperature ) # [1, 1] samples_list.append(cur_sample) samples = torch.cat(samples_list, dim=0) # [K, 1] else: samples = topk_sampling( logits_adjust, top_k=top_k, top_p=top_p, temperature=temperature ) # [K, 1] for jj in 
range(n_eog): samples[jj, 0] = self.args.empty_token samples[n_eog, 0] = self.args.eog codebook_eog[n_eog] = True return samples, codebook_eog, prev_token, consec_silence_count while True: y_out, present = self.dec_forward( x_input, x_lens, x_attention_mask, x_padding_mask, y_input, new_y_lens, y_attention_mask, y_padding_mask, past=past, last_3_tokens = new_masked_span ) if new_masked_span: new_masked_span = False if past != None: past = torch.cat([past, present.to(past.dtype)], dim=-2) if past.ndim > 3 else present.to(past.dtype) y_out = y_out[:, -1:] # only take the last one logits = torch.stack([self.predict_layer[i](y_out) for i in range(self.args.n_codebooks)], dim=1) # [B K S card], B==S==1, so [1 K 1 card] logits = logits.squeeze(0).squeeze(1) # [K card] assert logits.shape == torch.Size((self.args.n_codebooks, self.n_audio_tokens[0])), f"{logits.shape}" n_eog = sum(codebook_eog) assert n_eog < self.args.n_codebooks if self.args.eos > 0: # eos stands for end-of-sentence, which shouldn't be used as we are doing speech editing for jj in range(self.args.n_codebooks): logits[jj][self.args.eos] = -10000. 
# need to use a helper function to hand different n_eog cases samples, codebook_eog, prev_token, consec_silence_count = sample_helper(n_eog, logits, codebook_eog, top_k, top_p, temperature, prev_token, consec_silence_count, stop_repetition, silence_tokens, cur_num_gen) cur_num_gen += 1 cur_generated.append(samples.squeeze(-1)) # [K,1] -> [K] # get samples_emb samples_emb = torch.stack([self.audio_embedding[k](samples[k]) for k in range(self.args.n_codebooks)], dim=0) # [K,1,D] samples_emb = samples_emb.sum(dim=0,keepdim=True) # [1,1,D] if sum(codebook_eog) == self.args.n_codebooks: # generation for the current span is done # re-init codebook_eog = [False] * self.args.n_codebooks num_gen.append(cur_num_gen) cur_num_gen = 0 generated.append(cur_generated) cur_generated = [] # if the current mask span is the last span, then all done # else # append the next mask token and the four empty tokens to start the next generation if len(more_mask_value) > 0: next_mask_ind = more_mask_value.pop(0) mask_emb = self.mask_embedding[next_mask_ind].unsqueeze(0).unsqueeze(0) # [1,1,D] assert mask_emb.shape == torch.Size((1,1,self.args.d_model)), mask_emb.shape empty_token = torch.LongTensor([self.args.empty_token]).to(y.device) empty_emb = torch.stack([ self.audio_embedding[k](empty_token) for k in range(self.args.n_codebooks)], dim=0 ).sum(dim=0, keepdim=True) # [1,1,D] assert empty_emb.shape == torch.Size((1,1,self.args.d_model)), empty_emb.shape extra_emb = torch.cat([mask_emb, empty_emb], dim=1) # [1,2,D] samples_emb = torch.cat([samples_emb, extra_emb], dim=1) # [1,3,D] # prev_last_token, mask_token, empty token assert samples_emb.shape == torch.Size((1,3,self.args.d_model)), f"samples_emb.shape: {samples_emb.shape}" ##################### silence repetition handling ##################### ##################### silence repetition handling ##################### consec_silence_count = 0 prev_token = None ##################### silence repetition handling ##################### 
##################### silence repetition handling ##################### # handling kv-caching for multi-span editing new_masked_span = True else: break else: assert samples_emb.shape == torch.Size((1,1,self.args.d_model)), f"samples_emb.shape: {samples_emb.shape}" embedded_y = torch.cat([embedded_y, samples_emb], dim=1) # positional embedding y_input = self.audio_positional_embedding(embedded_y) # [B T D] # make attention mask and padding mask y_attention_mask = torch.triu(torch.ones(y_input.shape[1], y_input.shape[1]), diagonal=1).bool().to(y.device) new_y_lens = torch.LongTensor([y_input.shape[1]]).to(y.device) y_padding_mask = torch.full((1,new_y_lens[0]), False).to(y.device) assert len(generated) == num_mask, f"len(generated): {len(generated)}, num_mask: {num_mask}" # # combine non_masked_span with generated spans # first need to shift the generated part back flatten_gen = [] for l, orig_span in enumerate(generated): span = torch.stack(orig_span, dim=0) # [T K] span = span.transpose(1,0) # [K, T] assert span.shape[0] == self.args.n_codebooks, span.shape unshifted_span = [] for j, s in enumerate(span): start_from = j end_at = - (self.args.n_codebooks - start_from) unshifted_span.append(s[start_from:end_at]) unshifted_span = torch.stack(unshifted_span, dim=0) assert unshifted_span.shape[1] == num_gen[l] - self.args.n_codebooks, f"len(unshifted_spans[0]): {len(unshifted_span[0])}, num_gen[l]: {num_gen[l]}" flatten_gen.append(unshifted_span) # logging.info(f"unshfited_span: {unshifted_span.shape}") # raise assert len(non_mask_intervals[0]) - 1 == len(flatten_gen), f"len(non_mask_intervals[0]): {len(non_mask_intervals[0])}, len(flatten_gen): {len(flatten_gen)}" res = [] for orig_interval, gen in zip(non_mask_intervals[0], flatten_gen): res.append(y[0, :, orig_interval[0]:orig_interval[1]]) res.append(gen) res.append(y[0, :, non_mask_intervals[0][-1][0]:non_mask_intervals[0][-1][1]]) res = torch.cat(res, dim=1).unsqueeze(0) # [K,new_T] -> [1, K, new_T] expected_y_len 
= y_len - sum([item[1] - item[0] for item in mask_intervals[0]]) + sum([item - self.args.n_codebooks for item in num_gen]) assert res.shape == torch.Size((1, self.args.n_codebooks, expected_y_len)), f"res.shape: {res.shape}, expected_y_len: {expected_y_len}. y_len - sum([item[1] - item[0] for item in mask_interval]) + sum([item - self.args.n_codebooks for item in num_gen]): {y_len}-{sum([item[1] - item[0] for item in mask_interval])} + {sum([item - self.args.n_codebooks for item in num_gen])}" if self.args.special_first: res = res - int(self.args.n_special) return res def inference_tts( self, x: torch.Tensor, x_lens: torch.Tensor, y: torch.Tensor, top_k: int=-100, top_p: float=1.0, temperature: float=1.0, stop_repetition: int=3, kvcache: int=1, silence_tokens: list[int]=[1388,1898,131], *kargs ) -> torch.Tensor: """ different from inference_tts, this implementation uses kvcache, which should have significant speed up Args: x: A 2-D tensor of shape (1, L). x_lens: A 1-D tensor of shape (1,). It contains the number of tokens in `x` before padding. y: A 3-D tensor of shape (1, T, K). top_k: (`optional`) int The number of highest probability tokens to keep for top-k-filtering. Default to -100. top_p: (`optional`) float For Neucleus sampling temperature: (`optional`) float The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. 
""" eog_inference = self.args.eos if self.args.eos>0 else self.args.eog assert x.ndim == 2, x.shape assert x_lens.ndim == 1, x_lens.shape assert y.ndim == 3, y.shape if self.args.special_first: y = y + int(self.args.n_special) y = y.transpose(2,1) # [1,T,K] -> [1,K,T] assert y.shape[0] == 1 and y.shape[1] == self.args.n_codebooks, y.shape # there is no padding # make x attention mask and x_input x_attention_mask = torch.triu(torch.ones(x.shape[1], x.shape[1]), diagonal=1).bool().to(x.device) # x_attention_mask = torch.zeros(x.shape[1], x.shape[1]).bool().to(x.device) x_input = self.text_embedding(x) x_input = self.text_positional_embedding(x_input) y_len = y.shape[2] y_lens = torch.LongTensor([y_len]).to(y.device) # rearrange y, we don't add eog to the end, this doesn't actually do anything in the tts scenario rearranged_y = [[y[0]]] assert rearranged_y[0][0].shape[0] == self.args.n_codebooks, rearranged_y[0][0].shape # shift y to create the delayed pattern shifted_y, patterns = self.shift(rearranged_y) # each element [K S], patterns is not used, as we directly use the original input y assert shifted_y[0][0].shape[0] == self.args.n_codebooks, shifted_y[0][0].shape assert len(shifted_y[0]) == 1, len(shifted_y[0]) # below is different from forward or inference # where we cut this shifted part shifted_y[0][0] = shifted_y[0][0][:, :-(self.args.n_codebooks-1)] assert not (shifted_y[0][0][self.args.n_codebooks:] == self.args.empty_token).any() and not (shifted_y[0][0][self.args.n_codebooks:] == self.args.eog).any(), shifted_y[0][0] # next section in inference is insert mask at the intersection of each tensor in a sample, but we don't need to do that # next section is concate tensors of each sample to one tensor, which we also don't need cated_y = shifted_y[0][0].unsqueeze(-1) #[K,S]->[K,S,B] new_y_lens = torch.LongTensor([cated_y.shape[1]]).to(cated_y.device) assert cated_y.shape == torch.Size((self.args.n_codebooks, cated_y.shape[1], 1)) assert not (cated_y == 
self.args.audio_pad_token).any(), cated_y # replace tokens in y with the embeddings, add sum codebooks up embedded_y = torch.stack([self.audio_embedding[k](cated_y[k]) for k in range(self.args.n_codebooks)], dim=0) # [K, S, B, D] assert embedded_y.shape[0] == self.args.n_codebooks, embedded_y.shape assert embedded_y.shape[-1] == self.args.d_model, embedded_y.shape embedded_y = embedded_y.sum(dim=0) # [K,S,B,D]->[S,B,D] embedded_y = embedded_y.transpose(1,0) # [S,B,D]->[B,S,D] # positional embedding y_input = self.audio_positional_embedding(embedded_y) # make attention mask and padding mask y_attention_mask = torch.triu(torch.ones(y_input.shape[1], y_input.shape[1]), diagonal=1).bool().to(y.device) x_padding_mask = torch.full((1,x_lens[0]), False).to(x.device) y_padding_mask = torch.full((1,new_y_lens[0]), False).to(y.device) # entering the generation stage # starting from line 708 codebook_eog = [False] * self.args.n_codebooks generated = [] # doesn't contain any empty token, contain eog cur_generated = [] # say 0 is empty, 4 is eog # tensor([[ 1, 2, 3, 4, 0, 0], # [ 0, 1, 2, 3, 4, 0], # [ 0, 0, 1, 2, 3, 4]]) num_gen = [] cur_num_gen = 0 ##################### silence repetition handling ##################### ##################### silence repetition handling ##################### logging.info(f"silence tokens: {silence_tokens}, note that if you are not using the pretrained encodec 6f79c6a8, make sure you specified it yourself, rather than using the default") consec_silence_count = 0 prev_token = None ##################### silence repetition handling ##################### ##################### silence repetition handling ##################### # prepare the cache placeholder # n_layers, 2, bsz, num_heads, src_len, head_dim past = torch.ones([self.args.num_decoder_layers, 2, x.shape[0]], device=x.device, dtype=torch.float32) if kvcache else None # logging.info(f"number of decoder layers: {self.args.num_decoder_layers}") # logging.info(f"number of decoder layers: 
{self.args.num_decoder_layers}") # logging.info(f"number of decoder layers: {self.args.num_decoder_layers}") def sample_helper(n_eog, logits, codebook_eog, top_k, top_p, temperature, prev_token, consec_silence_count, stop_repetition, silence_tokens, cur_num_gen): if n_eog == 0: logits_adjust = logits for jj in range(1,self.args.n_codebooks): logits_adjust[jj][eog_inference] = -10000 logits_adjust[jj][self.args.empty_token] = -10000 if cur_num_gen <= self.args.encodec_sr // 5: # this shouldn't happen, but just in case the model stopped too early logits_adjust[0][eog_inference] = -10000 ##################### silence repetition handling ##################### if stop_repetition > 0 and prev_token in silence_tokens and consec_silence_count > stop_repetition: if logits_adjust[0, prev_token] < 0: logits_adjust[0, prev_token] = logits_adjust[0, prev_token] * (consec_silence_count - (stop_repetition-1)) else: logits_adjust[0, prev_token] = logits_adjust[0, prev_token] / (consec_silence_count - (stop_repetition-1)) ##################### silence repetition handling ##################### samples = topk_sampling( logits_adjust, top_k=top_k, top_p=top_p, temperature=temperature ) # [K, 1] assert samples.shape == torch.Size((self.args.n_codebooks, 1)), f"samples.shape: {samples.shape}" if cur_num_gen < self.args.n_codebooks-1: for jj in range(1, self.args.n_codebooks - cur_num_gen): samples[-jj, 0] = self.args.empty_token if ( samples[0,0] == eog_inference or torch.argmax(logits[0], dim=-1) == eog_inference or y_input.shape[1] > x_lens[0] * (self.args.encodec_sr//5) ): # last one means y is already too long, shouldn't happen, but put it here samples[0,0] = eog_inference codebook_eog[0] = True ##################### silence repetition handling ##################### if samples[0,0] in silence_tokens and samples[0,0] == prev_token: consec_silence_count += 1 else: consec_silence_count = 0 prev_token = samples[0,0] ##################### silence repetition handling ##################### 
return samples, codebook_eog, prev_token, consec_silence_count else: assert sum(codebook_eog[i] for i in range(n_eog)) == n_eog, f"codebook_eog: {codebook_eog}, but n_eog: {n_eog}" logits_adjust = logits for jj in range(n_eog+1,self.args.n_codebooks): logits_adjust[jj][eog_inference] = -10000 logits_adjust[jj][self.args.empty_token] = -10000 samples = topk_sampling( logits_adjust, top_k=top_k, top_p=top_p, temperature=temperature ) # [K, 1] for jj in range(n_eog): samples[jj, 0] = self.args.empty_token samples[n_eog, 0] = eog_inference codebook_eog[n_eog] = True return samples, codebook_eog, prev_token, consec_silence_count while True: y_out, present = self.dec_forward( x_input, x_lens, x_attention_mask, x_padding_mask, y_input, new_y_lens, y_attention_mask, y_padding_mask, past=past ) if past != None: past = torch.cat([past, present.to(past.dtype)], dim=-2) if past.ndim > 3 else present.to(past.dtype) y_out = y_out[:, -1:] # only take the last token logits = torch.stack([self.predict_layer[i](y_out) for i in range(self.args.n_codebooks)], dim=1) # [B K S card], B==S==1, so [1 K 1 card] logits = logits.squeeze(0).squeeze(1) # [K card] assert logits.shape == torch.Size((self.args.n_codebooks, self.n_audio_tokens[0])), f"{logits.shape}" n_eog = sum(codebook_eog) assert n_eog < self.args.n_codebooks if self.args.eos > 0: # if we are using end-of-sentence token (which is used by default), eog shouldn't be used here, as there is no masked spans for jj in range(self.args.n_codebooks): logits[jj][self.args.eog] = -10000. 
samples, codebook_eog, prev_token, consec_silence_count = sample_helper(n_eog, logits, codebook_eog, top_k, top_p, temperature, prev_token, consec_silence_count, stop_repetition, silence_tokens, cur_num_gen) cur_num_gen += 1 cur_generated.append(samples.squeeze(-1)) # [K,1] -> [K] # samples.shape is [K,1] # ge samples_emb samples_emb = torch.stack([self.audio_embedding[k](samples[k]) for k in range(self.args.n_codebooks)], dim=0) # [K,1,D] samples_emb = samples_emb.sum(dim=0,keepdim=True) # [1,1,D] if sum(codebook_eog) == self.args.n_codebooks: # generation for the current span is done codebook_eog = [False] * self.args.n_codebooks num_gen.append(cur_num_gen) cur_num_gen = 0 generated.append(cur_generated) cur_generated = [] break else: assert samples_emb.shape == torch.Size((1,1,self.args.d_model)), f"samples_emb.shape: {samples_emb.shape}" embedded_y = torch.cat([embedded_y, samples_emb], dim=1) y_input = self.audio_positional_embedding(embedded_y) # [B T D] # make attention mask and padding mask y_attention_mask = torch.triu(torch.ones(y_input.shape[1], y_input.shape[1]), diagonal=1).bool().to(y.device) new_y_lens = torch.LongTensor([y_input.shape[1]]).to(y.device) y_padding_mask = torch.full((1,new_y_lens[0]), False).to(y.device) assert len(generated) == 1, f"len(generated): {len(generated)}" # revert the pattern flatten_gen = [] for l, orig_span in enumerate(generated): span = torch.stack(orig_span, dim=0) # [T, K] span = span.transpose(1,0) # [K, T] assert span.shape[0] == self.args.n_codebooks, span.shape unshifted_span = [] for j, s in enumerate(span): start_from = j end_at = - (self.args.n_codebooks - start_from) unshifted_span.append(s[start_from:end_at]) unshifted_span = torch.stack(unshifted_span, dim=0) assert unshifted_span.shape[1] == num_gen[l] - self.args.n_codebooks, f"len(unshifted_spans[0]): {len(unshifted_span[0])}, num_gen[l]: {num_gen[l]}" flatten_gen.append(unshifted_span) assert len(flatten_gen) == 1, len(flatten_gen) # combine res = [y[0], 
flatten_gen[0]] res = torch.cat(res, dim=1).unsqueeze(0) # [K, new_t] -> [1, K, new_T] expected_y_len = y_len + sum([item - self.args.n_codebooks for item in num_gen]) assert res.shape == torch.Size((1, self.args.n_codebooks, expected_y_len)), f"res.shape: {res.shape}, expected_y_len: {expected_y_len}. y_len + sum([item - self.args.n_codebooks for item in num_gen]): {y_len} + {sum([item - self.args.n_codebooks for item in num_gen])}" if self.args.special_first: res = res - int(self.args.n_special) flatten_gen = flatten_gen - int(self.args.n_special) return res, flatten_gen[0].unsqueeze(0) def inference_tts_batch( self, x: torch.Tensor, x_lens: torch.Tensor, y: torch.Tensor, top_k: int=-100, top_p: float=1.0, temperature: float=1.0, stop_repetition: int=3, kvcache: int=1, batch_size: int=5, silence_tokens: list[int]=[1388,1898,131], *kargs ) -> torch.Tensor: """ have a batch size when forward passing, but they are equivalant to same example but different random seed, therefore as long as one example generated eog, we can drop all other samlpes different from inference_tts, this implementation uses kvcache, which should have significant speed up Args: x: A 2-D tensor of shape (1, L). x_lens: A 1-D tensor of shape (1,). It contains the number of tokens in `x` before padding. y: A 3-D tensor of shape (1, T, K). top_k: (`optional`) int The number of highest probability tokens to keep for top-k-filtering. Default to -100. top_p: (`optional`) float For Neucleus sampling temperature: (`optional`) float The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. 
""" eog_inference = self.args.eos if self.args.eos>0 else self.args.eog assert x.ndim == 2, x.shape assert x_lens.ndim == 1, x_lens.shape assert y.ndim == 3, y.shape if self.args.special_first: y = y + int(self.args.n_special) y = y.transpose(2,1) # [1,T,K] -> [1,K,T] assert y.shape[0] == 1 and y.shape[1] == self.args.n_codebooks, y.shape # there is no padding # make x attention mask and x_input x_attention_mask = torch.triu(torch.ones(x.shape[1], x.shape[1]), diagonal=1).bool().to(x.device) # x_attention_mask = torch.zeros(x.shape[1], x.shape[1]).bool().to(x.device) x_input = self.text_embedding(x) x_input = self.text_positional_embedding(x_input) y_len = y.shape[2] y_lens = torch.LongTensor([y_len]).to(y.device) # rearrange y, we don't add eog to the end, this doesn't actually do anything in the tts scenario rearranged_y = [[y[0]]] assert rearranged_y[0][0].shape[0] == self.args.n_codebooks, rearranged_y[0][0].shape # shift y to create the delayed pattern shifted_y, patterns = self.shift(rearranged_y) # each element [K S], patterns is not used, as we directly use the original input y assert shifted_y[0][0].shape[0] == self.args.n_codebooks, shifted_y[0][0].shape assert len(shifted_y[0]) == 1, len(shifted_y[0]) # below is different from forward or inference # where we cut this shifted part shifted_y[0][0] = shifted_y[0][0][:, :-(self.args.n_codebooks-1)] assert not (shifted_y[0][0][self.args.n_codebooks:] == self.args.empty_token).any() and not (shifted_y[0][0][self.args.n_codebooks:] == self.args.eog).any(), shifted_y[0][0] # next section in inference is insert mask at the intersection of each tensor in a sample, but we don't need to do that # next section is concate tensors of each sample to one tensor, which we also don't need cated_y = shifted_y[0][0].unsqueeze(-1) #[K,S]->[K,S,B] new_y_lens = torch.LongTensor([cated_y.shape[1]]).to(cated_y.device) assert cated_y.shape == torch.Size((self.args.n_codebooks, cated_y.shape[1], 1)) assert not (cated_y == 
self.args.audio_pad_token).any(), cated_y # replace tokens in y with the embeddings, add sum codebooks up embedded_y = torch.stack([self.audio_embedding[k](cated_y[k]) for k in range(self.args.n_codebooks)], dim=0) # [K, S, B, D] assert embedded_y.shape[0] == self.args.n_codebooks, embedded_y.shape assert embedded_y.shape[-1] == self.args.d_model, embedded_y.shape embedded_y = embedded_y.sum(dim=0) # [K,S,B,D]->[S,B,D] embedded_y = embedded_y.transpose(1,0) # [S,B,D]->[B,S,D] # positional embedding y_input = self.audio_positional_embedding(embedded_y) # make attention mask and padding mask y_attention_mask = torch.triu(torch.ones(y_input.shape[1], y_input.shape[1]), diagonal=1).bool().to(y.device) x_padding_mask = torch.full((1,x_lens[0]), False).to(x.device) y_padding_mask = torch.full((1,new_y_lens[0]), False).to(y.device) # entering the generation stage # starting from line 708 codebook_eog = [False] * self.args.n_codebooks generated = [] # doesn't contain any empty token, contain eog cur_generated = [[] for _ in range(batch_size)] # say 0 is empty, 4 is eog # tensor([[ 1, 2, 3, 4, 0, 0], # [ 0, 1, 2, 3, 4, 0], # [ 0, 0, 1, 2, 3, 4]]) num_gen = [] cur_num_gen = 0 ##################### silence repetition handling ##################### ##################### silence repetition handling ##################### logging.info(f"silence tokens: {silence_tokens}, note that if you are not using the pretrained encodec 6f79c6a8, make sure you specified it yourself, rather than using the default") consec_silence_counts = [0 for _ in range(batch_size)] prev_tokens = [None for _ in range(batch_size)] ##################### silence repetition handling ##################### ##################### silence repetition handling ##################### # prepare the cache placeholder # n_layers, 2, bsz, num_heads, src_len, head_dim past = torch.ones([self.args.num_decoder_layers, 2, x.shape[0]], device=x.device, dtype=torch.float32) if kvcache else None # logging.info(f"number of decoder 
layers: {self.args.num_decoder_layers}") # logging.info(f"number of decoder layers: {self.args.num_decoder_layers}") # logging.info(f"number of decoder layers: {self.args.num_decoder_layers}") keep = None # NOTE: this very important, tells which sample to keep def sample_helper(n_eog, logits, codebook_eog, top_k, top_p, temperature, prev_tokens, consec_silence_counts, stop_repetition, silence_tokens, cur_num_gen, keep): if n_eog == 0: logits_adjust = logits for jj in range(1,self.args.n_codebooks): logits_adjust[:,jj,eog_inference] = -10000 logits_adjust[:,jj,self.args.empty_token] = -10000 if cur_num_gen <= self.args.encodec_sr // 5: # this shouldn't happen, but just in case the model stopped too early logits_adjust[:,:,eog_inference] = -10000 ##################### silence repetition handling ##################### for b in range(batch_size): prev_token = prev_tokens[b] consec_silence_count = consec_silence_counts[b] if stop_repetition > 0 and prev_token in silence_tokens and consec_silence_count > stop_repetition: if logits_adjust[b, 0, prev_token] < 0: logits_adjust[b, 0, prev_token] = logits_adjust[b, 0, prev_token] * (consec_silence_count - (stop_repetition-1)) else: logits_adjust[b, 0, prev_token] = logits_adjust[b, 0, prev_token] / (consec_silence_count - (stop_repetition-1)) ##################### silence repetition handling ##################### samples = topk_sampling( logits_adjust.reshape(batch_size * self.args.n_codebooks, logits_adjust.shape[-1]), top_k=top_k, top_p=top_p, temperature=temperature ) # [B*K, 1] samples = samples.reshape(batch_size, self.args.n_codebooks, 1) assert samples.shape == torch.Size((batch_size, self.args.n_codebooks, 1)), f"samples.shape: {samples.shape}" for b in range(batch_size): if cur_num_gen < self.args.n_codebooks-1: for jj in range(1, self.args.n_codebooks - cur_num_gen): samples[b, -jj, 0] = self.args.empty_token if ( samples[b,0,0] == eog_inference or torch.argmax(logits[b,0], dim=-1) == eog_inference or 
y_input.shape[1] > x_lens[b] * (self.args.encodec_sr//5) ): # last one means y is already too long, shouldn't happen, but put it here samples[b,0,0] = eog_inference codebook_eog[0] = True keep = b # NOTE keep is a very important variable, we only return this one, note that if eog shows up in two samples, keep will be overwritten by the later one (or the last one) ##################### silence repetition handling ##################### if samples[b,0,0] in silence_tokens and samples[b,0,0] == prev_tokens[b]: consec_silence_counts[b] += 1 else: consec_silence_counts[b] = 0 prev_tokens[b] = samples[b,0,0] ##################### silence repetition handling ##################### return samples, codebook_eog, prev_tokens, consec_silence_counts, keep else: assert sum(codebook_eog[i] for i in range(n_eog)) == n_eog, f"codebook_eog: {codebook_eog}, but n_eog: {n_eog}" logits_adjust = logits for jj in range(n_eog+1,self.args.n_codebooks): logits_adjust[:,jj,eog_inference] = -10000 logits_adjust[:,jj,self.args.empty_token] = -10000 samples = topk_sampling( logits_adjust.reshape(batch_size * self.args.n_codebooks, logits_adjust.shape[-1]), top_k=top_k, top_p=top_p, temperature=temperature ) # [B, K, 1] samples = samples.reshape(batch_size, self.args.n_codebooks, 1) for jj in range(n_eog): samples[keep, jj, 0] = self.args.empty_token samples[keep, n_eog, 0] = eog_inference codebook_eog[n_eog] = True return samples, codebook_eog, prev_tokens, consec_silence_counts, keep while True: # if cur_num_gen > 0, should have everything in kvcache, so only pass in the last token # in the first generation step, we repeat each tensor to make their first dimension of length the batch size if cur_num_gen == 0: assert x_input.ndim == 3 and x_input.shape[0] == 1, x_input.shape assert x_padding_mask.ndim == 2 and x_padding_mask.shape[0] == 1, x_padding_mask.shape assert y_input.ndim == 3 and y_input.shape[0] == 1 and y_input.shape[1] == new_y_lens[0], y_input.shape assert embedded_y.ndim == 3 and 
embedded_y.shape[0] == 1 and embedded_y.shape[1] == new_y_lens[0], embedded_y.shape x_input = x_input.repeat(batch_size, 1, 1) x_lens = x_lens.repeat(batch_size) # x_attention_mask = x_attention_mask.repeat(batch_size, 1, 1) # no need to work with attention mask, it doesn't contain batch dimension x_padding_mask = x_padding_mask.repeat(batch_size, 1) y_input = y_input.repeat(batch_size, 1, 1) new_y_lens = new_y_lens.repeat(batch_size) # y_attention_mask = y_attention_mask.repeat(batch_size, 1, 1) # no need to work with attention mask, it doesn't contain batch dimension y_padding_mask = y_padding_mask.repeat(batch_size, 1) embedded_y = embedded_y.repeat(batch_size, 1, 1) # will be used to concat with newly generated token embedding past = past.repeat(1, 1, batch_size) if past != None else None else: assert x_input.shape[0] == batch_size and x_padding_mask.shape[0] == batch_size and y_input.shape[0] == batch_size and new_y_lens.shape[0] == batch_size, f"x_input.shape: {x_input.shape}, x_padding_mask.shape: {x_padding_mask.shape}, y_input.shape: {y_input.shape}, new_y_lens.shape: {new_y_lens.shape}" y_out, present = self.dec_forward( x_input, x_lens, x_attention_mask, x_padding_mask, y_input, new_y_lens, y_attention_mask, y_padding_mask, past=past ) if past != None: past = torch.cat([past, present.to(past.dtype)], dim=-2) if past.ndim > 3 else present.to(past.dtype) # if no eog emerges, y_out should have batch size of batch_size if sum(codebook_eog) == 0: assert y_out.shape[0] == batch_size and y_out.ndim == 3, y_out.shape y_out = y_out[:, -1:] # only take the last token logits = torch.stack([self.predict_layer[i](y_out) for i in range(self.args.n_codebooks)], dim=1) # [B K S card], S==1, so [B K 1 card] logits = logits.squeeze(2) # [B K card] assert logits.shape == torch.Size((batch_size, self.args.n_codebooks, self.n_audio_tokens[0])), f"{logits.shape}" n_eog = sum(codebook_eog) if self.args.eos > 0: for jj in range(self.args.n_codebooks): logits[:,jj,self.args.eog] 
= -10000. samples, codebook_eog, prev_tokens, consec_silence_counts, keep = sample_helper(n_eog, logits, codebook_eog, top_k, top_p, temperature, prev_tokens, consec_silence_counts, stop_repetition, silence_tokens, cur_num_gen, keep) cur_num_gen += 1 if sum(codebook_eog) == 0: # no eog yet, keep batch_size of samples assert keep == None for b in range(batch_size): cur_generated[b].append(samples[b].squeeze(-1)) elif sum(codebook_eog) == 1: # the first eog just showed up in this step assert keep != None cur_generated = cur_generated[keep] cur_generated.append(samples[keep].squeeze(-1)) else: # we are generating the rest eogs for the 'keep' sample cur_generated.append(samples[keep].squeeze(-1)) # samples.shape is [K,1] # ge samples_emb samples_emb = torch.stack([self.audio_embedding[k](samples[:, k]) for k in range(self.args.n_codebooks)], dim=1) # [B, K,1,D] assert samples_emb.shape == torch.Size([batch_size, self.args.n_codebooks, 1, self.args.d_model]) samples_emb = samples_emb.sum(dim=1,keepdim=False) # [B,1,D] if sum(codebook_eog) == self.args.n_codebooks: # generation for the current span is done codebook_eog = [False] * self.args.n_codebooks num_gen.append(cur_num_gen) cur_num_gen = 0 generated.append(cur_generated) cur_generated = [[] for _ in range(batch_size)] break else: assert samples_emb.shape == torch.Size((batch_size,1,self.args.d_model)), f"samples_emb.shape: {samples_emb.shape}" embedded_y = torch.cat([embedded_y, samples_emb], dim=1) y_input = self.audio_positional_embedding(embedded_y) # [B T D] # make attention mask and padding mask y_attention_mask = torch.triu(torch.ones(y_input.shape[1], y_input.shape[1]), diagonal=1).bool().to(y.device) new_y_lens = torch.LongTensor([y_input.shape[1]]).to(y.device).repeat(batch_size) y_padding_mask = torch.full((batch_size,new_y_lens[0]), False).to(y.device) assert len(generated) == 1, f"len(generated): {len(generated)}" # revert the pattern flatten_gen = [] for l, orig_span in enumerate(generated): span = 
torch.stack(orig_span, dim=0) # [T, K] span = span.transpose(1,0) # [K, T] assert span.shape[0] == self.args.n_codebooks, span.shape unshifted_span = [] for j, s in enumerate(span): start_from = j end_at = - (self.args.n_codebooks - start_from) unshifted_span.append(s[start_from:end_at]) unshifted_span = torch.stack(unshifted_span, dim=0) assert unshifted_span.shape[1] == num_gen[l] - self.args.n_codebooks, f"len(unshifted_spans[0]): {len(unshifted_span[0])}, num_gen[l]: {num_gen[l]}" flatten_gen.append(unshifted_span) assert len(flatten_gen) == 1, len(flatten_gen) # combine res = [y[0], flatten_gen[0]] res = torch.cat(res, dim=1).unsqueeze(0) # [K, new_t] -> [1, K, new_T] expected_y_len = y_len + sum([item - self.args.n_codebooks for item in num_gen]) assert res.shape == torch.Size((1, self.args.n_codebooks, expected_y_len)), f"res.shape: {res.shape}, expected_y_len: {expected_y_len}. y_len + sum([item - self.args.n_codebooks for item in num_gen]): {y_len} + {sum([item - self.args.n_codebooks for item in num_gen])}" if self.args.special_first: res = res - int(self.args.n_special) flatten_gen = flatten_gen - int(self.args.n_special) return res, flatten_gen[0].unsqueeze(0) ================================================ FILE: predict.py ================================================ # Prediction interface for Cog ⚙️ # https://github.com/replicate/cog/blob/main/docs/python.md import os import time import random import getpass import shutil import subprocess import torch import numpy as np import torchaudio from cog import BasePredictor, Input, Path, BaseModel os.environ["USER"] = getpass.getuser() from data.tokenizer import ( AudioTokenizer, TextTokenizer, ) from models import voicecraft from inference_tts_scale import inference_one_sample from edit_utils import get_span from inference_speech_editing_scale import ( inference_one_sample as inference_one_sample_editing, ) MODEL_URL = "https://weights.replicate.delivery/default/pyp1/VoiceCraft-models.tar" # all the 
class ModelOutput(BaseModel):
    # Transcript of the original audio (filled from the WhisperX result by the predictor).
    whisper_transcript_orig_audio: str
    # Location of the audio file written by the predictor.
    generated_audio: Path


class WhisperxAlignModel:
    """Holds a WhisperX alignment model and applies it to transcribed segments."""

    def __init__(self):
        # whisperx is imported lazily, only when an instance is created.
        from whisperx import load_align_model

        loaded = load_align_model(language_code="en", device="cuda:0")
        self.model, self.metadata = loaded

    def align(self, segments, audio_path):
        """Align `segments` against the audio at `audio_path`.

        Returns the "segments" entry of the result produced by `whisperx.align`.
        """
        from whisperx import align, load_audio

        waveform = load_audio(audio_path)
        aligned = align(
            segments,
            self.model,
            self.metadata,
            waveform,
            device="cuda:0",
            return_char_alignments=False,
        )
        return aligned["segments"]


class WhisperxModel:
    """Pairs a WhisperX transcription model with a shared alignment model."""

    def __init__(self, model_name, align_model: WhisperxAlignModel, device="cuda"):
        from whisperx import load_model

        # the model weights are cached from Systran/faster-whisper-base.en etc
        decoding_options = {
            "suppress_numerals": True,
            "max_new_tokens": None,
            "clip_timestamps": None,
            "hallucination_silence_threshold": None,
        }
        self.model = load_model(model_name, device, asr_options=decoding_options)
        self.align_model = align_model

    def transcribe(self, audio_path):
        """Transcribe the audio file, then return the aligned segments."""
        transcription = self.model.transcribe(
            audio_path, language="en", batch_size=8
        )
        return self.align_model.align(transcription["segments"], audio_path)


def download_weights(url, dest):
    """Fetch `url` into `dest` by running the external `pget` command, printing timing.

    Raises subprocess.CalledProcessError when `pget` exits with a non-zero status.
    """
    began = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    # NOTE(review): "-x" presumably makes pget unpack the downloaded tar archive; confirm.
    command = ["pget", "-x", url, dest]
    subprocess.check_call(command, close_fds=False)
    print("downloading took: ", time.time() - began)
class Predictor(BasePredictor):
    """Cog predictor exposing VoiceCraft speech editing and zero-shot text-to-speech."""

    def setup(self):
        """Load the model into memory to make running multiple predictions efficient"""
        self.device = "cuda"

        # Weights are only downloaded when the local cache directory is missing.
        if not os.path.exists(MODEL_CACHE):
            download_weights(MODEL_URL, MODEL_CACHE)

        encodec_fn = f"{MODEL_CACHE}/encodec_4cb2048_giga.th"
        self.models, self.ckpt, self.phn2num = {}, {}, {}
        for voicecraft_name in [
            "giga830M.pth",
            "giga330M.pth",
            "gigaHalfLibri330M_TTSEnhanced_max16s.pth",
        ]:
            ckpt_fn = f"{MODEL_CACHE}/{voicecraft_name}"

            # Checkpoints are loaded on CPU first; the model is moved to the device afterwards.
            self.ckpt[voicecraft_name] = torch.load(ckpt_fn, map_location="cpu")
            self.models[voicecraft_name] = voicecraft.VoiceCraft(
                self.ckpt[voicecraft_name]["config"]
            )
            self.models[voicecraft_name].load_state_dict(
                self.ckpt[voicecraft_name]["model"]
            )
            self.models[voicecraft_name].to(self.device)
            self.models[voicecraft_name].eval()

            self.phn2num[voicecraft_name] = self.ckpt[voicecraft_name]["phn2num"]

        self.text_tokenizer = TextTokenizer(backend="espeak")
        self.audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=self.device)

        # One alignment model is shared by all three transcription models.
        align_model = WhisperxAlignModel()
        self.transcribe_models = {
            k: WhisperxModel(f"{MODEL_CACHE}/whisperx_{k.split('.')[0]}", align_model)
            for k in ["base.en", "small.en", "medium.en"]
        }

    def predict(
        self,
        task: str = Input(
            description="Choose a task",
            choices=[
                "speech_editing-substitution",
                "speech_editing-insertion",
                "speech_editing-deletion",
                "zero-shot text-to-speech",
            ],
            default="zero-shot text-to-speech",
        ),
        voicecraft_model: str = Input(
            description="Choose a model",
            choices=["giga830M.pth", "giga330M.pth", "giga330M_TTSEnhanced.pth"],
            default="giga330M_TTSEnhanced.pth",
        ),
        orig_audio: Path = Input(description="Original audio file"),
        orig_transcript: str = Input(
            description="Optionally provide the transcript of the input audio. Leave it blank to use the WhisperX model below to generate the transcript. Inaccurate transcription may lead to error TTS or speech editing",
            default="",
        ),
        whisperx_model: str = Input(
            description="If orig_transcript is not provided above, choose a WhisperX model for generating the transcript. Inaccurate transcription may lead to error TTS or speech editing. You can modify the generated transcript and provide it directly to orig_transcript above",
            choices=[
                "base.en",
                "small.en",
                "medium.en",
            ],
            default="base.en",
        ),
        target_transcript: str = Input(
            description="Transcript of the target audio file",
        ),
        cut_off_sec: float = Input(
            description="Only used for for zero-shot text-to-speech task. The first seconds of the original audio that are used for zero-shot text-to-speech. 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec",
            default=3.01,
        ),
        kvcache: int = Input(
            description="Set to 0 to use less VRAM, but with slower inference",
            choices=[0, 1],
            default=1,
        ),
        left_margin: float = Input(
            description="Margin to the left of the editing segment",
            default=0.08,
        ),
        right_margin: float = Input(
            description="Margin to the right of the editing segment",
            default=0.08,
        ),
        temperature: float = Input(
            description="Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic. Do not recommend to change",
            default=1,
        ),
        top_p: float = Input(
            description="Default value for TTS is 0.9, and 0.8 for speech editing",
            default=1,
        ),
        stop_repetition: int = Input(
            default=3,
            description="Default value for TTS is 3, and -1 for speech editing. -1 means do not adjust prob of silence tokens. if there are long silence or unnaturally stretched words, increase sample_batch_size to 2, 3 or even 4",
        ),
        sample_batch_size: int = Input(
            description="Default value for TTS is 4, and 1 for speech editing. The higher the number, the faster the output will be. Under the hood, the model will generate this many samples and choose the shortest one",
            default=4,
        ),
        seed: int = Input(
            description="Random seed. Leave blank to randomize the seed", default=None
        ),
    ) -> ModelOutput:
        """Run a single prediction on the model

        The original audio is always transcribed with the selected WhisperX model,
        because the word timings are needed for both tasks; the transcript text
        itself is only used when `orig_transcript` is blank.

        Returns a ModelOutput with the generated audio path and the WhisperX transcript.

        Raises:
            RuntimeError: for an editing task whose original span starts after it ends.
            AssertionError: for text-to-speech when `cut_off_sec` is not shorter than the audio.
        """

        if seed is None:
            seed = int.from_bytes(os.urandom(2), "big")
        print(f"Using seed: {seed}")

        seed_everything(seed)

        segments = self.transcribe_models[whisperx_model].transcribe(
            str(orig_audio)
        )

        state = get_transcribe_state(segments)

        whisper_transcript = state["transcript"].strip()

        if len(orig_transcript.strip()) == 0:
            orig_transcript = whisper_transcript

        print(f"The transcript from the Whisper model: {whisper_transcript}")

        # The working directory is recreated empty on every call.
        temp_folder = "exp_dir"
        if os.path.exists(temp_folder):
            shutil.rmtree(temp_folder)

        os.makedirs(temp_folder)

        audio_fn = str(orig_audio)

        info = torchaudio.info(audio_fn)
        audio_dur = info.num_frames / info.sample_rate

        # hyperparameters for inference
        codec_audio_sr = 16000
        codec_sr = 50
        top_k = 40
        silence_tokens = [1388, 1898, 131]

        # The public choice name maps onto the checkpoint file name loaded in setup().
        if voicecraft_model == "giga330M_TTSEnhanced.pth":
            voicecraft_model = "gigaHalfLibri330M_TTSEnhanced_max16s.pth"

        if task == "zero-shot text-to-speech":
            assert (
                cut_off_sec < audio_dur
            ), f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
            prompt_end_frame = int(cut_off_sec * info.sample_rate)

            idx = find_closest_cut_off_word(state["word_bounds"], cut_off_sec)
            orig_transcript_until_cutoff_time = " ".join(
                [word_bound["word"] for word_bound in state["word_bounds"][: idx + 1]]
            )
        else:
            edit_type = task.split("-")[-1]
            orig_span, new_span = get_span(
                orig_transcript, target_transcript, edit_type
            )
            if orig_span[0] > orig_span[1]:
                # The exception used to be constructed but never raised, so an
                # invalid span was silently carried into the mask computation.
                raise RuntimeError(f"example {audio_fn} failed")
            if orig_span[0] == orig_span[1]:
                orig_span_save = [orig_span[0]]
            else:
                orig_span_save = orig_span
            if new_span[0] == new_span[1]:
                new_span_save = [new_span[0]]
            else:
                new_span_save = new_span
            orig_span_save = ",".join([str(item) for item in orig_span_save])
            new_span_save = ",".join([str(item) for item in new_span_save])

            start, end = get_mask_interval_from_word_bounds(
                state["word_bounds"], orig_span_save, edit_type
            )

            # span in codec frames
            morphed_span = (
                max(start - left_margin, 1 / codec_sr),
                min(end + right_margin, audio_dur),
            )  # in seconds
            mask_interval = [
                [round(morphed_span[0] * codec_sr), round(morphed_span[1] * codec_sr)]
            ]
            mask_interval = torch.LongTensor(mask_interval)  # [M,2], M==1 for now

        decode_config = {
            "top_k": top_k,
            "top_p": top_p,
            "temperature": temperature,
            "stop_repetition": stop_repetition,
            "kvcache": kvcache,
            "codec_audio_sr": codec_audio_sr,
            "codec_sr": codec_sr,
            "silence_tokens": silence_tokens,
        }

        if task == "zero-shot text-to-speech":
            decode_config["sample_batch_size"] = sample_batch_size
            _, gen_audio = inference_one_sample(
                self.models[voicecraft_model],
                self.ckpt[voicecraft_model]["config"],
                self.phn2num[voicecraft_model],
                self.text_tokenizer,
                self.audio_tokenizer,
                audio_fn,
                orig_transcript_until_cutoff_time.strip()
                + " "
                + target_transcript.strip(),
                self.device,
                decode_config,
                prompt_end_frame,
            )
        else:
            _, gen_audio = inference_one_sample_editing(
                self.models[voicecraft_model],
                self.ckpt[voicecraft_model]["config"],
                self.phn2num[voicecraft_model],
                self.text_tokenizer,
                self.audio_tokenizer,
                audio_fn,
                target_transcript,
                mask_interval,
                self.device,
                decode_config,
            )

        # save segments for comparison
        gen_audio = gen_audio[0].cpu()

        out = "/tmp/out.wav"
        torchaudio.save(out, gen_audio, codec_audio_sr)
        return ModelOutput(
            generated_audio=Path(out), whisper_transcript_orig_audio=whisper_transcript
        )
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch random generators and make cuDNN deterministic."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


def get_transcribe_state(segments):
    """Flatten aligned segments into a transcript string and a list of word timings.

    Args:
        segments: iterable of dicts, each with a "text" string and a "words" list
            whose items carry "word", "start" and "end".

    Returns:
        dict with "transcript" (stripped segment texts joined by single spaces) and
        "word_bounds" (one {"word", "start", "end"} dict per word, in order).
    """
    words_info = [word_info for segment in segments for word_info in segment["words"]]
    return {
        "transcript": " ".join([segment["text"].strip() for segment in segments]),
        "word_bounds": [
            {"word": word["word"], "start": word["start"], "end": word["end"]}
            for word in words_info
        ],
    }


def find_closest_cut_off_word(word_bounds, cut_off_sec):
    """Return the index of the first word that ends after `cut_off_sec`.

    When no word ends after the cut-off, the index of the last word is returned.
    Despite the name, no distance to the cut-off is used: the earlier distance
    bookkeeping never influenced the result and has been removed.

    Args:
        word_bounds: iterable of dicts with an "end" time in seconds.
        cut_off_sec: cut-off time in seconds.

    Raises:
        ValueError: if `word_bounds` is empty (previously an UnboundLocalError).
    """
    index = None
    for index, word_bound in enumerate(word_bounds):
        if word_bound["end"] > cut_off_sec:
            break
    if index is None:
        raise ValueError("word_bounds is empty; cannot locate a cut-off word")
    return index


def get_mask_interval_from_word_bounds(word_bounds, word_span_ind, editType):
    """Convert a word-index span into a (start, end) time interval in seconds.

    Args:
        word_bounds: iterable of dicts with "start" and "end" times.
        word_span_ind: "s,e" or a single "s" (then e == s); indices into word_bounds.
        editType: for "insertion" the interval is the gap between the two words
            (end of word s to start of word e); for any other value it covers
            the words themselves (start of word s to end of word e).

    Returns:
        (start, end) as floats.

    Raises:
        ValueError: if either index does not address a word, or s comes after e.
            (Previously an UnboundLocalError or AssertionError.)
    """
    tmp = word_span_ind.split(",")
    s, e = int(tmp[0]), int(tmp[-1])
    is_insertion = editType == "insertion"
    start = end = None
    for j, item in enumerate(word_bounds):
        if j == s:
            start = float(item["end"] if is_insertion else item["start"])
        if j == e:
            end = float(item["start"] if is_insertion else item["end"])
            break
    if start is None or end is None:
        raise ValueError(
            f"word span {word_span_ind!r} does not select a valid range of word_bounds"
        )
    return (start, end)
#!/usr/bin/env bash
## Assumes you have docker installed with the NVIDIA container toolkit
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.13.5/install-guide.html
# sudo apt-get install -y nvidia-container-toolkit-base || yay -Syu nvidia-container-toolkit || echo etc...

## User name used inside the container. The working directory and the bind
## mount must use the same name that is passed as NB_USER; the host shell
## normally has no NB_USER variable, so it must not be expanded here.
nb_user="${USER:-$(id -un)}"

## Try to start an existing container otherwise create a new one
docker start jupyter 2> /dev/null || \
docker run -it \
    -d \
    --gpus all \
    -p 8888:8888 \
    -p 7860:7860 \
    --name jupyter \
    --user root \
    -e NB_USER="$nb_user" \
    -e CHOWN_HOME=yes \
    -e GRANT_SUDO=yes \
    -w "/home/$nb_user" \
    -v "$PWD":"/home/$nb_user/work" \
    voicecraft

## `docker logs jupyter` to get the URL link and token e.g.
## http://127.0.0.1:8888/lab?token=blahblahblahblabhlaabhalbhalbhal
class BatchedOptimizer(Optimizer):
    """
    This class adds to class Optimizer the capability to optimize parameters in batches:
    it will stack the parameters and their grads for you so the optimizer can work
    on tensors with an extra leading dimension.  This is intended for speed with GPUs,
    as it reduces the number of kernels launched in the optimizer.

    Args:
      params: parameters or param_groups, passed unchanged to Optimizer.
      defaults: dict of default option values, passed unchanged to Optimizer.
    """

    def __init__(self, params, defaults):
        super(BatchedOptimizer, self).__init__(params, defaults)

    @contextlib.contextmanager
    def batched_params(self, param_group, group_params_names):
        """
        This function returns (technically, yields) a list of
        of tuples (p, state, p_names), where
        p is a `fake` parameter that is stacked (over axis 0) from real parameters
        that share the same shape and dtype, and its gradient is also stacked
        (a missing gradient is replaced by zeros);
        `state` is the state corresponding to this batch of parameters
        (it is physically located in the "state" of the first real parameter
        of that shape and dtype); p_names lists the names of the stacked parameters.

        This function is decorated as a context manager so that it can
        write parameters back to their "real" locations.  The write-back only
        happens when the `with` body finishes without raising.

        The idea is, instead of doing:
        <code>
          for p in group["params"]:
             state = self.state[p]
             ...
        </code>
        you can do:
        <code>
          with self.batched_params(group["params"], group_params_names) as batches:
             for p, state, p_names in batches:
                 ...
        </code>

        Args:
          param_group: a list of parameters; should be the "params" entry of
                one of self.param_groups.
          group_params_names: name for each parameter in param_group,
                which is List[str].
        """
        assert len(param_group) == len(group_params_names), f"len(param_group): {len(param_group)}, len(group_params_names): {len(group_params_names)}"

        # Both dicts map from tuple (dtype_as_str, *shape) to a list:
        # `batches` to the nn.Parameter objects, `batches_names` to their names.
        batches = defaultdict(list)
        batches_names = defaultdict(list)
        for p, named_p in zip(param_group, group_params_names):
            key = (str(p.dtype), *p.shape)
            batches[key].append(p)
            batches_names[key].append(named_p)

        # Turn the dicts into parallel lists in a deterministic (sorted-key) order.
        sorted_keys = sorted(batches)
        batches_names = [batches_names[k] for k in sorted_keys]
        batches = [batches[k] for k in sorted_keys]

        # tuples will contain tuples of (stacked_param, state, stacked_params_names),
        # one for each batch in `batches`.
        tuples = []

        for batch, batch_names in zip(batches, batches_names):
            p = batch[0]
            # we arbitrarily store the state in the
            # state corresponding to the 1st parameter in the
            # group.  class Optimizer will take care of saving/loading state.
            state = self.state[p]
            p_stacked = torch.stack(batch)
            grad = torch.stack(
                [
                    torch.zeros_like(p) if p.grad is None else p.grad
                    for p in batch
                ]
            )
            p_stacked.grad = grad
            tuples.append((p_stacked, state, batch_names))

        yield tuples  # <-- calling code will do the actual optimization here!

        # Copy each row of the (possibly updated) stacked tensors back into
        # the real parameter it was built from.
        for ((stacked_params, _state, _names), batch) in zip(tuples, batches):
            for i, p in enumerate(batch):  # batch is list of Parameter
                p.copy_(stacked_params[i])
eps: A general-purpose epsilon to prevent division by zero param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of learning the scale on the parameters (we'll constrain the rms of each non-scalar parameter tensor to be >= this value) param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of learning the scale on the parameters (we'll constrain the rms of each non-scalar parameter tensor to be <= this value) scalar_max: Maximum absolute value for scalar parameters (applicable if your model has any parameters with numel() == 1). size_update_period: The periodicity, in steps, with which we update the size (scale) of the parameter tensor. This is provided to save a little time in the update. clipping_update_period: if clipping_scale is specified, this is the period """ def __init__( self, params, lr=3e-02, clipping_scale=None, betas=(0.9, 0.98), scalar_lr_scale=0.1, eps=1.0e-08, param_min_rms=1.0e-05, param_max_rms=3.0, scalar_max=10.0, size_update_period=4, clipping_update_period=100, parameters_names=None, show_dominant_parameters=True, ): assert parameters_names is not None, ( "Please prepare parameters_names," "which is a List[List[str]]. Each List[str] is for a group" "and each str is for a parameter" ) defaults = dict( lr=lr, clipping_scale=clipping_scale, betas=betas, scalar_lr_scale=scalar_lr_scale, eps=eps, param_min_rms=param_min_rms, param_max_rms=param_max_rms, scalar_max=scalar_max, size_update_period=size_update_period, clipping_update_period=clipping_update_period, ) super(ScaledAdam, self).__init__(params, defaults) assert len(self.param_groups) == len(parameters_names) self.parameters_names = parameters_names self.show_dominant_parameters = show_dominant_parameters def __setstate__(self, state): super(ScaledAdam, self).__setstate__(state) @torch.no_grad() def step(self, closure=None): """Performs a single optimization step. 
Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: with torch.enable_grad(): loss = closure() batch = True for group, group_params_names in zip( self.param_groups, self.parameters_names ): with self.batched_params( group["params"], group_params_names ) as batches: # batches is list of pairs (stacked_param, state). stacked_param is like # a regular parameter, and will have a .grad, but the 1st dim corresponds to # a stacking dim, it is not a real dim. if ( len(batches[0][1]) == 0 ): # if len(first state) == 0: not yet initialized clipping_scale = 1 else: clipping_scale = self._get_clipping_scale(group, batches) for p, state, _ in batches: # Perform optimization step. # grad is not going to be None, we handled that when creating the batches. grad = p.grad if grad.is_sparse: raise RuntimeError( "ScaledAdam optimizer does not support sparse gradients" ) # State initialization if len(state) == 0: self._init_state(group, p, state) self._step_one_batch(group, p, state, clipping_scale) return loss def _init_state(self, group: dict, p: Tensor, state: dict): """ Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p is actually the batch dimension, corresponding to batched-together parameters of a given shape. Args: group: Dict to look up configuration values. p: The parameter that we are initializing the state for state: Dict from string to whatever state we are initializing """ size_update_period = group["size_update_period"] state["step"] = 0 kwargs = {"device": p.device, "dtype": p.dtype} # 'delta' implements conventional momentum. There are # several different kinds of update going on, so rather than # compute "exp_avg" like in Adam, we store and decay a # parameter-change "delta", which combines all forms of # update. this is equivalent to how it's done in Adam, # except for the first few steps. 
state["delta"] = torch.zeros_like( p, memory_format=torch.preserve_format ) batch_size = p.shape[0] numel = p.numel() // batch_size numel = p.numel() if numel > 1: # "param_rms" just periodically records the scalar root-mean-square value of # the parameter tensor. # it has a shape like (batch_size, 1, 1, 1, 1) param_rms = ( (p ** 2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt() ) state["param_rms"] = param_rms state["scale_exp_avg_sq"] = torch.zeros_like(param_rms) state["scale_grads"] = torch.zeros( size_update_period, *param_rms.shape, **kwargs ) # exp_avg_sq is the weighted sum of scaled gradients. as in Adam. state["exp_avg_sq"] = torch.zeros_like( p, memory_format=torch.preserve_format ) def _get_clipping_scale( self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]] ) -> float: """ Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients by this amount before applying the rest of the update. Args: group: the parameter group, an item in self.param_groups tuples: a list of tuples of (param, state, param_names) where param is a batched set of parameters, with a .grad (1st dim is batch dim) and state is the state-dict where optimization parameters are kept. param_names is a List[str] while each str is name for a parameter in batched set of parameters "param". """ assert len(tuples) >= 1 clipping_scale = group["clipping_scale"] (first_p, first_state, _) = tuples[0] step = first_state["step"] if clipping_scale is None or step == 0: # no clipping. return early on step == 0 because the other # parameters' state won't have been initialized yet. 
return 1.0 clipping_update_period = group["clipping_update_period"] tot_sumsq = torch.tensor(0.0, device=first_p.device) for (p, state, param_names) in tuples: grad = p.grad if grad.is_sparse: raise RuntimeError( "ScaledAdam optimizer does not support sparse gradients" ) if p.numel() == p.shape[0]: # a batch of scalars tot_sumsq += ( grad ** 2 ).sum() # sum() to change shape [1] to [] else: tot_sumsq += ((grad * state["param_rms"]) ** 2).sum() tot_norm = tot_sumsq.sqrt() if "model_norms" not in first_state: first_state["model_norms"] = torch.zeros( clipping_update_period, device=p.device ) first_state["model_norms"][step % clipping_update_period] = tot_norm if step % clipping_update_period == 0: # Print some stats. # We don't reach here if step == 0 because we would have returned # above. sorted_norms = first_state["model_norms"].sort()[0].to("cpu") quartiles = [] for n in range(0, 5): index = min( clipping_update_period - 1, (clipping_update_period // 4) * n, ) quartiles.append(sorted_norms[index].item()) median = quartiles[2] threshold = clipping_scale * median first_state["model_norm_threshold"] = threshold percent_clipped = ( first_state["num_clipped"] * 100.0 / clipping_update_period if "num_clipped" in first_state else 0.0 ) first_state["num_clipped"] = 0 quartiles = " ".join(["%.3e" % x for x in quartiles]) logging.info( f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, " f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}" ) if step < clipping_update_period: return 1.0 # We have not yet estimated a norm to clip to. else: try: model_norm_threshold = first_state["model_norm_threshold"] except KeyError: logging.info( "Warning: model_norm_threshold not in state: possibly " "you changed config when restarting, adding clipping_scale option?" 
) return 1.0 ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item()) if ans < 1.0: first_state["num_clipped"] += 1 if ans < 0.1: logging.warn( f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}" ) if self.show_dominant_parameters: assert p.shape[0] == len(param_names) self._show_gradient_dominating_parameter(tuples, tot_sumsq) return ans def _show_gradient_dominating_parameter( self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor ): """ Show information of parameter wihch dominanting tot_sumsq. Args: tuples: a list of tuples of (param, state, param_names) where param is a batched set of parameters, with a .grad (1st dim is batch dim) and state is the state-dict where optimization parameters are kept. param_names is a List[str] while each str is name for a parameter in batched set of parameters "param". tot_sumsq: sumsq of all parameters. Though it's could be calculated from tuples, we still pass it to save some time. """ all_sumsq_orig = {} for (p, state, batch_param_names) in tuples: # p is a stacked batch parameters. batch_grad = p.grad if p.numel() == p.shape[0]: # a batch of scalars batch_sumsq_orig = batch_grad ** 2 # Dummpy values used by following `zip` statement. 
batch_rms_orig = torch.ones(p.shape[0]) else: batch_rms_orig = state["param_rms"] batch_sumsq_orig = ((batch_grad * batch_rms_orig) ** 2).sum( dim=list(range(1, batch_grad.ndim)) ) for name, sumsq_orig, rms, grad in zip( batch_param_names, batch_sumsq_orig, batch_rms_orig, batch_grad ): proportion_orig = sumsq_orig / tot_sumsq all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad) assert torch.isclose( sum([value[0] for value in all_sumsq_orig.values()]).cpu(), torch.tensor(1.0), ) sorted_by_proportion = { k: v for k, v in sorted( all_sumsq_orig.items(), key=lambda item: item[1][0], reverse=True, ) } dominant_param_name = next(iter(sorted_by_proportion)) ( dominant_proportion, dominant_sumsq, dominant_rms, dominant_grad, ) = sorted_by_proportion[dominant_param_name] logging.info( f"Parameter Dominanting tot_sumsq {dominant_param_name}" f" with proportion {dominant_proportion:.2f}," f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)" f"={dominant_sumsq:.3e}," f" grad_sumsq = {(dominant_grad**2).sum():.3e}," f" orig_rms_sq={(dominant_rms**2).item():.3e}" ) def _step_one_batch( self, group: dict, p: Tensor, state: dict, clipping_scale: float ): """ Do the step for one parameter, which is actually going to be a batch of `real` parameters, with dim 0 as the batch dim. 
Args: group: dict to look up configuration values p: parameter to update (actually multiple parameters stacked together as a batch) state: state-dict for p, to look up the optimizer state """ lr = group["lr"] size_update_period = group["size_update_period"] beta1 = group["betas"][0] grad = p.grad if clipping_scale != 1.0: grad = grad * clipping_scale step = state["step"] delta = state["delta"] delta.mul_(beta1) batch_size = p.shape[0] numel = p.numel() // batch_size if numel > 1: # Update the size/scale of p, and set param_rms scale_grads = state["scale_grads"] scale_grads[step % size_update_period] = (p * grad).sum( dim=list(range(1, p.ndim)), keepdim=True ) if step % size_update_period == size_update_period - 1: param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..) param_rms.copy_( (p ** 2) .mean(dim=list(range(1, p.ndim)), keepdim=True) .sqrt() ) if step > 0: # self._size_update() learns the overall scale on the # parameter, by shrinking or expanding it. self._size_update(group, scale_grads, p, state) if numel == 1: # For parameters with 1 element we just use regular Adam. # Updates delta. self._step_scalar(group, p, state) else: self._step(group, p, state) state["step"] = step + 1 def _size_update( self, group: dict, scale_grads: Tensor, p: Tensor, state: dict ) -> None: """ Called only where p.numel() > 1, this updates the scale of the parameter. If we imagine: p = underlying_param * scale.exp(), and we are doing gradient descent on underlying param and on scale, this function does the update on `scale`. Args: group: dict to look up configuration values scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing grads w.r.t. the scales. 
p: The parameter to update state: The state-dict of p """ param_rms = state["param_rms"] beta1, beta2 = group["betas"] size_lr = group["lr"] * group["scalar_lr_scale"] param_min_rms = group["param_min_rms"] param_max_rms = group["param_max_rms"] eps = group["eps"] step = state["step"] batch_size = p.shape[0] size_update_period = scale_grads.shape[0] # correct beta2 for the size update period: we will have # faster decay at this level. beta2_corr = beta2 ** size_update_period scale_exp_avg_sq = state[ "scale_exp_avg_sq" ] # shape: (batch_size, 1, 1, ..) scale_exp_avg_sq.mul_(beta2_corr).add_( (scale_grads ** 2).mean( dim=0 ), # mean over dim `size_update_period` alpha=1 - beta2_corr, ) # shape is (batch_size, 1, 1, ...) # The 1st time we reach here is when size_step == 1. size_step = (step + 1) // size_update_period bias_correction2 = 1 - beta2_corr ** size_step # we don't bother with bias_correction1; this will help prevent divergence # at the start of training. denom = scale_exp_avg_sq.sqrt() + eps scale_step = ( -size_lr * (bias_correction2 ** 0.5) * scale_grads.sum(dim=0) / denom ) is_too_small = param_rms < param_min_rms is_too_large = param_rms > param_max_rms # when the param gets too small, just don't shrink it any further. scale_step.masked_fill_(is_too_small, 0.0) # when it gets too large, stop it from getting any larger. scale_step.masked_fill_(is_too_large, -size_lr * size_update_period) delta = state["delta"] # the factor of (1-beta1) relates to momentum. delta.add_(p * scale_step, alpha=(1 - beta1)) def _step(self, group: dict, p: Tensor, state: dict): """ This function does the core update of self.step(), in the case where the members of the batch have more than 1 element. Args: group: A dict which will be used to look up configuration values p: The parameter to be updated grad: The grad of p state: The state-dict corresponding to parameter p This function modifies p. 
class LRScheduler(object):
    """
    Base-class for learning rate schedulers where the learning-rate depends
    on both the batch and the epoch.

    Subclasses must override :meth:`get_lr`.
    """

    def __init__(self, optimizer: Optimizer, verbose: bool = False):
        """
        Args:
            optimizer: the optimizer whose param-group "lr" values are managed.
            verbose: if True, log every learning-rate change.

        Raises:
            TypeError: if `optimizer` is not an `Optimizer`.
        """
        # Attach optimizer
        if not isinstance(optimizer, Optimizer):
            raise TypeError(
                "{} is not an Optimizer".format(type(optimizer).__name__)
            )
        self.optimizer = optimizer
        self.verbose = verbose

        # Remember the starting lr of each group, unless one is already
        # recorded in the group.
        for group in optimizer.param_groups:
            group.setdefault("base_lr", group["lr"])

        self.base_lrs = [group["base_lr"] for group in optimizer.param_groups]

        self.epoch = 0
        self.batch = 0

        # Make get_last_lr() usable before the first step_batch()/step_epoch()
        # call; until then it reports the optimizer's current learning rates.
        self._last_lr = [group["lr"] for group in optimizer.param_groups]

    def state_dict(self):
        """Returns the state of the scheduler as a :class:`dict`.

        It contains the base learning rates and the current epoch and batch
        counters; the optimizer itself is not included.
        """
        return {
            "base_lrs": self.base_lrs,
            "epoch": self.epoch,
            "batch": self.batch,
        }

    def load_state_dict(self, state_dict):
        """Loads the schedulers state.

        Args:
            state_dict (dict): scheduler state. Should be an object returned
                from a call to :meth:`state_dict`.
        """
        self.__dict__.update(state_dict)

    def get_last_lr(self) -> List[float]:
        """Return last computed learning rate by current scheduler.  Will be a list of float."""
        return self._last_lr

    def get_lr(self):
        # Compute list of learning rates from self.epoch and self.batch and
        # self.base_lrs; this must be overloaded by the user.
        # e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs ]
        raise NotImplementedError

    def step_batch(self, batch: Optional[int] = None) -> None:
        # Step the batch index, or just set it.  If `batch` is specified, it
        # must be the batch index from the start of training, i.e. summed over
        # all epochs.
        # You can call this in any order; if you don't provide 'batch', it should
        # of course be called once per batch.
        if batch is not None:
            self.batch = batch
        else:
            self.batch = self.batch + 1
        self._set_lrs()

    def step_epoch(self, epoch: Optional[int] = None):
        # Step the epoch index, or just set it.  If you provide the 'epoch' arg,
        # you should call this at the start of the epoch; if you don't provide the 'epoch'
        # arg, you should call it at the end of the epoch.
        if epoch is not None:
            self.epoch = epoch
        else:
            self.epoch = self.epoch + 1
        self._set_lrs()

    def _set_lrs(self):
        """Write the values from get_lr() into the optimizer's param groups."""
        values = self.get_lr()
        assert len(values) == len(self.optimizer.param_groups)

        for i, (param_group, lr) in enumerate(
            zip(self.optimizer.param_groups, values)
        ):
            param_group["lr"] = lr
            self.print_lr(self.verbose, i, lr)
        self._last_lr = [group["lr"] for group in self.optimizer.param_groups]

    def print_lr(self, is_verbose, group, lr):
        """Display the current learning rate."""
        if is_verbose:
            logging.info(
                f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate"
                f" of group {group} to {lr:.4e}."
            )
# This is included mostly as a baseline for ScaledAdam.
class Eve(Optimizer):
    """
    Implements Eve algorithm.  This is a modified version of AdamW with a special
    way of setting the weight-decay / shrinkage-factor, which is designed to make the
    rms of the parameters approach a particular target_rms (default: 0.1).  This is
    for use with networks with 'scaled' versions of modules (see scaling.py), which
    will be close to invariant to the absolute scale on the parameter matrix.

    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
    Eve is unpublished so far.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.98))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient, must lie in
            [0, 0.1] (default: 1e-3).  Is not multiplied by learning rate, but
            is conditional on RMS-value of parameter being > target_rms.
        target_rms (float, optional): target root-mean-square value of
            parameters, must lie in (0, 10]; if they fall below this we will
            stop applying weight decay (default: 0.1).

    Raises:
        ValueError: if any hyper-parameter is outside its accepted range.

    .. _Adam: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.98),
        eps=1e-8,
        weight_decay=1e-3,
        target_rms=0.1,
    ):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(
                "Invalid beta parameter at index 0: {}".format(betas[0])
            )
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(
                "Invalid beta parameter at index 1: {}".format(betas[1])
            )
        if not 0 <= weight_decay <= 0.1:
            raise ValueError(
                "Invalid weight_decay value: {}".format(weight_decay)
            )
        if not 0 < target_rms <= 10.0:
            raise ValueError("Invalid target_rms value: {}".format(target_rms))
        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            target_rms=target_rms,
        )
        super(Eve, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Eve, self).__setstate__(state)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by `closure`, or None if no closure was given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                # Perform optimization step
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError(
                        "Eve does not support sparse gradients"
                    )

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]

                beta1, beta2 = group["betas"]

                state["step"] += 1
                bias_correction1 = 1 - beta1 ** state["step"]
                bias_correction2 = 1 - beta2 ** state["step"]

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = (exp_avg_sq.sqrt() * (bias_correction2 ** -0.5)).add_(
                    group["eps"]
                )

                step_size = group["lr"] / bias_correction1
                target_rms = group["target_rms"]
                weight_decay = group["weight_decay"]

                if p.numel() > 1:
                    # avoid applying this weight-decay on "scaling factors"
                    # (which are scalar).
                    # p.norm() > target_rms * sqrt(numel) is the same test as
                    # rms(p) > target_rms.
                    is_above_target_rms = p.norm() > (
                        target_rms * (p.numel() ** 0.5)
                    )
                    p.mul_(1 - (weight_decay * is_above_target_rms))

                p.addcdiv_(exp_avg, denom, value=-step_size)

        return loss
def _test_scaled_adam(hidden_dim: int):
    """
    Smoke test that trains a small 3-layer MLP on random data twice: first
    with ScaledLinear layers and the ScaledAdam optimizer (iter == 1), then
    with plain torch.nn.Linear layers and the Eve optimizer (iter == 0),
    logging the running loss and the learning rate along the way.

    Nothing is asserted or returned; all results go to the log.

    Args:
        hidden_dim: width of the two hidden layers of the MLP.
    """
    import timeit

    E = 100  # feature dimension of both the inputs and the targets
    B = 4  # first batch dimension of every training pair
    T = 2  # second batch dimension of every training pair
    logging.info("in test_eve_cain")
    # device = torch.device('cuda')
    device = torch.device("cpu")
    dtype = torch.float32

    # These input_magnitudes and output_magnitudes give every feature
    # dimension its own random scale, to test that the optimizer is working
    # as we expect and is able to adjust scales of different dims differently.
    input_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
    output_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()

    # iter == 1: ScaledLinear + ScaledAdam;  iter == 0: nn.Linear + Eve.
    for iter in [1, 0]:
        Linear = torch.nn.Linear if iter == 0 else ScaledLinear

        m = torch.nn.Sequential(
            Linear(E, hidden_dim),
            torch.nn.PReLU(),
            Linear(hidden_dim, hidden_dim),
            torch.nn.PReLU(),
            Linear(hidden_dim, E),
        ).to(device)

        # 20 fixed (input, target) pairs that are re-used in every epoch.
        train_pairs = [
            (
                100.0
                * torch.randn(B, T, E, device=device, dtype=dtype)
                * input_magnitudes,
                torch.randn(B, T, E, device=device, dtype=dtype)
                * output_magnitudes,
            )
            for _ in range(20)
        ]

        if iter == 0:
            optim = Eve(m.parameters(), lr=0.003)
        elif iter == 1:
            optim = ScaledAdam(m.parameters(), lr=0.03, clipping_scale=2.0)
        scheduler = Eden(optim, lr_batches=200, lr_epochs=5, verbose=False)

        start = timeit.default_timer()
        avg_loss = 0.0
        for epoch in range(180):
            scheduler.step_epoch()
            # if epoch == 100 and iter in [2,3]:
            #    optim.reset_speedup()  # check it doesn't crash.

            # if epoch == 130:
            #    opts = diagnostics.TensorDiagnosticOptions(
            #        2 ** 22
            #    )  # allow 4 megabytes per sub-module
            #    diagnostic = diagnostics.attach_diagnostics(m, opts)

            for n, (x, y) in enumerate(train_pairs):
                y_out = m(x)
                loss = ((y_out - y) ** 2).mean() * 100.0
                # Exponential moving average of the loss, seeded with the
                # very first value.
                if epoch == 0 and n == 0:
                    avg_loss = loss.item()
                else:
                    avg_loss = 0.98 * avg_loss + 0.02 * loss.item()
                if n == 0 and epoch % 5 == 0:
                    # norm1 = '%.2e' % (m[0].weight**2).mean().sqrt().item()
                    # norm1b = '%.2e' % (m[0].bias**2).mean().sqrt().item()
                    # norm2 = '%.2e' % (m[2].weight**2).mean().sqrt().item()
                    # norm2b = '%.2e' % (m[2].bias**2).mean().sqrt().item()
                    # scale1 = '%.2e' % (m[0].weight_scale.exp().item())
                    # scale1b = '%.2e' % (m[0].bias_scale.exp().item())
                    # scale2 = '%.2e' % (m[2].weight_scale.exp().item())
                    # scale2b = '%.2e' % (m[2].bias_scale.exp().item())
                    lr = scheduler.get_last_lr()[0]
                    logging.info(
                        f"Iter {iter}, epoch {epoch}, batch {n}, avg_loss {avg_loss:.4g}, lr={lr:.4e}"
                    )  # , norms={norm1,norm1b,norm2,norm2b}") # scales={scale1,scale1b,scale2,scale2b}
                # Note that the gradient is taken of log(loss), not of the
                # loss itself.
                loss.log().backward()
                optim.step()
                optim.zero_grad()
                scheduler.step_batch()

        # diagnostic.print_diagnostics()

        stop = timeit.default_timer()
        logging.info(f"Iter={iter}, Time taken: {stop - start}")

        logging.info(f"last lr = {scheduler.get_last_lr()}")
        # logging.info("state dict = ", scheduler.state_dict())
        # logging.info("optim state_dict = ", optim.state_dict())
        logging.info(f"input_magnitudes = {input_magnitudes}")
        logging.info(f"output_magnitudes = {output_magnitudes}")
def __init__(self, args, world_size, rank):
    """
    Set up one training process: meters, progress bookkeeping, model, data
    loaders, optimizer/scheduler, gradient scaler and the
    DistributedDataParallel wrapper.

    Args:
        args: run configuration; the attributes read here include exp_dir,
            seed, num_steps, num_epochs, batch_size, dynamic_batching,
            max_num_tokens and val_max_num_tokens.
        world_size: total number of processes taking part in training.
        rank: index of this process; also used as the CUDA device index and
            as the DDP device id.
    """
    self.start_time = time.time()
    self.args = args
    self.world_size, self.rank = world_size, rank
    self.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    # Only rank 0 owns a TensorBoard writer.
    if self.rank == 0:
        self.writer = SummaryWriter(args.exp_dir)
    self.seed_everything(seed=self.args.seed)
    self.meters = self._setup_meters()
    self.progress, self.total_progress = self._setup_progress()

    self.model, self.trainables, self.optim_states, self.scheduler_states = self._setup_models()

    self.train_dataset_length, self.train_sampler, self.train_loader, self.valid_loader = self._setup_dataloader()
    if self.args.num_steps != None:
        # A step budget was given: derive the epoch count from it.  With
        # dynamic batching the number of epochs is left as None.
        self.total_step = self.args.num_steps
        self.args.num_epochs = math.ceil(self.total_step / math.floor(self.train_dataset_length / self.args.batch_size)) if not self.args.dynamic_batching else None
    else:
        # Only an epoch budget was given: derive the total number of steps.
        self.total_step = int(math.floor(self.train_dataset_length / self.args.batch_size))*self.args.num_epochs

    self.optimizer, self.scheduler = self._setup_optimizer()
    self.scaler = torch.cuda.amp.GradScaler()
    # From here on self.model is the DDP wrapper; the underlying module is
    # reachable as self.model.module.
    self.model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[self.rank], find_unused_parameters=False)

    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source; the logging is assumed to be rank-0 only
    # -- confirm against the original file.
    if self.rank == 0:
        self.early_stop_accu_steps = 0
        if self.args.dynamic_batching:
            logging.info(f"max number of tokens per GPU in a training batch: {self.args.max_num_tokens}, max number of tokens per GPU in a inference batch: {self.args.val_max_num_tokens}")
        else:
            logging.info(f"batch size (summed over all GPUs): {self.args.batch_size}")
{self.progress['step']} is nan, therefore skip this batch") skip_flag = True continue sum_losses += record_loss.item() sum_top10acc += top10acc.item() sum_ntoken += effective_ntoken.item() if 'top10acc_by_codebook' in out: for cb in range(self.args.n_codebooks): top10acc_cbi = out['top10acc_by_codebook'][cb] dist.all_reduce(top10acc_cbi, op=dist.ReduceOp.SUM) sum_top10acc_cbi[cb] += top10acc_cbi.item() if self.rank == 0: average_loss = sum_losses / sum_ntoken average_top10acc = sum_top10acc / sum_ntoken self.meters['train_loss'].update(average_loss, batch['x'].shape[0]*self.world_size) self.meters['train_top10acc'].update(average_top10acc, batch['x'].shape[0]*self.world_size) self.meters['train_top10acc'].update(average_top10acc, batch['x'].shape[0]*self.world_size) average_top10acc_cbi = [sum_top10acc_cbi[cb] / sum_ntoken * self.args.n_codebooks for cb in range(self.args.n_codebooks)] for cb in range(self.args.n_codebooks): self.meters[f'train_top10acc_cb{cb+1}'].update(average_top10acc_cbi[cb], batch['x'].shape[0]*self.world_size) if self.progress['step'] % self.args.tb_write_every_n_steps == 0: self.writer.add_scalar('train/loss', average_loss, self.progress['step']) self.writer.add_scalar('train/top10acc', average_top10acc, self.progress['step']) self.writer.add_scalar("train/ntokens", sum_ntoken, self.progress['step']) for cb in range(self.args.n_codebooks): self.writer.add_scalar(f'train/top10acc_cb{cb+1}', average_top10acc_cbi[cb], self.progress['step']) if self.args.optimizer_name == "ScaledAdam": self.scaler.scale(out['loss']).backward() else: self.scaler.scale(out['loss']/out['effective_ntoken']).backward() if skip_flag: self.optimizer.zero_grad() skip_flag = False continue if self.args.optimizer_name != "ScaledAdam": self.scaler.unscale_(self.optimizer) torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.gradient_clip_val) self.scaler.step(self.optimizer) self.scaler.update() self.optimizer.zero_grad() if self.args.optimizer_name == 
def validate_and_save(self):
    """
    Run validation, then (on rank 0) apply the early-stopping rule, save
    "best_bundle.pth" when the validation score improved, record progress
    and always save "bundle.pth".  All ranks meet at a barrier at the end.

    Raises:
        RuntimeError: ("early stop") on rank 0 when the accumulated number
            of steps without sufficient improvement reaches the limit.
    """
    self.model.eval()

    score = self.validate(self.valid_loader)

    # NOTE(review): the nesting in this method was reconstructed from a
    # whitespace-collapsed source -- confirm against the original file.
    if self.rank == 0:
        if self.args.early_stop_threshold > 0:
            # An improvement smaller than the threshold counts towards
            # early stopping; a sufficient improvement resets the counter.
            if self.progress['best_score'] - score < self.args.early_stop_threshold:
                self.early_stop_accu_steps += self.args.val_every_n_steps
                if self.early_stop_accu_steps >= self.args.early_stop_step-1:
                    logging.info(f"early stop based on self.args.early_stop_threshold: {self.args.early_stop_threshold}, and self.args.early_stop_step: {self.args.early_stop_step}")
                    logging.info(f"best validation score at step: {self.progress['best_step']}, and the score is {self.progress['best_score']:.4f}")
                    # NOTE(review): only rank 0 reaches this raise, so it
                    # never arrives at the barrier at the end of the method
                    # -- confirm how the other ranks are expected to stop.
                    dist.destroy_process_group()
                    raise RuntimeError("early stop")
            else:
                self.early_stop_accu_steps = 0

        # A lower score is better; keep a separate copy of the best model.
        if (score < self.progress['best_score']):
            self.progress['best_step'] = self.progress['step']
            self.progress['best_score'] = score
            save_path = os.path.join(self.args.exp_dir,"best_bundle.pth")
            torch.save(
                {
                    "model": self.model.module.state_dict(),
                    "optimizer": self.optimizer.state_dict(),
                    "scheduler": self.scheduler.state_dict(),
                    "config": self.args,
                    "phn2num": self.train_loader.dataset.phn2num
                },save_path
            )
            logging.info(f"save *best* models at {save_path} at global step {self.progress['step']}")
        self._save_progress()
        # The latest bundle is written regardless of the score.
        save_path = os.path.join(self.args.exp_dir,"bundle.pth")
        torch.save(
            {
                "model": self.model.module.state_dict(),
                "optimizer": self.optimizer.state_dict(),
                "scheduler": self.scheduler.state_dict(),
                "config": self.args,
                "phn2num": self.train_loader.dataset.phn2num
            },save_path
        )
        logging.info(f"save models, indices, acc and other statistics at {save_path} and {self.args.exp_dir}/progress.pkl at global step {self.progress['step']}")

    dist.barrier()
op=dist.ReduceOp.SUM) dist.all_reduce(sum_top10acc, op=dist.ReduceOp.SUM) dist.all_reduce(sum_ntoken, op=dist.ReduceOp.SUM) if 'top10acc_by_codebook' in out: for cb in range(self.args.n_codebooks): dist.all_reduce(sum_top10acc_cbi[cb], op=dist.ReduceOp.SUM) if self.rank == 0: val_loss = sum_losses / sum_ntoken val_top10acc = sum_top10acc / sum_ntoken # logging self.meters['val_loss'].update(val_loss) logging.info(f"val loss: {val_loss:.5f}") self.writer.add_scalar("val/loss", val_loss, self.progress['step']) self.meters['val_top10acc'].update(val_top10acc) logging.info(f"val top10acc: {val_top10acc:.5f}") self.writer.add_scalar("val/top10acc", val_top10acc, self.progress['step']) for cb in range(self.args.n_codebooks): average_top10acc_cbi = sum_top10acc_cbi[cb] / sum_ntoken * self.args.n_codebooks self.meters[f'val_top10acc_cb{cb+1}'].update(average_top10acc_cbi) self.writer.add_scalar(f'val/top10acc_cb{cb+1}', average_top10acc_cbi, self.progress['step']) logging.info(f"validation takes: {time.time() - start_val_time:.2f}s") logging.info(f"Step [{self.progress['step']}/{self.total_step}]\t Time elapsed {(time.time() - self.start_time)/3600.:.2f}h, Val Loss: {val_loss:.4f}, Val Top10Acc: {val_top10acc:.4f}") return val_loss.item() else: return None def _setup_meters(self): meters = {} meter_names = ['train_loss', 'val_loss', 'train_top10acc', 'val_top10acc', 'data_time', 'train_time'] meter_names += ['train_dur_loss', 'train_dur_acc', 'val_dur_loss', 'val_dur_acc'] meter_names += [f'train_top10acc_cb{cb+1}' for cb in range(self.args.n_codebooks)] meter_names += [f'val_top10acc_cb{cb+1}' for cb in range(self.args.n_codebooks)] for name in meter_names: meters[name] = AverageMeter() return meters def _setup_progress(self): progress = {} progress['best_step'] = 1 progress['best_score'] = np.inf # this records loss value progress['step'] = 1 progress['epoch'] = 1 progress['cur_step'] = 0 # step in the current epoch, for resuming the sampler total_progress = [] # if 
self.args.resume or self.args.validate: if self.args.resume: progress_pkl = "%s/progress.pkl" % self.args.exp_dir with open(progress_pkl, "rb") as f: total_progress = pickle.load(f) progress['best_step'], progress['best_score'], progress['step'], progress['epoch'], progress['cur_step'], _ = total_progress[-1] if self.rank == 0: logging.info("\nResume training from:") logging.info(" epoch = %s" % progress['epoch']) logging.info(" cur_step = %s" % progress['cur_step']) logging.info(" step = %s" % progress['step']) logging.info(" best_step = %s" % progress['best_step']) logging.info(" best_score = %s" % progress['best_score']) return progress, total_progress def _save_progress(self): self.total_progress.append([self.progress['best_step'], self.progress['best_score'], int(self.progress['step']+1), self.progress['epoch'], int(self.progress['cur_step']+1), time.time() - self.start_time]) with open("%s/progress.pkl" % self.args.exp_dir, "wb") as f: pickle.dump(self.total_progress, f) def _setup_dataloader(self): assert self.args.dataset == 'gigaspeech', "only gigaspeech is supported for now" train_dataset, val_dataset = gigaspeech.dataset(self.args, 'train'), gigaspeech.dataset(self.args, 'validation') if self.args.dynamic_batching: train_sampler = DistributedDynamicBatchSampler(train_dataset, self.args, num_replicas=self.world_size, rank=self.rank, shuffle=True, seed=self.args.seed, drop_last=True, lengths_list=train_dataset.lengths_list, verbose=True, epoch=0) valid_sampler = DistributedDynamicBatchSampler(val_dataset, self.args, num_replicas=self.world_size, rank=self.rank, shuffle=True, seed=self.args.seed, drop_last=True, lengths_list=val_dataset.lengths_list, verbose=True, epoch=0) else: train_sampler = StatefulDistributedSampler(train_dataset, self.args.batch_size//self.world_size, num_replicas=self.world_size, rank=self.rank, shuffle=True, seed=self.args.seed, drop_last=True) valid_sampler = DistributedSampler(val_dataset, num_replicas=self.world_size, 
def _setup_models(self):
    """Build the model, restore weights if needed, and pick the trainable parameters.

    Weights are restored from ``<exp_dir>/bundle.pth`` when resuming
    (``self.progress['step'] > 1``); otherwise, if ``args.load_model_from`` is
    set, from that checkpoint's ``'model'`` entry.

    Returns:
        A tuple ``(model, trainables, optim_states, scheduler_states)``:

        * ``model`` - the ``voicecraft.VoiceCraft`` instance, moved to
          ``self.device``.
        * ``trainables`` - for ``ScaledAdam`` a flat list of parameters with
          ``requires_grad``; otherwise a list of parameter-group dicts with
          per-group ``weight_decay``.
        * ``optim_states`` / ``scheduler_states`` - the saved optimizer and
          scheduler state when resuming, else ``None``.
    """
    model = voicecraft.VoiceCraft(self.args)

    if self.rank == 0:
        logging.info(model)
        logging.info("model parameters")
        print_model_info(model)

    if self.progress['step'] > 1:
        # Resuming: everything comes from the latest bundle of this experiment.
        bundle = torch.load(os.path.join(self.args.exp_dir, "bundle.pth"), map_location="cpu")
        model.load_state_dict(bundle['model'])
        optim_states = bundle['optimizer']
        scheduler_states = bundle['scheduler']
        if self.rank == 0:
            logging.info("loaded parameters and data indices from epoch %d, global step %d" % (self.progress['epoch'], self.progress['step']))
        # Drop the reference to the CPU copy of the weights.
        del bundle['model']
    else:
        optim_states = None
        scheduler_states = None

    if self.args.load_model_from is not None and self.progress['step'] <= 1:
        # Fresh run initialised from an external checkpoint (weights only).
        sd = torch.load(self.args.load_model_from, map_location="cpu")['model']
        model.load_state_dict(sd)
        del sd

    if self.args.optimizer_name == "ScaledAdam":
        trainables = [p for p in model.parameters() if p.requires_grad]
    else:
        # Parameters whose name contains one of these fragments get no weight decay.
        no_decay = [".bias", ".audio_embeddings.weight", ".text_embeddings.weight", ".norm.weight", ".norm1.weight", ".norm2.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad],
                "weight_decay": self.args.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad],
                "weight_decay": 0.0,
            },
        ]
        if len(optimizer_grouped_parameters[1]['params']) == 0:
            logging.info("there is no embedding weights, bias, and layernorm parameters in the model, which should be True, check model parameter names")
            # Keep the list-of-groups shape: a bare dict would be iterated by
            # the optimizer as its string keys and rejected.
            trainables = [optimizer_grouped_parameters[0]]
        else:
            trainables = optimizer_grouped_parameters
    model.to(self.device)

    return model, trainables, optim_states, scheduler_states
class StatefulDistributedSampler(Sampler[int]):
    """Per-replica index sampler that can resume in the middle of an epoch.

    The dataset indices are (optionally) shuffled with a generator seeded by
    ``seed + epoch``, padded or truncated to a multiple of ``num_replicas``,
    and every replica keeps the slice ``indices[rank::num_replicas]``.

    To resume, call :meth:`set_epoch_resume` and then :meth:`set_epoch`; the
    first ``cur_step * batch_size`` indices of that epoch are skipped. Later
    epochs are served in full again.

    Args:
        dataset: sized dataset to sample from (only ``len(dataset)`` is used).
        batch_size: per-replica batch size, used to convert a resumed step
            count into a number of indices to skip.
        num_replicas: number of participating processes; defaults to
            ``dist.get_world_size()``.
        rank: rank of this process; defaults to ``dist.get_rank()``.
        shuffle: shuffle indices deterministically per epoch when ``True``.
        seed: base seed for the shuffle.
        drop_last: drop the tail instead of padding when the dataset size is
            not divisible by ``num_replicas``.

    Raises:
        RuntimeError: if ``num_replicas`` or ``rank`` must be inferred and the
            distributed package is unavailable.
        ValueError: if ``rank`` is outside ``[0, num_replicas - 1]``.
    """

    def __init__(self, dataset, batch_size, num_replicas = None, rank = None, shuffle = True, seed = 0, drop_last = False):
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()
        if rank >= num_replicas or rank < 0:
            raise ValueError(
                "Invalid rank {}, rank should be in the interval"
                " [0, {}]".format(rank, num_replicas - 1))
        self.dataset = dataset
        self.batch_size = batch_size
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        self.cur_epoch = 0
        self.cur_step = 0
        self.drop_last = drop_last
        # If the dataset length is evenly divisible by # of replicas, then there
        # is no need to drop any data, since the dataset will be split equally.
        if self.drop_last and len(self.dataset) % self.num_replicas != 0:  # type: ignore[arg-type]
            # Split to nearest available length that is evenly divisible.
            # This is to ensure each rank receives the same amount of data when
            # using this Sampler.
            self.num_samples = math.ceil(
                (len(self.dataset) - self.num_replicas) / self.num_replicas  # type: ignore[arg-type]
            )
        else:
            self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)  # type: ignore[arg-type]
        # Length of a complete epoch on this replica. ``num_samples`` is
        # shortened while a resumed epoch is served, so the full value is kept
        # separately and restored at the start of every ``set_epoch``.
        self._full_num_samples = self.num_samples
        self.total_size = self.num_samples * self.num_replicas
        self.shuffle = shuffle
        self.seed = seed
        self.continue_flag = False
        # Built by ``set_epoch``; ``None`` until then.
        self.indices = None

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        r"""
        Sets the epoch for this sampler and rebuilds this replica's indices.

        When :attr:`shuffle=True`, this ensures all replicas use a different
        random ordering for each epoch. Otherwise, the next iteration of this
        sampler will yield the same ordering.

        Args:
            epoch (int): Epoch number.
        """
        self.epoch = epoch
        # Undo any truncation left over from a previously resumed epoch.
        self.num_samples = self._full_num_samples

        if self.shuffle:
            # deterministically shuffle based on epoch and seed
            g = torch.Generator()
            g.manual_seed(self.seed + self.epoch)
            indices = torch.randperm(len(self.dataset), generator=g).tolist()  # type: ignore[arg-type]
        else:
            indices = list(range(len(self.dataset)))  # type: ignore[arg-type]

        if not self.drop_last:
            # add extra samples to make it evenly divisible
            padding_size = self.total_size - len(indices)
            if padding_size <= len(indices):
                indices += indices[:padding_size]
            else:
                indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]
        else:
            # remove tail of data to make it evenly divisible.
            indices = indices[:self.total_size]
        assert len(indices) == self.total_size

        # subsample
        indices = indices[self.rank:self.total_size:self.num_replicas]
        assert len(indices) == self.num_samples
        self.indices = indices

        if self.continue_flag:
            # Skip what was already consumed before the checkpoint.
            self.indices = self.indices[int(self.cur_step*self.batch_size):]
            self.num_samples = len(self.indices)
            self.continue_flag = False

    def __iter__(self):
        if self.indices is None:
            # Nobody called set_epoch yet: build the ordering for the current epoch.
            self.set_epoch(self.epoch)
        for idx in self.indices:
            yield idx

    def set_epoch_resume(self, epoch, cur_step):
        """Arm a mid-epoch resume; takes effect on the next ``set_epoch`` call.

        Args:
            epoch: epoch being resumed.
            cur_step: number of batches already consumed in that epoch.
        """
        self.epoch = epoch
        self.cur_step = cur_step
        self.continue_flag = True
class DistributedDynamicBatchSampler(Sampler):
    """
    Length-bucketed batch sampler that shards its batches across replicas.

    Modified from SpeechBrain's ``DynamicBatchSampler``,
    https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/dataio/sampler.py#L307

    This BatchSampler batches examples together by grouping them by their length.
    Every example in a batch has approximately the same length and thus padding is minimized.
    This enables faster training on datasets
    where the length of examples can vary significantly (e.g. Librispeech).
    Inspired by: https://www.tensorflow.org/api_docs/python/tf/data/experimental/bucket_by_sequence_length

    Dynamic batching is performed by specifying a maximum batch length, which is the
    upper limit for the sum of the lengths of the examples in a batch:
    e.g., if ex1 has length 4, ex2 length 5 and the maximum batch length is 6,
    ex1 and ex2 will be placed, alone, in two distinct batches.
    In this version the limit is not a constructor argument: it is read from
    ``args.max_num_tokens`` when ``dataset.split == "train"`` and from
    ``args.val_max_num_tokens`` otherwise.

    The length of each example must be supplied through ``lengths_list``
    (the upstream ``length_func`` path is disabled here). Every length is clipped
    to ``int(args.audio_max_length * args.encodec_sr)`` before bucketing.

    Examples are grouped together by defining a set of possible discrete intervals
    (buckets). Examples whose lengths fall into the same interval can be batched together.

    The number of buckets is read from ``args.num_buckets``.
    There is usually an optimal range for this value.

    If num_buckets == 1, all examples can be batched together. You have maximum randomization
    but your training speed will be slower because a large amount of the values will be padding,
    as long and short examples can be batched together.
    As the number of buckets grows, only examples with similar
    length can be grouped together.
    This trades off speed with randomization.
    TLDR: Low number -> better randomization, High number -> faster training.
    NOTE THAT: if set too high the training speed will decrease. If num_buckets -> number of examples in the dataset, the batch size
    will be small, impacting training speed and possibly performance.

    The buckets can also be specified by passing a list to the bucket_boundaries
    argument; otherwise they are derived from a lognormal warping scaled to the
    longest (clipped) example.

    After the batches are built, each replica keeps
    ``batches[rank:total_size:num_replicas]`` where ``total_size`` is the largest
    multiple of ``num_replicas`` not exceeding the number of batches, so every
    replica yields the same number of batches.

    Example
    -------
    NOTE(review): this example is inherited from the upstream SpeechBrain class and
    uses the upstream constructor signature, which differs from this class's
    ``(dataset, args, ...)`` signature. It is kept for reference only.

    >>> import torch
    >>> import speechbrain as sb
    >>> from speechbrain.dataio.sampler import DynamicBatchSampler
    >>> from speechbrain.dataio.dataset import DynamicItemDataset
    >>> from speechbrain.dataio.dataloader import SaveableDataLoader
    >>> from speechbrain.dataio.batch import PaddedBatch
    >>> import numpy as np
    >>> item_lengths = sorted([np.random.randint(10, 100) for x in range(20)])
    >>> dataset = {"ex_{}".format(x) : {"wav" :torch.randn(x)} for x in item_lengths}
    >>> dataset = DynamicItemDataset(dataset)
    >>> dataset.set_output_keys(["wav"])
    >>> length_func = lambda x : len(x) # trivial in this example
    >>> bsampler = DynamicBatchSampler(dataset, 20, 4, length_func, shuffle=False, batch_ordering='descending')
    >>> dataloader = SaveableDataLoader(dataset, batch_sampler=bsampler, collate_fn=PaddedBatch)
    >>> for i, b in enumerate(dataloader):
    ...     data, length = b["wav"]
    >>> assert data.shape[-1] == max(item_lengths)

    Arguments
    ---------
    dataset : torch.utils.data.Dataset
        Dataset from which elements will be sampled. Must expose ``split`` and
        ``__len__``.
    args : namespace-like
        Configuration object. The attributes read here are ``max_num_tokens``,
        ``val_max_num_tokens``, ``num_buckets``, ``audio_max_length`` and
        ``encodec_sr``.
    num_replicas : int
        Number of processes sharing the batches. Defaults to
        ``dist.get_world_size()``.
    rank : int
        Rank of this process. Defaults to ``dist.get_rank()``.
    shuffle : bool
        Whether or not to shuffle examples (with a generator seeded by
        ``seed + epoch``) before they are assigned to buckets.
    seed : int
        Base seed for example shuffling and for random batch ordering.
    drop_last : bool
        If ``True``, the sampler will drop the last examples which
        have not been grouped into a full batch.
    length_func : callable
        Accepted for signature compatibility with upstream; not used by this
        implementation.
    batch_ordering : string
        If ``random``, batches are randomly permuted; otherwise ``ascending`` or ``descending`` sorted by
        the longest example in each batch.
    max_batch_ex : int
        If set, it limits the maximum number of examples that can be in a batch, superseding the maximum
        batch length in instances where the number of examples would exceed the value specified here.
        E.g. you have a lot of short examples and the batch size for those would be too high; you can use
        this argument to limit the batch size for these short examples.
    bucket_boundaries : list
        Manually specified right boundaries of the buckets (ascending, unique,
        non-negative). When non-empty, ``args.num_buckets`` is not used to
        derive boundaries.
    lengths_list : list
        Length of each example in the dataset, indexed like the dataset.
        Required (asserted to be non-``None``).
    epoch : int
        The epoch to start at.
    verbose : bool
        If ``True``, also log the stats for each batch at the first epoch
        (``epoch == 0``).
    """

    def __init__(
        self,
        dataset,
        args,
        num_replicas = None,
        rank = None,
        shuffle = True,
        seed = 0,
        drop_last = False,
        length_func=lambda x: x["duration"],
        batch_ordering: str = "random",
        max_batch_ex: int = None,
        bucket_boundaries: List[int] = [],
        lengths_list: List[int] = None,
        epoch: int = 0,
        verbose: bool = False,
    ):
        self.args = args
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()
        if rank >= num_replicas or rank < 0:
            raise ValueError(
                "Invalid rank {}, rank should be in the interval"
                " [0, {}]".format(rank, num_replicas - 1))
        self.num_replicas = num_replicas
        self.rank = rank
        # The token budget per batch depends on which split is being sampled.
        max_batch_length = self.args.max_num_tokens if dataset.split == "train" else self.args.val_max_num_tokens
        logging.info(f"max_num_tokens per GPU for {dataset.split} split: {max_batch_length}")
        num_buckets = self.args.num_buckets
        #############

        self._dataset = dataset
        # Maps str(dataset index) -> clipped length of that example.
        self._ex_lengths = {}
        # ex_ids = self._dataset.data_ids
        self.verbose = verbose

        # We do not put a default on num_buckets to encourage users to play with this parameter
        if num_buckets is None and len(bucket_boundaries) == 0:
            raise RuntimeError(
                "Please specify either num_buckets or bucket boundaries."
                "Check the docs, and/or the tutorial !"
            )
        assert lengths_list != None
        max_len = int(self.args.audio_max_length * self.args.encodec_sr)
        lengths_list = [min(l, max_len) for l in lengths_list] # replace all utt whose length is longer than max_len to max_len, will also do this in __getitem__ in dataset
        for indx in range(len(lengths_list)):
            self._ex_lengths[str(indx)] = lengths_list[indx]
        # Upstream alternative (disabled): derive lengths with length_func when
        # lengths_list is not given.
        # if lengths_list is not None:
        #     # take length of examples from this argument and bypass length_key
        #     for indx in range(len(lengths_list)):
        #         self._ex_lengths[str(indx)] = lengths_list[indx]
        # else:
        #     # use length func
        #     if not isinstance(dataset, DynamicItemDataset):
        #         raise NotImplementedError(
        #             "Dataset should be a Speechbrain DynamicItemDataset when using length function"
        #         )
        #     for indx in range(len(self._dataset)):
        #         self._ex_lengths[str(indx)] = length_func(
        #             self._dataset.data[ex_ids[indx]]
        #         )

        if len(bucket_boundaries) > 0:
            # Validate user-supplied boundaries: non-negative, unique, ascending.
            if not all([x >= 0 for x in bucket_boundaries]):
                raise ValueError(
                    "All elements in bucket boundaries should be non-negative (>= 0)."
                )
            if not len(set(bucket_boundaries)) == len(bucket_boundaries):
                raise ValueError(
                    "Bucket_boundaries should not contain duplicates."
                )
            np.testing.assert_array_equal(
                np.array(bucket_boundaries),
                np.array(sorted(bucket_boundaries)),
                err_msg="The arg bucket_boundaries should be an ascending sorted list of non negative values values!",
            )
            self._bucket_boundaries = np.array(sorted(bucket_boundaries))
        else:
            # use num_buckets
            # Boundaries are scaled to the longest clipped example rather than
            # to the token budget (the upstream variant is left commented out).
            self._bucket_boundaries = np.array(
                self._get_boundaries_through_warping(
                    # max_batch_length=max_batch_length,
                    max_batch_length=max(lengths_list),
                    num_quantiles=num_buckets,
                )
            )

        self._max_batch_length = max_batch_length
        self._shuffle_ex = shuffle
        self._batch_ordering = batch_ordering
        self._seed = seed
        self._drop_last = drop_last
        if max_batch_ex is None:
            max_batch_ex = np.inf
        self._max_batch_ex = max_batch_ex
        # Calculate bucket lengths - how often does one bucket boundary fit into max_batch_length?
        # The trailing [1] is the bucket for items beyond the last boundary.
        self._bucket_lens = [
            max(1, int(max_batch_length / self._bucket_boundaries[i]))
            for i in range(len(self._bucket_boundaries))
        ] + [1]
        self._epoch = epoch
        self._cur_step = 0
        self.continue_flag = False
        self._generate_batches()
        # Number of batches each replica yields; leftover batches that do not
        # fill a complete round across replicas are not assigned.
        self.num_samples = int(math.floor(len(self._batches) / self.num_replicas))
        self.total_size = int(self.num_samples * self.num_replicas)
        self._replica_batches = self._batches[self.rank:self.total_size:self.num_replicas]
        assert len(self._replica_batches) == self.num_samples, f"len(self._batches): {len(self._batches)}, self.total_size: {self.total_size}, self.num_samples: {self.num_samples},len(self._replica_batches): {len(self._replica_batches)}"
        logging.info(f"len(self._batches): {len(self._batches)}")
        logging.info(f"self.num_replicas: {self.num_replicas}")
        logging.info(f"num of batches on each replica: {self.num_samples}")

    def get_durations(self, batch):
        """Gets durations of the elements in the batch."""
        return [self._ex_lengths[str(idx)] for idx in batch]

    def _get_boundaries_through_warping(
        self,
        max_batch_length: int,
        num_quantiles: int,
    ) -> List[int]:
        """Derive ``num_quantiles`` ascending bucket boundaries from a lognormal warping.

        Equally spaced probabilities are mapped through ``lognorm.ppf(..., 1)``
        and rescaled so the largest boundary equals ``max_batch_length``.
        """

        # NOTE: the following lines do not cover that there is only one example in the dataset
        # warp frames (duration) distribution of train data
        logging.info("Batch quantisation in latent space")
        # linspace set-up
        num_boundaries = num_quantiles + 1
        # create latent linearly equal spaced buckets
        latent_boundaries = np.linspace(
            1 / num_boundaries, num_quantiles / num_boundaries, num_quantiles,
        )
        # get quantiles using lognormal distribution
        quantiles = lognorm.ppf(latent_boundaries, 1)
        # scale up to max_batch_length
        bucket_boundaries = quantiles * max_batch_length / quantiles[-1]
        # compute resulting bucket length multipliers
        length_multipliers = [
            bucket_boundaries[x + 1] / bucket_boundaries[x]
            for x in range(num_quantiles - 1)
        ]
        # logging
        logging.debug(
            "Latent bucket boundary - buckets: {} - length multipliers: {}".format(
                list(map("{:.2f}".format, bucket_boundaries)),
                list(map("{:.2f}".format, length_multipliers)),
            )
        )
        return list(sorted(bucket_boundaries))

    def _permute_batches(self):
        """Reorder ``self._batches`` according to ``self._batch_ordering``.

        Raises:
            NotImplementedError: for an ordering other than ``random``,
                ``ascending`` or ``descending``.
        """

        if self._batch_ordering == "random":
            # deterministically shuffle based on epoch and seed
            g = torch.Generator()
            g.manual_seed(self._seed + self._epoch) # since the random seed is based on self._seed and self._epoch, it should be the same for different processes when using DDP, and therefore the generated order should be the same across different process, this is important, because each replica will only take a portion of it, we want to make sure they take a non-overlapping portion, and all of them constitute the entire dataset
            sampler = torch.randperm(
                len(self._batches), generator=g
            ).tolist()  # type: ignore
            tmp = []
            for idx in sampler:
                tmp.append(self._batches[idx])
            self._batches = tmp

        elif self._batch_ordering == "ascending":
            # Sort batches by their longest example.
            self._batches = sorted(
                self._batches,
                key=lambda x: max([self._ex_lengths[str(idx)] for idx in x]),
            )
        elif self._batch_ordering == "descending":
            self._batches = sorted(
                self._batches,
                key=lambda x: max([self._ex_lengths[str(idx)] for idx in x]),
                reverse=True,
            )
        else:
            raise NotImplementedError

    def _generate_batches(self):
        """Assign every example to a length bucket and emit batches into ``self._batches``.

        A bucket is flushed as a batch once it holds ``_bucket_lens[bucket]``
        examples or ``_max_batch_ex`` examples, whichever comes first.
        Partially filled buckets are emitted only when ``_drop_last`` is false.
        Bucket statistics are logged at debug level when ``_epoch == 0``.
        """
        logging.info("DynamicBatchSampler: Generating dynamic batches")
        if self._shuffle_ex:
            # deterministically shuffle based on epoch and seed
            g = torch.Generator()
            g.manual_seed(self._seed + self._epoch) # since the random seed is based on self._seed and self._epoch, it should be the same for different processes when using DDP, and therefore the generated order should be the same across different process, this is important, because each replica will only take a portion of it, we want to make sure they take a non-overlapping portion, and all of them constitute the entire dataset
            sampler = torch.randperm(len(self._dataset), generator=g).tolist()  # type: ignore # pyp note: these are randomly permuted indices
        else:
            # take examples as they are: e.g. they have been sorted
            sampler = range(len(self._dataset))  # type: ignore

        self._batches = []
        bucket_batches = [[] for i in self._bucket_lens]

        stats_tracker = [
            {"min": np.inf, "max": -np.inf, "tot": 0, "n_ex": 0}
            for i in self._bucket_lens
        ]

        for idx in sampler:
            # length of pre-sampled audio
            item_len = self._ex_lengths[str(idx)]
            # bucket to fill up most padding
            bucket_id = np.searchsorted(self._bucket_boundaries, item_len)
            # fill audio's duration into that bucket
            bucket_batches[bucket_id].append(idx)

            stats_tracker[bucket_id]["min"] = min(
                stats_tracker[bucket_id]["min"], item_len
            )
            stats_tracker[bucket_id]["max"] = max(
                stats_tracker[bucket_id]["max"], item_len
            )
            stats_tracker[bucket_id]["tot"] += item_len
            stats_tracker[bucket_id]["n_ex"] += 1
            # track #samples - why not duration/#frames; rounded up?
            # keep track of durations, if necessary

            if (
                len(bucket_batches[bucket_id]) >= self._bucket_lens[bucket_id]
                or len(bucket_batches[bucket_id]) >= self._max_batch_ex
            ):
                self._batches.append(bucket_batches[bucket_id])
                bucket_batches[bucket_id] = []
                # keep track of durations

        # Dump remaining batches
        if not self._drop_last:
            for batch in bucket_batches:
                if batch:
                    self._batches.append(batch)

        self._permute_batches()  # possibly reorder batches

        if self._epoch == 0:  # only log at first epoch
            # frames per batch & their padding remaining
            boundaries = [0] + self._bucket_boundaries.tolist()

            for bucket_indx in range(len(self._bucket_boundaries)):
                try:
                    num_batches = stats_tracker[bucket_indx]["tot"] // (
                        self._max_batch_length
                    )
                    pad_factor = (
                        stats_tracker[bucket_indx]["max"]
                        - stats_tracker[bucket_indx]["min"]
                    ) / (
                        stats_tracker[bucket_indx]["tot"]
                        / stats_tracker[bucket_indx]["n_ex"]
                    )
                except ZeroDivisionError:
                    # Empty bucket: nothing to report.
                    num_batches = 0
                    pad_factor = 0

                logging.debug(
                    (
                        "DynamicBatchSampler: Bucket {} with boundary {:.1f}-{:.1f} and "
                        + "batch_size {}: Num Examples {:.1f}, Num Full Batches {:.3f}, Pad Factor {:.3f}."
                    ).format(
                        bucket_indx,
                        boundaries[bucket_indx],
                        boundaries[bucket_indx + 1],
                        self._bucket_lens[bucket_indx],
                        stats_tracker[bucket_indx]["n_ex"],
                        num_batches,
                        pad_factor * 100,
                    )
                )

            if self.verbose:
                # Per-batch padding statistics, logged at debug level.
                batch_stats = {
                    "tot_frames": [],
                    "tot_pad_frames": [],
                    "pad_%": [],
                }
                for batch in self._batches:
                    tot_frames = sum(
                        [self._ex_lengths[str(idx)] for idx in batch]
                    )
                    batch_stats["tot_frames"].append(tot_frames)
                    max_frames = max(
                        [self._ex_lengths[str(idx)] for idx in batch]
                    )
                    tot_pad = sum(
                        [
                            max_frames - self._ex_lengths[str(idx)]
                            for idx in batch
                        ]
                    )
                    batch_stats["tot_pad_frames"].append(tot_pad)
                    # NOTE(review): divides by tot_frames; a batch whose lengths
                    # are all zero would raise here - confirm lengths are > 0.
                    batch_stats["pad_%"].append(tot_pad / tot_frames * 100)

                padding_details = "Batch {} with {:.1f} frames with {} files - {:.1f} padding, {:.2f} (%) of total."
                padding_details = "DynamicBatchSampler: " + padding_details
                for i in range(len(self._batches)):
                    logging.debug(
                        padding_details.format(
                            i,
                            batch_stats["tot_frames"][i],
                            len(self._batches[i]),
                            batch_stats["tot_pad_frames"][i],
                            batch_stats["pad_%"][i],
                        )
                    )

    def __iter__(self):
        # Yields this replica's batches (lists of dataset indices) in order.
        for batch in self._replica_batches:
            yield batch
        # Upstream regenerated/permuted batches here after each pass; in this
        # version that happens in set_epoch instead.
        # if self._shuffle_ex:  # re-generate examples if ex_ordering == "random"
        #     self._generate_batches()
        # if self._batch_ordering == "random":
        #     # we randomly permute the batches only --> faster
        #     self._permute_batches()

    def set_epoch(self, epoch):
        """
        Set the epoch, regenerate all batches and re-slice this replica's share.

        This interface mirrors torch.utils.data.distributed.DistributedSampler.
        If a resume was armed with ``set_epoch_resume``, the first ``_cur_step``
        batches of this replica are skipped for this epoch only.
        """
        self._epoch = epoch
        self._generate_batches()
        self._replica_batches = self._batches[self.rank:self.total_size:self.num_replicas]
        self.num_samples = int(math.floor(len(self._batches) / self.num_replicas))
        assert len(self._replica_batches) == self.num_samples, f"len(self._batches): {len(self._batches)}, self.total_size: {self.total_size}, self.num_samples: {self.num_samples},len(self._replica_batches): {len(self._replica_batches)}"

        if self.continue_flag:
            self.continue_flag = False
            self._replica_batches = self._replica_batches[self._cur_step:]
            self.num_samples = len(self._replica_batches)

    def __len__(self):
        # Number of batches this replica will yield in the current epoch.
        return self.num_samples

    def set_epoch_resume(self, epoch, cur_step):
        """Arm a mid-epoch resume: the next ``set_epoch`` skips ``cur_step`` batches."""
        self.continue_flag = True
        self._epoch = epoch
        self._cur_step = cur_step
parser.add_argument("--sample_batch_size", type=int, default=3, help="Batch size for sampling") parser.add_argument("-s", "--seed", type=int, default=1, help="Seed value.") parser.add_argument("-bs", "--beam_size", type=int, default=50, help="beam size for MFA alignment") parser.add_argument("-rbs", "--retry_beam_size", type=int, default=200, help="retry beam size for MFA alignment") parser.add_argument("--output_dir", type=str, default="./generated_tts", help="directory to save generated audio") parser.add_argument("-oa", "--original_audio", type=str, default="./demo/5895_34622_000026_000002.wav", help="location of audio file") parser.add_argument("-ot", "--original_transcript", type=str, default="Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather.", help="original transcript") parser.add_argument("-tt", "--target_transcript", type=str, default="I cannot believe that the same model can also do text to speech synthesis too!", help="target transcript") parser.add_argument("-co", "--cut_off_sec", type=float, default=3.6, help="cut off point in seconds for input prompt") parser.add_argument("-ma", "--margin", type=float, default=0.04, help="margin in seconds between the end of the cutoff words and the start of the next word. If the next word is not immediately following the cutoff word, the algorithm is more tolerant to word alignment errors") parser.add_argument("-cuttol", "--cutoff_tolerance", type=float, default=1, help="tolerance in seconds for the cutoff time, if given cut_off_sec plus the tolerance, we still are not able to find the next word, we will use the best cutoff time found, i.e. 
likely no margin or very small margin between the end of the cutoff word and the start of the next word") args = parser.parse_args() return args args = parse_arguments() voicecraft_name = args.model_name # hyperparameters for inference codec_audio_sr = args.codec_audio_sr codec_sr = args.codec_sr top_k = args.top_k top_p = args.top_p # defaults to 0.9 can also try 0.8, but 0.9 seems to work better temperature = args.temperature silence_tokens = args.silence_tokens kvcache = args.kvcache # NOTE if OOM, change this to 0, or try the 330M model # NOTE adjust the below three arguments if the generation is not as good # NOTE if the model generate long silence, reduce the stop_repetition to 3, 2 or even 1 stop_repetition = args.stop_repetition # NOTE: if the if there are long silence or unnaturally strecthed words, # increase sample_batch_size to 4 or higher. What this will do to the model is that the # model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. # So if the speech rate of the generated is too fast change it to a smaller number. 
import shutil  # local to this script section: used to stage the prompt audio

sample_batch_size = args.sample_batch_size
seed = args.seed  # change seed if you are still unhappy with the result

# load the model
# Short aliases are expanded to the "giga*" names; every other name is used
# unchanged when building the Hugging Face repository id below.
_model_aliases = {"330M": "giga330M", "830M": "giga830M"}
voicecraft_name = _model_aliases.get(voicecraft_name, voicecraft_name)

model = voicecraft.VoiceCraft.from_pretrained(
    f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}")
phn2num = model.args.phn2num
config = vars(model.args)
model.to(device)

# Fetch the neural codec checkpoint on first use.
encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
if not os.path.exists(encodec_fn):
    os.makedirs(os.path.dirname(encodec_fn), exist_ok=True)
    status = os.system(
        f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th -O ./pretrained_models/encodec_4cb2048_giga.th")
    if status != 0:
        # `wget -O` creates the output file even when the download fails;
        # remove it so the next run retries instead of loading a broken file.
        if os.path.exists(encodec_fn):
            os.remove(encodec_fn)
        raise RuntimeError(
            f"failed to download the codec checkpoint to {encodec_fn} (wget exit status {status})")
# will also put the neural codec model on gpu
audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device)

text_tokenizer = TextTokenizer(backend="espeak")

# Prepare your audio
# point to the original audio whose speech you want to clone
# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file
orig_audio = args.original_audio
orig_transcript = args.original_transcript

# move the audio and transcript to temp folder
temp_folder = "./demo/temp"
os.makedirs(temp_folder, exist_ok=True)
try:
    # shutil.copy avoids building a shell command from a user-supplied path
    # (spaces / metacharacters) and works without a `cp` binary.
    shutil.copy(orig_audio, temp_folder)
except shutil.SameFileError:
    # the prompt already lives in the temp folder; nothing to copy
    pass
filename = os.path.splitext(os.path.basename(orig_audio))[0]
with open(f"{temp_folder}/{filename}.txt", "w") as f:
    f.write(orig_transcript)

# run MFA to get the alignment
align_temp = f"{temp_folder}/mfa_alignments"
beam_size = args.beam_size
retry_beam_size = args.retry_beam_size
alignments = f"{temp_folder}/mfa_alignments/{filename}.csv"
def find_closest_word_boundary(alignments, cut_off_sec, margin, cutoff_tolerance = 1):
    """Pick the prompt cut-off time and row index from an alignment CSV.

    The file is read as comma-separated rows after one header line; column 0
    of a row is its start time and column 1 its end time (seconds).

    Selection rule:
      * fallback: the first row whose end time is ``>= cut_off_sec``;
      * preferred: the first row whose end time lies in
        ``[cut_off_sec, cut_off_sec + cutoff_tolerance)`` and that is followed
        by a row starting at least ``margin`` seconds later. For it the
        returned time is pushed two thirds of ``margin`` into the gap.

    Args:
        alignments: path of the alignment CSV file.
        cut_off_sec: requested cut-off point in seconds.
        margin: minimum gap (seconds) required before the next row.
        cutoff_tolerance: how far past ``cut_off_sec`` a preferred row may end.

    Returns:
        ``(cutoff_time, cutoff_index)``; both are ``None`` when no row ends at
        or after ``cut_off_sec``.
    """
    with open(alignments, 'r') as file:
        # skip header (the default keeps an empty file from raising StopIteration)
        next(file, None)
        # ignore blank lines, e.g. a trailing newline at the end of the file
        rows = [line.strip().split(',') for line in file if line.strip()]

    cutoff_time = None
    cutoff_index = None
    for i, row in enumerate(rows):
        end = float(row[1])
        if end < cut_off_sec:
            continue
        if cutoff_time is None:
            # remember the first candidate as the fallback answer
            cutoff_time, cutoff_index = end, i
        # The last row has no successor to measure a gap against, so it can
        # only ever be the fallback (previously this raised IndexError).
        has_next = i + 1 < len(rows)
        if (has_next
                and end < cut_off_sec + cutoff_tolerance
                and float(rows[i + 1][0]) - end >= margin):
            return end + margin * 2 / 3, i
    return cutoff_time, cutoff_index
def seed_everything(seed):
    """Seed the random number generators used by the demo.

    Sets ``PYTHONHASHSEED`` in the environment, seeds ``random``, NumPy and
    PyTorch (CPU and CUDA) with ``seed``, then turns cuDNN benchmarking off
    and cuDNN deterministic mode on.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # same order as before: Python, NumPy, torch CPU, torch CUDA
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
null, "metadata": { "id": "Y87ixxsUVIhM" }, "outputs": [], "source": [ "!git clone https://github.com/jasonppy/VoiceCraft" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-w3USR91XdxY" }, "outputs": [], "source": [ "!pip install tensorboard\n", "!pip install phonemizer\n", "!pip install datasets\n", "!pip install torchmetrics\n", "\n", "!apt-get install -y espeak espeak-data libespeak1 libespeak-dev\n", "!apt-get install -y festival*\n", "!apt-get install -y build-essential\n", "!apt-get install -y flac libasound2-dev libsndfile1-dev vorbis-tools\n", "!apt-get install -y libxml2-dev libxslt-dev zlib1g-dev\n", "\n", "!pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft\n", "\n", "!pip install -r \"/content/VoiceCraft/gradio_requirements.txt\"\n", "!pip install typer==0.7.0" ] }, { "cell_type": "markdown", "metadata": { "id": "jNuzjrtmv2n1" }, "source": [ "# Let it restarted, it won't let your entire installation be aborted." ] }, { "cell_type": "markdown", "metadata": { "id": "AnqGEwZ4NxtJ" }, "source": [ "# Note before launching the `gradio_app.py`\n", "\n", "***You will get JSON warning if you move anything beside `sample_batch_size`, `stop_repetition` and `seed`.*** Which for most advanced setting, `kvache` and `temperature` unable to set in different value.\n", "\n", "You will download a .file File when you download the output audio for some reason. You will need to **convert the file from .snd to .wav/.mp3 manually**. Or if you enable showing file type in the name in Windows or wherever you are, change the file name to \"xxx.wav\" or \"xxx.mp3\". (know the solution? pull request my repository)\n", "\n", "Frequency of VRAM spikes no longer exist as well in April 5 Update.\n", "* Nevermind, I have observed some weird usage on Colab's GPU Memory Monitor. It can spike up to 13.5GB VRAM even in WhisperX mode. 
(April 11)" ] }, { "cell_type": "markdown", "metadata": { "id": "dE0W76cMN3Si" }, "source": [ "Don't make your `prompt end time` too long; 6-9s is fine. Otherwise it will **either raise a JSON issue or cut off your generated audio**. This is due to how VoiceCraft works (so it is probably unfixable): it appends the text you want to synthesize to the end of the input audio transcript, and the combined transcript becomes too long for the application or code to handle. So please keep it short.\n", "\n", "Your total audio length (`prompt end time` + newly generated audio) must not exceed 16 or 17s." ] }, { "cell_type": "markdown", "metadata": { "id": "nnu2cY4t8P6X" }, "source": [ "For voice cloning, I suggest using a monotone input as the voice prompt. Of course you can always try input that has a lot of tonal variety, but I find that, as of the April 11 Update, it is much easier to replicate a monotone voice than audio that contains laughing, screaming, or crying.\n", "\n", "The inference speed is much more stable. With the sample text, a T4 (Free Tier Colab GPU) can do 6-15s on 6s-8s of `prompt end time`."
#!/bin/bash
# Launch training of the 830M model ("e830M") on GigaSpeech with torchrun,
# one process per GPU listed in CUDA_VISIBLE_DEVICES.
#
# main.py is referenced as ../main.py and logs go to ./logs, so both are
# resolved relative to the directory the script is started from.
source ~/miniconda3/etc/profile.d/conda.sh
conda activate voicecraft
export CUDA_VISIBLE_DEVICES=0,1,2,3
export WORLD_SIZE=4  # keep equal to the number of devices listed above

dataset=gigaspeech
mkdir -p "./logs/${dataset}"

# Placeholders: replace with real locations before running. All expansions
# below are quoted so that paths containing spaces are passed as one argument.
exp_root="path/to/store/exp_results"
exp_name=e830M
dataset_dir="path/to/stored_extracted_codes_and_phonemes/xl" # xs if you only extracted xs in previous step
encodec_codes_folder_name="encodec_16khz_4codebooks"

# export CUDA_LAUNCH_BLOCKING=1 # for debugging

torchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=localhost:41977 --nproc_per_node="${WORLD_SIZE}" \
../main.py \
--reduced_eog 1 \
--drop_long 1 \
--eos 2051 \
--n_special 4 \
--pad_x 0 \
--codebook_weight "[5,1,0.5,0.1]" \
--encodec_sr 50 \
--num_steps 50000 \
--lr 0.05 \
--warmup_fraction 0.01 \
--optimizer_name "ScaledAdam" \
--pseudo_epoch_size 3000 \
--reduce_lr_start_step 3000 \
--reduce_lr_start_epoch 4 \
--clipping_update_period 1000 \
--d_model 2048 \
--audio_embedding_dim 2048 \
--nhead 16 \
--num_decoder_layers 16 \
--max_num_tokens 100000 \
--gradient_accumulation_steps 26 \
--val_max_num_tokens 6000 \
--num_buckets 6 \
--audio_max_length 20 \
--audio_min_length 2 \
--text_max_length 400 \
--text_min_length 10 \
--mask_len_min 1 \
--mask_len_max 600 \
--tb_write_every_n_steps 10 \
--print_every_n_steps 400 \
--val_every_n_steps 1600 \
--text_vocab_size 100 \
--text_pad_token 100 \
--phn_folder_name "phonemes" \
--manifest_name "manifest" \
--encodec_folder_name "${encodec_codes_folder_name}" \
--audio_vocab_size 2048 \
--empty_token 2048 \
--eog 2049 \
--audio_pad_token 2050 \
--n_codebooks 4 \
--max_n_spans 3 \
--shuffle_mask_embedding 0 \
--mask_sample_dist poisson1 \
--max_mask_portion 0.9 \
--min_gap 5 \
--num_workers 8 \
--dynamic_batching 1 \
--dataset "${dataset}" \
--exp_dir "${exp_root}/${dataset}/${exp_name}" \
--dataset_dir "${dataset_dir}" # >> "./logs/${dataset}/${exp_name}.log" 2>&1
--val_every_n_steps 1600 \ --text_vocab_size 100 \ --text_pad_token 100 \ --phn_folder_name "phonemes" \ --manifest_name "manifest" \ --encodec_folder_name ${encodec_codes_folder_name} \ --audio_vocab_size 2048 \ --empty_token 2048 \ --eog 2049 \ --audio_pad_token 2050 \ --n_codebooks 4 \ --max_n_spans 3 \ --shuffle_mask_embedding 0 \ --mask_sample_dist poisson1 \ --max_mask_portion 0.9 \ --min_gap 5 \ --num_workers 8 \ --dynamic_batching 1 \ --dataset $dataset \ --exp_dir "${exp_root}/${dataset}/${exp_name}" \ --dataset_dir ${dataset_dir} # >> ./logs/${dataset}/${exp_name}.log 2>&1