Repository: SociallyIneptWeeb/AniVoiceChanger Branch: main Commit: fc16d0c18efd Files: 10 Total size: 42.8 KB Directory structure: gitextract_wa9osrpx/ ├── .gitattributes ├── .gitignore ├── AniVoiceChanger_colab.ipynb ├── LICENSE ├── README.md ├── audio/ │ └── AUDIO.txt ├── extra_requirements.txt ├── get_audio_devices.py ├── main_colab.py └── main_local.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ # Auto detect text files and perform LF normalization * text=auto ================================================ FILE: .gitignore ================================================ .idea audio/*.mp3 audio/*.wav ================================================ FILE: AniVoiceChanger_colab.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "source": [ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/SociallyIneptWeeb/AniVoiceChanger/blob/main/AniVoiceChanger_colab.ipynb)" ], "metadata": { "id": "wEsZxTheMO8_" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jwu07JgqoFON", "cellView": "form" }, "outputs": [], "source": [ "#@title Mount Drive\n", "\n", "from google.colab import drive\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ge_97mfpgqTm", "cellView": "form" }, "outputs": [], "source": [ "#@title Clone repository\n", "!git init\n", "!git remote add origin https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git\n", "!git fetch origin 195a14e5c51ec02774c4d1961d0a4b67755e25c8 --depth=1\n", "!git reset --hard FETCH_HEAD" ] }, { "cell_type": "code", "source": [ "#@title Set Mode and Parameters\n", "#@markdown ## Mode\n", "#@markdown To run the WebUI for training a voice 
model, set the mode to Training.\n", "\n", "#@markdown To run the voice changer server for main_colab.py to connect to, set the mode to Inference.\n", "MODE = 'Training' #@param ['Training', 'Inference']\n", "\n", "#@markdown If MODE: Training, specify the path to a zip file containing the voice clips in your google drive to be used for training. If MODE: Inference, ignore.\n", "DATASET = 'char_voice_lines.zip' #@param {type:\"string\"}\n", "DATASET = '/content/drive/MyDrive/' + DATASET\n", "\n", "if MODE == 'Training':\n", " !mkdir -p dataset\n", " !unzip -d dataset -B {DATASET}\n", " # rename duplicate filenames in dataset\n", " !ls -a /content/dataset/\n", " !rename 's/(\\w+)\\.(\\w+)~(\\d*)/$1_$3.$2/' /content/dataset/*.*~*\n", "\n", "#@markdown ## Upload a trained model\n", "#@markdown Only fill the below fields if you would like to continue training a previously trained model or use it for inference. Specify the name and epoch number of the model to be used or trained. The folder containing the trained files in your google drive will be used.\n", "\n", "MODELNAME = \"\" #@param {type:\"string\"}\n", "MODELEPOCH = 2333333 #@param {type:\"integer\"}\n", "if MODELNAME:\n", " !mkdir -p /content/logs/{MODELNAME}\n", " !cp /content/drive/MyDrive/{MODELNAME}_files/*.index /content/logs/{MODELNAME}/\n", " !cp /content/drive/MyDrive/{MODELNAME}_files/{MODELNAME}.pth /content/weights/\n", " if MODE == 'Training':\n", " !cp /content/drive/MyDrive/{MODELNAME}_files/D_{MODELEPOCH}.pth /content/logs/{MODELNAME}/\n", " !cp /content/drive/MyDrive/{MODELNAME}_files/G_{MODELEPOCH}.pth /content/logs/{MODELNAME}/\n", " !cp /content/drive/MyDrive/{MODELNAME}_files/*.npy /content/logs/{MODELNAME}/" ], "metadata": { "id": "S8wlJabmgeBK", "cellView": "form" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pqE0PrnuRqI2", "cellView": "form" }, "outputs": [], "source": [ "#@title Install requirements\n", "!pip install -r 
requirements.txt\n", "!apt -y install -qq aria2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "UG3XpUwEomUz", "cellView": "form" }, "outputs": [], "source": [ "#@title Download pretrained models\n", "\n", "# v1\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d /content/pretrained -o D32k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d /content/pretrained -o D40k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d /content/pretrained -o D48k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d /content/pretrained -o G32k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d /content/pretrained -o G40k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d /content/pretrained -o G48k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth -d /content/pretrained -o f0D32k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth -d /content/pretrained -o f0D40k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth -d /content/pretrained -o f0D48k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M 
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth -d /content/pretrained -o f0G32k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth -d /content/pretrained -o f0G40k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth -d /content/pretrained -o f0G48k.pth\n", "\n", "# v2\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d /content/pretrained_v2 -o D40k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d /content/pretrained_v2 -o G40k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d /content/pretrained_v2 -o f0D40k.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d /content/pretrained_v2 -o f0G40k.pth" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HugjmZqZRuiF", "cellView": "form" }, "outputs": [], "source": [ "#@title Download Vocal Separation model\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d /content/uvr5_weights -o HP2-人声vocals+非人声instrumentals.pth\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d /content/uvr5_weights -o HP5-主旋律人声vocals+其他instrumentals.pth" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2RCaT9FTR0ej", 
"cellView": "form" }, "outputs": [], "source": [ "#@title Download hubert_base model\n", "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d /content -o hubert_base.pt" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7vh6vphDwO0b", "cellView": "form" }, "outputs": [], "source": [ "#@title Run WebUI for Training, skip if Inference\n", "if MODE == 'Training':\n", " # %load_ext tensorboard\n", " # %tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs\n", " !python3 infer-web.py --colab --pycmd python3" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "FgJuNeAwx5Y_", "cellView": "form" }, "outputs": [], "source": [ "#@title Manually back up the trained model files to Google Drive for Mode: \"Training\"\n", "#@markdown MODELNAME should be the EXPERIMENT_NAME that you typed.\n", "\n", "#@markdown If the name of the model is john, and in the logs/john folder is a file called D_2333333.pth, set the MODELNAME: john and MODELEPOCH: 2333333\n", "\n", "if MODE == 'Training':\n", "\n", "#@markdown Model name\n", " MODELNAME = \"\" #@param {type:\"string\"}\n", " if MODELNAME:\n", "#@markdown Epoch number\n", " MODELEPOCH = 2333333 #@param {type:\"integer\"}\n", "#@markdown Save intermediate models if you would like to continue training the model later\n", " SAVE_INTERMEDIATE = True #@param {type:\"boolean\"}\n", "\n", " !mkdir -p /content/drive/MyDrive/{MODELNAME}_files\n", "\n", " if SAVE_INTERMEDIATE:\n", " !cp /content/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_files/\n", " !cp /content/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_files/\n", " !cp /content/logs/{MODELNAME}/total_*.npy /content/drive/MyDrive/{MODELNAME}_files/\n", "\n", " !cp /content/logs/{MODELNAME}/added_*.index /content/drive/MyDrive/{MODELNAME}_files/\n", " !cp /content/weights/{MODELNAME}.pth 
/content/drive/MyDrive/{MODELNAME}_files/" ] }, { "cell_type": "markdown", "source": [ "# Inference\n", "\n", "The code below is to be run for Mode: Inference\n", "\n", "When prompted `Proceed (Y/n)?`, click beside it, type `Y` and press `Enter`.\n", "\n", "If `WARNING: The following packages were previously imported in this runtime: [numpy] You must restart the runtime in order to use newly installed versions.` is seen in the output, click `Restart Runtime` and then continue running the next cell." ], "metadata": { "id": "0JTjkTndoHb6" } }, { "cell_type": "code", "source": [ "#@title Install specific numpy version. If needed, click Restart Runtime before running the bottom two cells.\n", "if MODE == 'Inference':\n", " !pip uninstall numpy\n", " !pip install numpy==1.23.5" ], "metadata": { "id": "cfJV7kbKqtnw", "cellView": "form" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "try:\n", " MODE\n", "except NameError:\n", " MODE = 'Inference'\n", "\n", "#@title Set your NGROK_AUTH_TOKEN.\n", "if MODE == 'Inference':\n", " !pip install flask-ngrok3 -q\n", "\n", "#@markdown Obtain your Ngrok auth token from [here](https://dashboard.ngrok.com/get-started/your-authtoken)\n", " NGROK_AUTH_TOKEN = '' #@param {type:\"string\"}" ], "metadata": { "id": "OL-1YvcbOiWd", "cellView": "form" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "if MODE != 'Inference':\n", " raise Exception('Mode is not set to Inference.')\n", "\n", "\n", "#@title Run RVC Inference server\n", "import json\n", "import sys\n", "import wave\n", "from pathlib import Path\n", "\n", "BASE_DIR = Path('/content')\n", "sys.path.append(str(BASE_DIR))\n", "\n", "import torch\n", "from multiprocessing import cpu_count\n", "from flask_ngrok3 import run_with_ngrok\n", "from flask import Flask, request, send_file\n", "\n", "from vc_infer_pipeline import VC\n", "from infer_pack.models import (\n", " SynthesizerTrnMs256NSFsid,\n", " 
SynthesizerTrnMs256NSFsid_nono,\n", " SynthesizerTrnMs768NSFsid,\n", " SynthesizerTrnMs768NSFsid_nono,\n", ")\n", "from my_utils import load_audio\n", "from fairseq import checkpoint_utils\n", "from scipy.io import wavfile\n", "\n", "\n", "INPUT_VOICE_PATH = 'input.mp3'\n", "OUTPUT_VOICE_PATH = 'output.wav'\n", "MODEL_NAME = ''\n", "DEVICE = 'cuda:0'\n", "cpt = None\n", "\n", "\n", "class Config:\n", " def __init__(self, device, is_half):\n", " self.device = device\n", " self.is_half = is_half\n", " self.n_cpu = 0\n", " self.gpu_name = None\n", " self.gpu_mem = None\n", " self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()\n", "\n", " def device_config(self) -> tuple:\n", " if torch.cuda.is_available():\n", " i_device = int(self.device.split(\":\")[-1])\n", " self.gpu_name = torch.cuda.get_device_name(i_device)\n", "\n", " if (\n", " (\"16\" in self.gpu_name and \"V100\" not in self.gpu_name.upper())\n", " or \"P40\" in self.gpu_name.upper()\n", " or \"1060\" in self.gpu_name\n", " or \"1070\" in self.gpu_name\n", " or \"1080\" in self.gpu_name\n", " ):\n", " print(\"16 series/10 series P40 forced single precision\")\n", " self.is_half = False\n", " for config_file in [\"32k.json\", \"40k.json\", \"48k.json\"]:\n", " with open(f\"configs/{config_file}\", \"r\") as f:\n", " strr = f.read().replace(\"true\", \"false\")\n", " with open(f\"configs/{config_file}\", \"w\") as f:\n", " f.write(strr)\n", " with open(\"trainset_preprocess_pipeline_print.py\", \"r\") as f:\n", " strr = f.read().replace(\"3.7\", \"3.0\")\n", " with open(\"trainset_preprocess_pipeline_print.py\", \"w\") as f:\n", " f.write(strr)\n", " else:\n", " self.gpu_name = None\n", "\n", " self.gpu_mem = int(\n", " torch.cuda.get_device_properties(i_device).total_memory\n", " / 1024\n", " / 1024\n", " / 1024\n", " + 0.4\n", " )\n", " if self.gpu_mem <= 4:\n", " with open(\"trainset_preprocess_pipeline_print.py\", \"r\") as f:\n", " strr = f.read().replace(\"3.7\", \"3.0\")\n", " 
with open(\"trainset_preprocess_pipeline_print.py\", \"w\") as f:\n", " f.write(strr)\n", "\n", " elif torch.backends.mps.is_available():\n", " print(\"No supported N-card found, use MPS for inference\")\n", " self.device = \"mps\"\n", " else:\n", " print(\"No supported N-card found, use CPU for inference\")\n", " self.device = \"cpu\"\n", " self.is_half = True\n", "\n", " if self.n_cpu == 0:\n", " self.n_cpu = cpu_count()\n", "\n", " if self.is_half:\n", " # 6G memory config\n", " x_pad = 3\n", " x_query = 10\n", " x_center = 60\n", " x_max = 65\n", " else:\n", " # 5G memory config\n", " x_pad = 1\n", " x_query = 6\n", " x_center = 38\n", " x_max = 41\n", "\n", " if self.gpu_mem != None and self.gpu_mem <= 4:\n", " x_pad = 1\n", " x_query = 5\n", " x_center = 30\n", " x_max = 32\n", "\n", " return x_pad, x_query, x_center, x_max\n", "\n", "\n", "CONFIG = Config(DEVICE, True)\n", "\n", "\n", "def load_hubert():\n", " models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(['hubert_base.pt'], suffix='', )\n", " hubert = models[0]\n", " hubert = hubert.to(DEVICE)\n", "\n", " if True:\n", " hubert = hubert.half()\n", " else:\n", " hubert = hubert.float()\n", "\n", " hubert.eval()\n", " return hubert\n", "\n", "HUBERT_MODEL = load_hubert()\n", "\n", "\n", "def get_vc(device, is_half, config):\n", " global cpt, version, net_g, tgt_sr, vc\n", " model_path = BASE_DIR / 'weights' / f'{MODEL_NAME}.pth'\n", " if not model_path.exists():\n", " print(f'The model {model_path} does not exist. 
Please ensure that you have filled in the proper MODEL_NAME in your .env file.')\n", " return None\n", "\n", " model_path = str(model_path)\n", " print(f'loading pth {model_path}')\n", " cpt = torch.load(model_path, map_location='cpu')\n", " tgt_sr = cpt[\"config\"][-1]\n", " cpt[\"config\"][-3] = cpt[\"weight\"][\"emb_g.weight\"].shape[0]\n", " if_f0 = cpt.get(\"f0\", 1)\n", " version = cpt.get(\"version\", \"v1\")\n", "\n", " if version == \"v1\":\n", " if if_f0 == 1:\n", " net_g = SynthesizerTrnMs256NSFsid(*cpt[\"config\"], is_half=is_half)\n", " else:\n", " net_g = SynthesizerTrnMs256NSFsid_nono(*cpt[\"config\"])\n", " elif version == \"v2\":\n", " if if_f0 == 1:\n", " net_g = SynthesizerTrnMs768NSFsid(*cpt[\"config\"], is_half=is_half)\n", " else:\n", " net_g = SynthesizerTrnMs768NSFsid_nono(*cpt[\"config\"])\n", "\n", " del net_g.enc_q\n", " print(net_g.load_state_dict(cpt[\"weight\"], strict=False))\n", " net_g.eval().to(device)\n", "\n", " if is_half:\n", " net_g = net_g.half()\n", " else:\n", " net_g = net_g.float()\n", "\n", " vc = VC(tgt_sr, config)\n", "\n", "\n", "def rvc_infer(pitch_change, pitch_extraction_algo, volume_envelope, index_rate):\n", " logs_dir = BASE_DIR / 'logs' / MODEL_NAME\n", " index_path = ''\n", " for file in logs_dir.iterdir():\n", " if file.suffix == '.index':\n", " index_path = str(logs_dir / file.name)\n", " break\n", "\n", " # vc single\n", " audio = load_audio(INPUT_VOICE_PATH, 16000)\n", " times = [0, 0, 0]\n", " if_f0 = cpt.get('f0', 1)\n", " audio_opt = vc.pipeline(HUBERT_MODEL, net_g, 0, audio, INPUT_VOICE_PATH, times, pitch_change, pitch_extraction_algo, index_path, index_rate, if_f0, 3, tgt_sr, 0, volume_envelope, version, 0.33, f0_file=None)\n", " wavfile.write(OUTPUT_VOICE_PATH, tgt_sr, audio_opt)\n", "\n", "\n", "app = Flask(__name__)\n", "run_with_ngrok(app, auth_token=NGROK_AUTH_TOKEN)\n", "\n", "@app.route('/', methods=['GET'])\n", "def test():\n", " response = {'status':'OK','message':'Test'}\n", " return 
json.dumps(response)\n", "\n", "\n", "@app.route('/infer', methods=['POST'])\n", "def infer():\n", " global MODEL_NAME, cpt\n", " model_name = request.args.get('model')\n", " if MODEL_NAME != model_name:\n", " MODEL_NAME = model_name\n", " if cpt:\n", " del cpt\n", " get_vc(DEVICE, True, CONFIG)\n", "\n", " pitch_change = int(request.args.get('pitch'))\n", " pitch_extraction_algo = request.args.get('algo')\n", " volume_envelope = float(request.args.get('volume'))\n", " index_rate = float(request.args.get('index_rate'))\n", " audio_data = request.files['audio_file']\n", " audio_data.save(INPUT_VOICE_PATH)\n", " rvc_infer(pitch_change, pitch_extraction_algo, volume_envelope, index_rate)\n", " return send_file(OUTPUT_VOICE_PATH, mimetype=\"audio/wav\")\n", "\n", "app.run()" ], "metadata": { "id": "3qcqylOdMDFg", "cellView": "form" }, "execution_count": null, "outputs": [] } ], "metadata": { "accelerator": "GPU", "colab": { "private_outputs": true, "provenance": [], "gpuType": "T4" }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 } ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023 SociallyIneptWeeb Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # AniVoiceChanger ![](thumbnail.jpg?raw=true) An "extension" for Retrieval-based Voice Conversion WebUI. Provides a way to record your voice, convert it using a trained voice model, and output it in voice-chat of any application without running the webui. Showcase: https://www.youtube.com/watch?v=C-PqTbh0LxY Setup Guide: https://www.youtube.com/watch?v=K4vVW7iA1w8 ## Setup ### Prerequisites #### Install Git Follow the instructions [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) to install Git on your computer. #### Install 7-Zip Download and Install the 7-Zip application from [here](https://www.7-zip.org/download.html). This is used to extract the zipped RVC WebUI application after it has been downloaded. #### Install Virtual Audio Cable Download and Install VB-CABLE Driver from [here](https://vb-audio.com/Cable/) by extracting all files and Run Setup Program in administrator mode. Reboot after installation. This is used to pipe the converted voice audio into the audio input of apps. ### Install RVC WebUI If you haven't installed the RVC WebUI, download the RVC-beta.7z file from [here](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/RVC-beta.7z) and extract it using 7-Zip into a folder of your choosing. It will take around 8GB of space, not including any voice models that you may train later on. 
### Clone AniVoiceChanger repository Within the extracted RVC-beta folder (Should have a bunch of folders and files), open a command line window and run this command to clone this entire repository and install the additional dependencies required for this extension. ``` git clone https://github.com/SociallyIneptWeeb/AniVoiceChanger runtime\python.exe -m pip install -r AniVoiceChanger\extra_requirements.txt ``` ### Filling in your Environment Variables in the .env file Follow the instructions written in the .env file and fill in the appropriate values. If unsure, please refer to [this section](https://www.youtube.com/watch?v=K4vVW7iA1w8&t=728s) of the setup video. ## Usage This program assumes that you have already trained a voice model, with the model file in the weights folder. If you have not done so, please refer to [this section](https://www.youtube.com/watch?v=K4vVW7iA1w8&t=457s) of the setup video. Remember to change the audio input device of the game or application you are using to Cable Output (VB-Audio Virtual Cable). There are 2 ways to run this program, either locally or using Google Colab. If you have around 5 GB of GPU VRAM to spare, feel free to run this locally while using the `crepe` pitch extraction algorithm. If you only have around 3 GB of GPU VRAM, you can also run this locally while using the `pm` pitch extraction algorithm. If none of these requirements are met, you should run this using Google Colab. ### Local To start the program, open a command line window in the extracted RVC-beta folder (Should have a bunch of folders and files) and run this command. ```runtime\python.exe AniVoiceChanger\main_local.py``` > Do note that every time a variable is updated in the `.env` file, you will have to rerun this command for the changes to take effect. E.g. when changing the model name. Now, hold the key set as `MIC_RECORD_KEY` in your .env file on your keyboard and speak into your mic.
The first time, this might take around 5 seconds to generate and play the voice. For consecutive uses, the time taken will be drastically reduced with caching. The voice will be played into the Cable Output audio device and your speakers as defined in the `.env` file. The generated voice will also be written into [this folder](audio/) as `output.wav` file. ### Google Colab Go to the [AniVoiceChanger_colab.ipynb](AniVoiceChanger_colab.ipynb) file on GitHub and click on the `Open in Colab` badge. This will open a Colab notebook. Follow the instructions in the notebook to either train a voice model, or run the RVC Inference server. If you have already uploaded a trained voice model to the Colab runtime and it has started running the inference server, the output of the last cell should display an Ngrok public URL. Copy and paste this URL into the `COLAB_URL` environment variable in your `.env` file. After all your environment variables are properly set, open a command line window in the extracted RVC-beta folder (Should have a bunch of folders and files) and run this command. Do note that every time a variable is updated in the `.env` file, you will have to rerun this command for the changes to take effect. E.g. when changing the model name. ```runtime\python.exe AniVoiceChanger\main_colab.py``` > Do note that every time a variable is updated in the `.env` file, you will have to rerun this command for the changes to take effect. E.g. when changing the model name. Now, hold the key set as `MIC_RECORD_KEY` in your .env file on your keyboard and speak into your mic. The first time, this might take around 10 seconds to generate and play the voice. For consecutive uses, the time taken will be drastically reduced with caching. The voice will be played into the Cable Output audio device and your speakers as defined in the `.env` file. The generated voice will also be written into [this folder](audio/) as `output.wav` file.
## Terms of Use The use of the converted voice for the following purposes is prohibited. * Criticizing or attacking individuals. * Advocating for or opposing specific political positions, religions, or ideologies. * Publicly displaying strongly stimulating expressions without proper zoning. * Selling of voice models and generated voice clips. * Impersonation of the original owner of the voice with malicious intentions to harm/hurt others. * Fraudulent purposes that lead to identity theft or fraudulent phone calls. ## Disclaimer I am not liable for any direct, indirect, consequential, incidental, or special damages arising out of or in any way connected with the use/misuse or inability to use this software. ================================================ FILE: audio/AUDIO.txt ================================================ Input and Output audio files are created and stored here. ================================================ FILE: extra_requirements.txt ================================================ keyboard pyaudio python-dotenv ================================================ FILE: get_audio_devices.py ================================================ import sounddevice as sd if __name__ == '__main__': for device in sd.query_devices(): print(f"{device['index']}: {device['name']}") ================================================ FILE: main_colab.py ================================================ import sys import wave from pathlib import Path from time import sleep, time from os import getenv from urllib.parse import urlencode import keyboard from threading import Thread BASE_DIR = Path(__file__).resolve().parent.parent sys.path.append(str(BASE_DIR)) import pyaudio import requests import sounddevice as sd import soundfile as sf from dotenv import load_dotenv # load environment variables load_dotenv() COLAB_URL = getenv('COLAB_URL') MODEL_NAME = getenv('MODEL_NAME') if MODEL_NAME.endswith('.pth'): MODEL_NAME = MODEL_NAME[:-4] PITCH_CHANGE = 
int(getenv('PITCH_CHANGE')) VOLUME_ENVELOPE = float(getenv('VOLUME_ENVELOPE')) INDEX_RATE = float(getenv('INDEX_RATE')) if getenv('INDEX_RATE') else 0 PITCH_EXTRACTION_ALGO = getenv('PITCH_EXTRACTION_ALGO') GPU_INDEX = getenv('GPU_INDEX') MIC_RECORD_KEY = getenv('MIC_RECORD_KEY') INGAME_PUSH_TO_TALK_KEY = getenv('INGAME_PUSH_TO_TALK_KEY') MICROPHONE_ID = int(getenv('MICROPHONE_ID')) if getenv('MICROPHONE_ID') else None SPEAKERS_INPUT_ID = int(getenv('SPEAKERS_INPUT_ID')) if getenv('SPEAKERS_INPUT_ID') else None def rvc_infer_colab(): params_encoded = urlencode({'model': MODEL_NAME, 'pitch': PITCH_CHANGE, 'algo': PITCH_EXTRACTION_ALGO, 'volume': VOLUME_ENVELOPE, 'index_rate': INDEX_RATE}) with open(INPUT_VOICE_PATH, 'rb') as infile: files = {'audio_file': infile} r = requests.post(f'{COLAB_URL}/infer?{params_encoded}', files=files) with open(OUTPUT_VOICE_PATH, 'wb') as outfile: outfile.write(r.content) def play_voice(device_id): data, fs = sf.read(OUTPUT_VOICE_PATH, dtype='float32') if INGAME_PUSH_TO_TALK_KEY: keyboard.press(INGAME_PUSH_TO_TALK_KEY) sd.play(data, fs, device=device_id) sd.wait() if INGAME_PUSH_TO_TALK_KEY: keyboard.release(INGAME_PUSH_TO_TALK_KEY) def on_press_key(_): global frames, recording, stream if not recording: print('\nRecording has started.') frames = [] recording = True stream = p.open(format=FORMAT, channels=MIC_CHANNELS, rate=MIC_SAMPLING_RATE, input=True, frames_per_buffer=CHUNK, input_device_index=MICROPHONE_ID) def on_release_key(_): global recording, stream recording = False stream.stop_stream() stream.close() stream = None # if key not held down for long enough if not frames or len(frames) < 20: print('No audio file to transcribe detected. 
Hold down the key for a longer time.') return print('Converting voice...') start_time = time() # write microphone audio to file wf = wave.open(str(INPUT_VOICE_PATH), 'wb') wf.setnchannels(MIC_CHANNELS) wf.setsampwidth(p.get_sample_size(FORMAT)) wf.setframerate(MIC_SAMPLING_RATE) wf.writeframes(b''.join(frames)) wf.close() # voice change rvc_infer_colab() print(f'Time taken for RVC voice conversion: {time() - start_time}s') # play to both app mic input and speakers threads = [Thread(target=play_voice, args=[CABLE_INPUT_ID]), Thread(target=play_voice, args=[SPEAKERS_INPUT_ID])] [t.start() for t in threads] [t.join() for t in threads] if __name__ == '__main__': INPUT_VOICE_PATH = str(BASE_DIR / 'AniVoiceChanger' / 'audio' / 'input.mp3') OUTPUT_VOICE_PATH = str(BASE_DIR / 'AniVoiceChanger' / 'audio' / 'output.wav') CHUNK = 1024 FORMAT = pyaudio.paInt16 p = pyaudio.PyAudio() if MICROPHONE_ID is None: MICROPHONE_ID = p.get_default_input_device_info()['index'] if SPEAKERS_INPUT_ID is None: SPEAKERS_INPUT_ID = p.get_default_output_device_info()['index'] CABLE_INPUT_ID = None for audio_device in sd.query_devices(): if 'CABLE Input' in audio_device['name']: CABLE_INPUT_ID = audio_device['index'] break if not CABLE_INPUT_ID: print('Virtual audio cable was not found. 
Please download and install it.') sys.exit() # get channels and sampling rate of mic mic_info = p.get_device_info_by_index(MICROPHONE_ID) MIC_CHANNELS = mic_info['maxInputChannels'] MIC_SAMPLING_RATE = 40000 print('Voice changer is booting up...') frames = [] recording = False stream = None keyboard.on_press_key(MIC_RECORD_KEY, on_press_key) keyboard.on_release_key(MIC_RECORD_KEY, on_release_key) try: print('Voice changer is ready.') while True: if recording and stream: data = stream.read(CHUNK) frames.append(data) else: sleep(0.2) except KeyboardInterrupt: print('Closing voice changer...') ================================================ FILE: main_local.py ================================================ import sys import wave from pathlib import Path from time import sleep, time from os import getenv import keyboard from threading import Thread BASE_DIR = Path(__file__).resolve().parent.parent sys.path.append(str(BASE_DIR)) import torch import pyaudio import sounddevice as sd import soundfile as sf from multiprocessing import cpu_count from dotenv import load_dotenv from vc_infer_pipeline import VC from infer_pack.models import ( SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono, SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono, ) from my_utils import load_audio from fairseq import checkpoint_utils from scipy.io import wavfile # load environment variables load_dotenv() MODEL_NAME = getenv('MODEL_NAME') if MODEL_NAME.endswith('.pth'): MODEL_NAME = MODEL_NAME[:-4] PITCH_CHANGE = int(getenv('PITCH_CHANGE')) VOLUME_ENVELOPE = float(getenv('VOLUME_ENVELOPE')) INDEX_RATE = float(getenv('INDEX_RATE')) if getenv('INDEX_RATE') else 0 PITCH_EXTRACTION_ALGO = getenv('PITCH_EXTRACTION_ALGO') GPU_INDEX = getenv('GPU_INDEX') MIC_RECORD_KEY = getenv('MIC_RECORD_KEY') INGAME_PUSH_TO_TALK_KEY = getenv('INGAME_PUSH_TO_TALK_KEY') MICROPHONE_ID = int(getenv('MICROPHONE_ID')) if getenv('MICROPHONE_ID') else None SPEAKERS_INPUT_ID = int(getenv('SPEAKERS_INPUT_ID')) 
class Config:
    """Device and precision settings handed to the RVC ``VC`` pipeline.

    Attributes set by the constructor:
        device: torch device string; replaced by "mps" or "cpu" when CUDA is
            not available.
        is_half: whether half precision is used; forced to False on GPUs
            matched by the name check below and on the MPS/CPU fallbacks.
        n_cpu: CPU count reported by ``multiprocessing.cpu_count``.
        gpu_name / gpu_mem: GPU name (only kept for GPUs forced to single
            precision) and total GPU memory in GiB, or None without CUDA.
        x_pad, x_query, x_center, x_max: window parameters chosen from the
            precision mode and the available GPU memory.
    """

    def __init__(self, device, is_half):
        self.device = device
        self.is_half = is_half
        self.n_cpu = 0
        self.gpu_name = None
        self.gpu_mem = None
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    @staticmethod
    def _replace_in_file(path, old, new):
        """Rewrite the text file at *path* with every *old* replaced by *new*."""
        with open(path, "r") as f:
            patched = f.read().replace(old, new)
        with open(path, "w") as f:
            f.write(patched)

    def device_config(self) -> tuple:
        """Resolve device/precision and return ``(x_pad, x_query, x_center, x_max)``.

        Side effects: may update ``device``, ``is_half``, ``n_cpu``,
        ``gpu_name`` and ``gpu_mem``; on some GPUs it also rewrites files
        under ``configs/`` and ``trainset_preprocess_pipeline_print.py``
        relative to the current working directory.
        """
        # ``torch.backends.mps`` does not exist on older torch releases.
        mps_backend = getattr(torch.backends, "mps", None)

        if torch.cuda.is_available():
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            gpu_name_upper = self.gpu_name.upper()
            if (
                ("16" in self.gpu_name and "V100" not in gpu_name_upper)
                or "P40" in gpu_name_upper
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                print("16 series/10 series P40 forced single precision")
                self.is_half = False
                # NOTE(review): presumably flips the fp16 switch in the
                # training configs -- confirm against the RVC config files.
                for config_file in ("32k.json", "40k.json", "48k.json"):
                    self._replace_in_file(f"configs/{config_file}", "true", "false")
                self._replace_in_file(
                    "trainset_preprocess_pipeline_print.py", "3.7", "3.0"
                )
            else:
                self.gpu_name = None

            # Total memory in GiB; the +0.4 rounds cards that report slightly
            # less than their nominal size up to the next integer.
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
            if self.gpu_mem <= 4:
                self._replace_in_file(
                    "trainset_preprocess_pipeline_print.py", "3.7", "3.0"
                )
        elif mps_backend is not None and mps_backend.is_available():
            print("No supported N-card found, use MPS for inference")
            self.device = "mps"
            # Half precision is only kept for the CUDA path.
            self.is_half = False
        else:
            print("No supported N-card found, use CPU for inference")
            self.device = "cpu"
            # Run in single precision on CPU: half-precision inference is not
            # reliably supported there.
            self.is_half = False

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()

        if self.is_half:
            # 6G memory config
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # 5G memory config
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        # Small GPUs get the most conservative windows regardless of precision.
        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        return x_pad, x_query, x_center, x_max
def load_hubert():
    """Load ``hubert_base.pt`` and return it on ``device`` in eval mode.

    Reads the module-level ``device`` and ``is_half`` set by the entry script
    to pick the target device and floating-point precision.
    """
    # Only the model list is needed; the saved config and task are discarded.
    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(['hubert_base.pt'], suffix='', )
    hubert = models[0].to(device)
    hubert = hubert.half() if is_half else hubert.float()
    hubert.eval()
    return hubert


def get_vc():
    """Load the voice model ``weights/<MODEL_NAME>.pth`` and build the pipeline.

    Returns:
        Tuple ``(cpt, version, net_g, tgt_sr, vc)``: the raw checkpoint dict,
        its version string, the synthesizer network, the last entry of the
        checkpoint config (used as output sample rate in rvc_infer) and the
        ``VC`` pipeline object.

    Raises:
        FileNotFoundError: if the model file does not exist.
        ValueError: if the checkpoint declares a version other than v1/v2.
    """
    model_path = BASE_DIR / 'weights' / f'{MODEL_NAME}.pth'
    if not model_path.exists():
        raise FileNotFoundError(
            f'The model {model_path} does not exist. Please ensure that you have filled in the proper MODEL_NAME in your .env file.'
        )

    model_path = str(model_path)
    print(f'loading pth {model_path}')
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    cpt = torch.load(model_path, map_location='cpu')
    tgt_sr = cpt["config"][-1]
    # Overwrite the config entry with the first dimension of the stored
    # ``emb_g.weight`` tensor so the network is built to match the weights.
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    if_f0 = cpt.get("f0", 1)
    version = cpt.get("version", "v1")

    if version == "v1":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
        else:
            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    elif version == "v2":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half)
        else:
            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
    else:
        # Previously this fell through and failed later with an unbound name.
        raise ValueError(f'Unsupported model version {version!r} in {model_path}')

    # NOTE(review): enc_q is dropped before loading the weights; presumably it
    # is only needed for training -- confirm against the RVC sources.
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(device)
    net_g = net_g.half() if is_half else net_g.float()

    vc = VC(tgt_sr, config)
    return cpt, version, net_g, tgt_sr, vc


def rvc_infer():
    """Convert the recording at INPUT_VOICE_PATH and write OUTPUT_VOICE_PATH.

    Uses the module-level model objects created by the entry script
    (``cpt``, ``vc``, ``hubert_model``, ``net_g``, ``tgt_sr``, ``version``).
    """
    logs_dir = BASE_DIR / 'logs' / MODEL_NAME
    # The feature index is optional: use the first ``*.index`` file if the
    # logs folder exists, otherwise pass an empty path. Sorting makes the
    # choice deterministic when several index files are present.
    index_path = ''
    if logs_dir.is_dir():
        index_files = sorted(f for f in logs_dir.iterdir() if f.suffix == '.index')
        if index_files:
            index_path = str(index_files[0])

    # vc single
    audio = load_audio(INPUT_VOICE_PATH, 16000)
    times = [0, 0, 0]
    if_f0 = cpt.get('f0', 1)
    # NOTE(review): the positional arguments follow the VC.pipeline signature
    # of the RVC revision this project is installed into -- verify on upgrade.
    audio_opt = vc.pipeline(hubert_model, net_g, 0, audio, INPUT_VOICE_PATH, times, PITCH_CHANGE,
                            PITCH_EXTRACTION_ALGO, index_path, INDEX_RATE, if_f0, 3, tgt_sr, 0,
                            VOLUME_ENVELOPE, version, 0.33, f0_file=None)
    wavfile.write(OUTPUT_VOICE_PATH, tgt_sr, audio_opt)


def play_voice(device_id):
    """Play OUTPUT_VOICE_PATH on the output device *device_id* and block until done.

    When INGAME_PUSH_TO_TALK_KEY is configured, that key is held down for the
    duration of the playback.
    """
    data, fs = sf.read(OUTPUT_VOICE_PATH, dtype='float32')

    if INGAME_PUSH_TO_TALK_KEY:
        keyboard.press(INGAME_PUSH_TO_TALK_KEY)
    try:
        # NOTE(review): this function is run from two threads at once;
        # sd.play/sd.wait are convenience helpers -- confirm they are safe to
        # call concurrently or switch to one sd.OutputStream per device.
        sd.play(data, fs, device=device_id)
        sd.wait()
    finally:
        # Always release the key, even if playback failed, so it cannot get
        # stuck in the pressed state.
        if INGAME_PUSH_TO_TALK_KEY:
            keyboard.release(INGAME_PUSH_TO_TALK_KEY)
def on_press_key(_):
    """Hotkey-press callback: open the microphone stream and start recording.

    Repeated press events while a recording is already running are ignored.
    """
    global frames, recording, stream

    if recording:
        return

    print('\nRecording has started.')
    frames = []
    # Open the stream before flipping the flag: if opening fails, the state
    # stays "not recording" and the next key press can retry.
    stream = p.open(format=FORMAT,
                    channels=MIC_CHANNELS,
                    rate=MIC_SAMPLING_RATE,
                    input=True,
                    frames_per_buffer=CHUNK,
                    input_device_index=MICROPHONE_ID)
    recording = True


def on_release_key(_):
    """Hotkey-release callback: stop recording, convert the voice and play it.

    The recorded chunks are written to INPUT_VOICE_PATH, converted with
    rvc_infer() and the result is played on both the virtual cable and the
    speakers in parallel.
    """
    global recording, stream

    recording = False
    # A release event can arrive without an open stream (e.g. the stream
    # failed to open on press); there is nothing to stop in that case.
    if stream is None:
        return
    stream.stop_stream()
    stream.close()
    stream = None

    # if key not held down for long enough
    if len(frames) < 20:
        print('No audio file to transcribe detected. Hold down the key for a longer time.')
        return

    print('Converting voice...')
    start_time = time()

    # write microphone audio to file; the context manager closes the file
    # even if writing fails
    with wave.open(str(INPUT_VOICE_PATH), 'wb') as wf:
        wf.setnchannels(MIC_CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(MIC_SAMPLING_RATE)
        wf.writeframes(b''.join(frames))

    # voice change
    rvc_infer()
    print(f'Time taken for RVC voice conversion: {time() - start_time}s')

    # play to both app mic input and speakers
    threads = [Thread(target=play_voice, args=[CABLE_INPUT_ID]),
               Thread(target=play_voice, args=[SPEAKERS_INPUT_ID])]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
if __name__ == '__main__':
    # Entry point of the local voice changer: resolve device/precision, find
    # the audio devices, load the models, then record while the hotkey is
    # held until Ctrl+C is pressed.

    # Default to GPU 0 when GPU_INDEX is not configured instead of building
    # the invalid device string "cuda:None".
    device = f'cuda:{GPU_INDEX or 0}'
    is_half = True
    config = Config(device, is_half)
    # Config may fall back to another device or to single precision; keep the
    # globals read by load_hubert()/get_vc() in sync with what it resolved.
    device = config.device
    is_half = config.is_half

    # NOTE: on_release_key writes the recording with the ``wave`` module, so
    # this file contains WAV data despite its .mp3 extension.
    INPUT_VOICE_PATH = str(BASE_DIR / 'AniVoiceChanger' / 'audio' / 'input.mp3')
    OUTPUT_VOICE_PATH = str(BASE_DIR / 'AniVoiceChanger' / 'audio' / 'output.wav')
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    p = pyaudio.PyAudio()

    # Fall back to the system default devices when no id was configured.
    if MICROPHONE_ID is None:
        MICROPHONE_ID = p.get_default_input_device_info()['index']

    if SPEAKERS_INPUT_ID is None:
        SPEAKERS_INPUT_ID = p.get_default_output_device_info()['index']

    # Locate the virtual audio cable that feeds the converted voice to the
    # target application's microphone input.
    CABLE_INPUT_ID = None
    for audio_device in sd.query_devices():
        if 'CABLE Input' in audio_device['name']:
            CABLE_INPUT_ID = audio_device['index']
            break

    # Compare against None explicitly: device index 0 is a valid device and
    # must not be reported as "not found".
    if CABLE_INPUT_ID is None:
        print('Virtual audio cable was not found. Please download and install it.')
        p.terminate()
        sys.exit(1)

    # get channels and sampling rate of mic
    mic_info = p.get_device_info_by_index(MICROPHONE_ID)
    MIC_CHANNELS = mic_info['maxInputChannels']
    MIC_SAMPLING_RATE = 40000

    print('Voice changer is booting up...')
    # load hubert model
    hubert_model = load_hubert()

    # get vc
    cpt, version, net_g, tgt_sr, vc = get_vc()

    frames = []
    recording = False
    stream = None

    keyboard.on_press_key(MIC_RECORD_KEY, on_press_key)
    keyboard.on_release_key(MIC_RECORD_KEY, on_release_key)

    try:
        print('Voice changer is ready.')
        while True:
            # Snapshot the global once: the key callbacks reset ``stream`` to
            # None outside of this loop, so reading the global twice could
            # dereference None between the check and the read.
            active_stream = stream
            if recording and active_stream:
                frames.append(active_stream.read(CHUNK))
            else:
                sleep(0.2)

    except KeyboardInterrupt:
        print('Closing voice changer...')

    finally:
        # Release the PortAudio resources held by PyAudio.
        p.terminate()