Repository: SociallyIneptWeeb/AniVoiceChanger
Branch: main
Commit: fc16d0c18efd
Files: 10
Total size: 42.8 KB
Directory structure:
gitextract_wa9osrpx/
├── .gitattributes
├── .gitignore
├── AniVoiceChanger_colab.ipynb
├── LICENSE
├── README.md
├── audio/
│ └── AUDIO.txt
├── extra_requirements.txt
├── get_audio_devices.py
├── main_colab.py
└── main_local.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
# Auto detect text files and perform LF normalization
* text=auto
================================================
FILE: .gitignore
================================================
.idea
audio/*.mp3
audio/*.wav
================================================
FILE: AniVoiceChanger_colab.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/SociallyIneptWeeb/AniVoiceChanger/blob/main/AniVoiceChanger_colab.ipynb)"
],
"metadata": {
"id": "wEsZxTheMO8_"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jwu07JgqoFON",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title Mount Drive\n",
"\n",
"from google.colab import drive\n",
"drive.mount('/content/drive')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ge_97mfpgqTm",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title Clone repository\n",
"!git init\n",
"!git remote add origin https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git\n",
"!git fetch origin 195a14e5c51ec02774c4d1961d0a4b67755e25c8 --depth=1\n",
"!git reset --hard FETCH_HEAD"
]
},
{
"cell_type": "code",
"source": [
"#@title Set Mode and Parameters\n",
"#@markdown ## Mode\n",
"#@markdown To run the WebUI for training a voice model, set the mode to Training.\n",
"\n",
"#@markdown To run the voice changer server for main_colab.py to connect to, set the mode to Inference.\n",
"MODE = 'Training' #@param ['Training', 'Inference']\n",
"\n",
"#@markdown If MODE: Training, specify the path to a zip file containing the voice clips in your google drive to be used for training. If MODE: Inference, ignore.\n",
"DATASET = 'char_voice_lines.zip' #@param {type:\"string\"}\n",
"DATASET = '/content/drive/MyDrive/' + DATASET\n",
"\n",
"if MODE == 'Training':\n",
" !mkdir -p dataset\n",
" !unzip -d dataset -B {DATASET}\n",
" # rename duplicate filenames in dataset\n",
" !ls -a /content/dataset/\n",
" !rename 's/(\\w+)\\.(\\w+)~(\\d*)/$1_$3.$2/' /content/dataset/*.*~*\n",
"\n",
"#@markdown ## Upload a trained model\n",
"#@markdown Only fill the below fields if you would like to continue training a previously trained model or use it for inference. Specify the name and epoch number of the model to be used or trained. The folder containing the trained files in your google drive will be used.\n",
"\n",
"MODELNAME = \"\" #@param {type:\"string\"}\n",
"MODELEPOCH = 2333333 #@param {type:\"integer\"}\n",
"if MODELNAME:\n",
" !mkdir -p /content/logs/{MODELNAME}\n",
" !cp /content/drive/MyDrive/{MODELNAME}_files/*.index /content/logs/{MODELNAME}/\n",
" !cp /content/drive/MyDrive/{MODELNAME}_files/{MODELNAME}.pth /content/weights/\n",
" if MODE == 'Training':\n",
" !cp /content/drive/MyDrive/{MODELNAME}_files/D_{MODELEPOCH}.pth /content/logs/{MODELNAME}/\n",
" !cp /content/drive/MyDrive/{MODELNAME}_files/G_{MODELEPOCH}.pth /content/logs/{MODELNAME}/\n",
" !cp /content/drive/MyDrive/{MODELNAME}_files/*.npy /content/logs/{MODELNAME}/"
],
"metadata": {
"id": "S8wlJabmgeBK",
"cellView": "form"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pqE0PrnuRqI2",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title Install requirements\n",
"!pip install -r requirements.txt\n",
"!apt -y install -qq aria2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UG3XpUwEomUz",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title Download pretrained models\n",
"\n",
"# v1\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d /content/pretrained -o D32k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d /content/pretrained -o D40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d /content/pretrained -o D48k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d /content/pretrained -o G32k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d /content/pretrained -o G40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d /content/pretrained -o G48k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth -d /content/pretrained -o f0D32k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth -d /content/pretrained -o f0D40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth -d /content/pretrained -o f0D48k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth -d /content/pretrained -o f0G32k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth -d /content/pretrained -o f0G40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth -d /content/pretrained -o f0G48k.pth\n",
"\n",
"# v2\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d /content/pretrained_v2 -o D40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d /content/pretrained_v2 -o G40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d /content/pretrained_v2 -o f0D40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d /content/pretrained_v2 -o f0G40k.pth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HugjmZqZRuiF",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title Download Vocal Separation model\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d /content/uvr5_weights -o HP2-人声vocals+非人声instrumentals.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d /content/uvr5_weights -o HP5-主旋律人声vocals+其他instrumentals.pth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2RCaT9FTR0ej",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title Download hubert_base model\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d /content -o hubert_base.pt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7vh6vphDwO0b",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title Run WebUI for Training, skip if Inference\n",
"if MODE == 'Training':\n",
" # %load_ext tensorboard\n",
" # %tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs\n",
" !python3 infer-web.py --colab --pycmd python3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FgJuNeAwx5Y_",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title Manually back up the trained model files to Google Drive for Mode: \"Training\"\n",
"#@markdown MODELNAME should be the EXPERIMENT_NAME that you typed.\n",
"\n",
"#@markdown If the name of the model is john, and in the logs/john folder is a file called D_2333333.pth, set the MODELNAME: john and MODELEPOCH: 2333333\n",
"\n",
"if MODE == 'Training':\n",
"\n",
"#@markdown Model name\n",
" MODELNAME = \"\" #@param {type:\"string\"}\n",
" if MODELNAME:\n",
"#@markdown Epoch number\n",
" MODELEPOCH = 2333333 #@param {type:\"integer\"}\n",
"#@markdown Save intermediate models if you would like to continue training the model later\n",
" SAVE_INTERMEDIATE = True #@param {type:\"boolean\"}\n",
"\n",
" !mkdir -p /content/drive/MyDrive/{MODELNAME}_files\n",
"\n",
" if SAVE_INTERMEDIATE:\n",
" !cp /content/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_files/\n",
" !cp /content/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_files/\n",
" !cp /content/logs/{MODELNAME}/total_*.npy /content/drive/MyDrive/{MODELNAME}_files/\n",
"\n",
" !cp /content/logs/{MODELNAME}/added_*.index /content/drive/MyDrive/{MODELNAME}_files/\n",
" !cp /content/weights/{MODELNAME}.pth /content/drive/MyDrive/{MODELNAME}_files/"
]
},
{
"cell_type": "markdown",
"source": [
"# Inference\n",
"\n",
"The code below is to be run for Mode: Inference\n",
"\n",
"When prompted `Proceed (Y/n)?`, click beside it, type `Y` and press `Enter`.\n",
"\n",
"If `WARNING: The following packages were previously imported in this runtime: [numpy] You must restart the runtime in order to use newly installed versions.` is seen in the output, click `Restart Runtime` and then continue running the next cell."
],
"metadata": {
"id": "0JTjkTndoHb6"
}
},
{
"cell_type": "code",
"source": [
"#@title Install specific numpy version. If needed, click Restart Runtime before running the bottom two cells.\n",
"if MODE == 'Inference':\n",
" !pip uninstall numpy\n",
" !pip install numpy==1.23.5"
],
"metadata": {
"id": "cfJV7kbKqtnw",
"cellView": "form"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"try:\n",
" MODE\n",
"except NameError:\n",
" MODE = 'Inference'\n",
"\n",
"#@title Set your NGROK_AUTH_TOKEN.\n",
"if MODE == 'Inference':\n",
" !pip install flask-ngrok3 -q\n",
"\n",
"#@markdown Obtain your Ngrok auth token from [here](https://dashboard.ngrok.com/get-started/your-authtoken)\n",
" NGROK_AUTH_TOKEN = '' #@param {type:\"string\"}"
],
"metadata": {
"id": "OL-1YvcbOiWd",
"cellView": "form"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"if MODE != 'Inference':\n",
" raise Exception('Mode is not set to Inference.')\n",
"\n",
"\n",
"#@title Run RVC Inference server\n",
"import json\n",
"import sys\n",
"import wave\n",
"from pathlib import Path\n",
"\n",
"BASE_DIR = Path('/content')\n",
"sys.path.append(str(BASE_DIR))\n",
"\n",
"import torch\n",
"from multiprocessing import cpu_count\n",
"from flask_ngrok3 import run_with_ngrok\n",
"from flask import Flask, request, send_file\n",
"\n",
"from vc_infer_pipeline import VC\n",
"from infer_pack.models import (\n",
" SynthesizerTrnMs256NSFsid,\n",
" SynthesizerTrnMs256NSFsid_nono,\n",
" SynthesizerTrnMs768NSFsid,\n",
" SynthesizerTrnMs768NSFsid_nono,\n",
")\n",
"from my_utils import load_audio\n",
"from fairseq import checkpoint_utils\n",
"from scipy.io import wavfile\n",
"\n",
"\n",
"INPUT_VOICE_PATH = 'input.mp3'\n",
"OUTPUT_VOICE_PATH = 'output.wav'\n",
"MODEL_NAME = ''\n",
"DEVICE = 'cuda:0'\n",
"cpt = None\n",
"\n",
"\n",
"class Config:\n",
" def __init__(self, device, is_half):\n",
" self.device = device\n",
" self.is_half = is_half\n",
" self.n_cpu = 0\n",
" self.gpu_name = None\n",
" self.gpu_mem = None\n",
" self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()\n",
"\n",
" def device_config(self) -> tuple:\n",
" if torch.cuda.is_available():\n",
" i_device = int(self.device.split(\":\")[-1])\n",
" self.gpu_name = torch.cuda.get_device_name(i_device)\n",
"\n",
" if (\n",
" (\"16\" in self.gpu_name and \"V100\" not in self.gpu_name.upper())\n",
" or \"P40\" in self.gpu_name.upper()\n",
" or \"1060\" in self.gpu_name\n",
" or \"1070\" in self.gpu_name\n",
" or \"1080\" in self.gpu_name\n",
" ):\n",
" print(\"16 series/10 series P40 forced single precision\")\n",
" self.is_half = False\n",
" for config_file in [\"32k.json\", \"40k.json\", \"48k.json\"]:\n",
" with open(f\"configs/{config_file}\", \"r\") as f:\n",
" strr = f.read().replace(\"true\", \"false\")\n",
" with open(f\"configs/{config_file}\", \"w\") as f:\n",
" f.write(strr)\n",
" with open(\"trainset_preprocess_pipeline_print.py\", \"r\") as f:\n",
" strr = f.read().replace(\"3.7\", \"3.0\")\n",
" with open(\"trainset_preprocess_pipeline_print.py\", \"w\") as f:\n",
" f.write(strr)\n",
" else:\n",
" self.gpu_name = None\n",
"\n",
" self.gpu_mem = int(\n",
" torch.cuda.get_device_properties(i_device).total_memory\n",
" / 1024\n",
" / 1024\n",
" / 1024\n",
" + 0.4\n",
" )\n",
" if self.gpu_mem <= 4:\n",
" with open(\"trainset_preprocess_pipeline_print.py\", \"r\") as f:\n",
" strr = f.read().replace(\"3.7\", \"3.0\")\n",
" with open(\"trainset_preprocess_pipeline_print.py\", \"w\") as f:\n",
" f.write(strr)\n",
"\n",
" elif torch.backends.mps.is_available():\n",
" print(\"No supported N-card found, use MPS for inference\")\n",
" self.device = \"mps\"\n",
" else:\n",
" print(\"No supported N-card found, use CPU for inference\")\n",
" self.device = \"cpu\"\n",
" self.is_half = True\n",
"\n",
" if self.n_cpu == 0:\n",
" self.n_cpu = cpu_count()\n",
"\n",
" if self.is_half:\n",
" # 6G memory config\n",
" x_pad = 3\n",
" x_query = 10\n",
" x_center = 60\n",
" x_max = 65\n",
" else:\n",
" # 5G memory config\n",
" x_pad = 1\n",
" x_query = 6\n",
" x_center = 38\n",
" x_max = 41\n",
"\n",
" if self.gpu_mem != None and self.gpu_mem <= 4:\n",
" x_pad = 1\n",
" x_query = 5\n",
" x_center = 30\n",
" x_max = 32\n",
"\n",
" return x_pad, x_query, x_center, x_max\n",
"\n",
"\n",
"CONFIG = Config(DEVICE, True)\n",
"\n",
"\n",
"def load_hubert():\n",
" models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(['hubert_base.pt'], suffix='', )\n",
" hubert = models[0]\n",
" hubert = hubert.to(DEVICE)\n",
"\n",
" if True:\n",
" hubert = hubert.half()\n",
" else:\n",
" hubert = hubert.float()\n",
"\n",
" hubert.eval()\n",
" return hubert\n",
"\n",
"HUBERT_MODEL = load_hubert()\n",
"\n",
"\n",
"def get_vc(device, is_half, config):\n",
" global cpt, version, net_g, tgt_sr, vc\n",
" model_path = BASE_DIR / 'weights' / f'{MODEL_NAME}.pth'\n",
" if not model_path.exists():\n",
" print(f'The model {model_path} does not exist. Please ensure that you have filled in the proper MODEL_NAME in your .env file.')\n",
" return None\n",
"\n",
" model_path = str(model_path)\n",
" print(f'loading pth {model_path}')\n",
" cpt = torch.load(model_path, map_location='cpu')\n",
" tgt_sr = cpt[\"config\"][-1]\n",
" cpt[\"config\"][-3] = cpt[\"weight\"][\"emb_g.weight\"].shape[0]\n",
" if_f0 = cpt.get(\"f0\", 1)\n",
" version = cpt.get(\"version\", \"v1\")\n",
"\n",
" if version == \"v1\":\n",
" if if_f0 == 1:\n",
" net_g = SynthesizerTrnMs256NSFsid(*cpt[\"config\"], is_half=is_half)\n",
" else:\n",
" net_g = SynthesizerTrnMs256NSFsid_nono(*cpt[\"config\"])\n",
" elif version == \"v2\":\n",
" if if_f0 == 1:\n",
" net_g = SynthesizerTrnMs768NSFsid(*cpt[\"config\"], is_half=is_half)\n",
" else:\n",
" net_g = SynthesizerTrnMs768NSFsid_nono(*cpt[\"config\"])\n",
"\n",
" del net_g.enc_q\n",
" print(net_g.load_state_dict(cpt[\"weight\"], strict=False))\n",
" net_g.eval().to(device)\n",
"\n",
" if is_half:\n",
" net_g = net_g.half()\n",
" else:\n",
" net_g = net_g.float()\n",
"\n",
" vc = VC(tgt_sr, config)\n",
"\n",
"\n",
"def rvc_infer(pitch_change, pitch_extraction_algo, volume_envelope, index_rate):\n",
" logs_dir = BASE_DIR / 'logs' / MODEL_NAME\n",
" index_path = ''\n",
" for file in logs_dir.iterdir():\n",
" if file.suffix == '.index':\n",
" index_path = str(logs_dir / file.name)\n",
" break\n",
"\n",
" # vc single\n",
" audio = load_audio(INPUT_VOICE_PATH, 16000)\n",
" times = [0, 0, 0]\n",
" if_f0 = cpt.get('f0', 1)\n",
" audio_opt = vc.pipeline(HUBERT_MODEL, net_g, 0, audio, INPUT_VOICE_PATH, times, pitch_change, pitch_extraction_algo, index_path, index_rate, if_f0, 3, tgt_sr, 0, volume_envelope, version, 0.33, f0_file=None)\n",
" wavfile.write(OUTPUT_VOICE_PATH, tgt_sr, audio_opt)\n",
"\n",
"\n",
"app = Flask(__name__)\n",
"run_with_ngrok(app, auth_token=NGROK_AUTH_TOKEN)\n",
"\n",
"@app.route('/', methods=['GET'])\n",
"def test():\n",
" response = {'status':'OK','message':'Test'}\n",
" return json.dumps(response)\n",
"\n",
"\n",
"@app.route('/infer', methods=['POST'])\n",
"def infer():\n",
" global MODEL_NAME, cpt\n",
" model_name = request.args.get('model')\n",
" if MODEL_NAME != model_name:\n",
" MODEL_NAME = model_name\n",
" if cpt:\n",
" del cpt\n",
" get_vc(DEVICE, True, CONFIG)\n",
"\n",
" pitch_change = int(request.args.get('pitch'))\n",
" pitch_extraction_algo = request.args.get('algo')\n",
" volume_envelope = float(request.args.get('volume'))\n",
" index_rate = float(request.args.get('index_rate'))\n",
" audio_data = request.files['audio_file']\n",
" audio_data.save(INPUT_VOICE_PATH)\n",
" rvc_infer(pitch_change, pitch_extraction_algo, volume_envelope, index_rate)\n",
" return send_file(OUTPUT_VOICE_PATH, mimetype=\"audio/wav\")\n",
"\n",
"app.run()"
],
"metadata": {
"id": "3qcqylOdMDFg",
"cellView": "form"
},
"execution_count": null,
"outputs": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"private_outputs": true,
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2023 SociallyIneptWeeb
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# AniVoiceChanger

An "extension" for Retrieval-based Voice Conversion WebUI. Provides a way to record your voice, convert it using a trained voice model, and output it in voice-chat of any application without running the webui.
Showcase: https://www.youtube.com/watch?v=C-PqTbh0LxY
Setup Guide: https://www.youtube.com/watch?v=K4vVW7iA1w8
## Setup
### Prerequisites
#### Install Git
Follow the instructions [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) to install Git on your computer.
#### Install 7-Zip
Download and Install the 7-Zip application from [here](https://www.7-zip.org/download.html).
This is used to extract the zipped RVC WebUI application after it has been downloaded.
#### Install Virtual Audio Cable
Download the VB-CABLE Driver from [here](https://vb-audio.com/Cable/), extract all files, and run the setup program in administrator mode. Reboot after installation.
This is used to pipe the converted voice audio into the audio input of apps.
### Install RVC WebUI
If you haven't installed the RVC WebUI, download the RVC-beta.7z file from [here](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/RVC-beta.7z) and extract it using 7-Zip into a folder of your choosing. It will take around 8GB of space, not including any voice models that you may train later on.
### Clone AniVoiceChanger repository
Within the extracted RVC-beta folder (Should have a bunch of folders and files), open a command line window and run this command to clone this entire repository and install the additional dependencies required for this extension.
```
git clone https://github.com/SociallyIneptWeeb/AniVoiceChanger
runtime\python.exe -m pip install -r AniVoiceChanger\extra_requirements.txt
```
### Filling in your Environment Variables in the .env file
Follow the instructions written in the .env file and fill in the appropriate values. If unsure, please refer to [this section](https://www.youtube.com/watch?v=K4vVW7iA1w8&t=728s) of the setup video.
## Usage
This program assumes that you have already trained a voice model, with the model file in the weights folder. If you have not done so, please refer to [this section](https://www.youtube.com/watch?v=K4vVW7iA1w8&t=457s) of the setup video.
Remember to change the audio input device of the game or application you are using to Cable Output (VB-Audio Virtual Cable).
There are 2 ways to run this program, either locally or using google colab. If you have around 5 GB of GPU VRAM to spare, feel free to run this locally while using the `crepe` pitch extraction algorithm. If you only have around 3 GB of GPU VRAM, you can also run this locally while using the `pm` pitch extraction algorithm. If none of these requirements are met, you should run this using Google Colab.
### Local
To start the program, open a command line window in the extracted RVC-beta folder (Should have a bunch of folders and files) and run this command.
```runtime\python.exe AniVoiceChanger\main_local.py```
> Do note that every time a variable is updated in the `.env` file, you will have to rerun this command for the changes to take effect. E.g. when changing the model name.
Now, hold the `MIC_RECORD_KEY` defined in your `.env` file on your keyboard and speak into your mic. The first time, this might take around 5 seconds to generate and play the voice. For consecutive uses, the time taken will be drastically reduced with caching. The voice will be played into the Cable Output audio device and your speakers as defined in the `.env` file. The generated voice will also be written into [this folder](audio/) as the `output.wav` file.
### Google Colab
Go to [AniVoiceChanger_colab.ipynb](AniVoiceChanger_colab.ipynb) file in Github and click on `Open in Colab` badge. This will open a Colab notebook. Follow the instructions in the notebook to either train a voice model, or run the RVC Inference server.
If you have already uploaded a trained voice model to the Colab runtime and it has started running the inference server, the output of the last cell should display an Ngrok public URL. Copy and paste this URL into the `COLAB_URL` environment variable in your `.env` file. After all your environment variables are properly set, open a command line window in the extracted RVC-beta folder (Should have a bunch of folders and files) and run this command. Do note that every time a variable is updated in the `.env` file, you will have to rerun this command for the changes to take effect. E.g. when changing the model name.
```runtime\python.exe AniVoiceChanger\main_colab.py```
> Do note that every time a variable is updated in the `.env` file, you will have to rerun this command for the changes to take effect. E.g. when changing the model name.
Now, hold the `MIC_RECORD_KEY` defined in your `.env` file on your keyboard and speak into your mic. The first time, this might take around 10 seconds to generate and play the voice. For consecutive uses, the time taken will be drastically reduced with caching. The voice will be played into the Cable Output audio device and your speakers as defined in the `.env` file. The generated voice will also be written into [this folder](audio/) as the `output.wav` file.
## Terms of Use
The use of the converted voice for the following purposes is prohibited.
* Criticizing or attacking individuals.
* Advocating for or opposing specific political positions, religions, or ideologies.
* Publicly displaying strongly stimulating expressions without proper zoning.
* Selling of voice models and generated voice clips.
* Impersonation of the original owner of the voice with malicious intentions to harm/hurt others.
* Fraudulent purposes that lead to identity theft or fraudulent phone calls.
## Disclaimer
I am not liable for any direct, indirect, consequential, incidental, or special damages arising out of or in any way connected with the use/misuse or inability to use this software.
================================================
FILE: audio/AUDIO.txt
================================================
Input and Output audio files are created and stored here.
================================================
FILE: extra_requirements.txt
================================================
keyboard
pyaudio
python-dotenv
================================================
FILE: get_audio_devices.py
================================================
import sounddevice as sd
if __name__ == '__main__':
    # Print one "<index>: <name>" line for every device sounddevice reports.
    for dev in sd.query_devices():
        print('{}: {}'.format(dev['index'], dev['name']))
================================================
FILE: main_colab.py
================================================
import sys
import wave
from pathlib import Path
from time import sleep, time
from os import getenv
from urllib.parse import urlencode
import keyboard
from threading import Thread
BASE_DIR = Path(__file__).resolve().parent.parent
sys.path.append(str(BASE_DIR))
import pyaudio
import requests
import sounddevice as sd
import soundfile as sf
from dotenv import load_dotenv
# load environment variables
load_dotenv()


def _require_env(name):
    """Return the non-empty value of environment variable ``name``.

    Exits the program with a readable message when the variable is unset or
    empty, instead of failing later with an unrelated AttributeError/TypeError
    (e.g. calling ``.endswith`` or ``int()`` on ``None``).
    """
    value = getenv(name)
    if not value:
        sys.exit(f'Environment variable {name} is missing or empty. Please set it in your .env file.')
    return value


# Base URL of the inference server. Trailing slashes are dropped because the
# request URL is later built as f'{COLAB_URL}/infer?...'.
COLAB_URL = _require_env('COLAB_URL').rstrip('/')

# Model name is used without its '.pth' extension.
MODEL_NAME = _require_env('MODEL_NAME')
if MODEL_NAME.endswith('.pth'):
    MODEL_NAME = MODEL_NAME[:-4]

PITCH_CHANGE = int(_require_env('PITCH_CHANGE'))
VOLUME_ENVELOPE = float(_require_env('VOLUME_ENVELOPE'))
# Optional: falls back to 0 when unset or empty.
INDEX_RATE = float(getenv('INDEX_RATE')) if getenv('INDEX_RATE') else 0
PITCH_EXTRACTION_ALGO = getenv('PITCH_EXTRACTION_ALGO')
GPU_INDEX = getenv('GPU_INDEX')

# Key that starts/stops recording; registered with the keyboard hooks at startup.
MIC_RECORD_KEY = _require_env('MIC_RECORD_KEY')
# Optional key that play_voice holds down while the converted audio plays.
INGAME_PUSH_TO_TALK_KEY = getenv('INGAME_PUSH_TO_TALK_KEY')

# Optional device indices; None means "use the system default" (resolved at startup).
MICROPHONE_ID = int(getenv('MICROPHONE_ID')) if getenv('MICROPHONE_ID') else None
SPEAKERS_INPUT_ID = int(getenv('SPEAKERS_INPUT_ID')) if getenv('SPEAKERS_INPUT_ID') else None
def rvc_infer_colab():
    """Convert the recorded clip by posting it to the remote inference server.

    Uploads the file at INPUT_VOICE_PATH as the multipart field ``audio_file``
    to ``{COLAB_URL}/infer`` with the model, pitch, algorithm, volume and
    index-rate settings in the query string, then writes the response body to
    OUTPUT_VOICE_PATH.

    Raises:
        requests.RequestException: if the request cannot be sent, or (as
            ``requests.HTTPError``) if the server answers with a 4xx/5xx status.
    """
    query = urlencode({
        'model': MODEL_NAME,
        'pitch': PITCH_CHANGE,
        'algo': PITCH_EXTRACTION_ALGO,
        'volume': VOLUME_ENVELOPE,
        'index_rate': INDEX_RATE,
    })
    with open(INPUT_VOICE_PATH, 'rb') as infile:
        r = requests.post(f'{COLAB_URL}/infer?{query}', files={'audio_file': infile})

    # Without this check an error page from the server would be saved as
    # output.wav and only fail later, when the file is read for playback.
    r.raise_for_status()

    with open(OUTPUT_VOICE_PATH, 'wb') as outfile:
        outfile.write(r.content)
def play_voice(device_id):
    """Play the converted clip (OUTPUT_VOICE_PATH) on one output device.

    If INGAME_PUSH_TO_TALK_KEY is set, that key is pressed before playback and
    released afterwards.

    Args:
        device_id: output device index handed to ``sd.play``.

    NOTE(review): sd.play/sd.wait appear to work on sounddevice's module-level
    playback state; confirm that calling this from two threads at once (as
    on_release_key does) is reliable.
    """
    data, fs = sf.read(OUTPUT_VOICE_PATH, dtype='float32')

    if INGAME_PUSH_TO_TALK_KEY:
        keyboard.press(INGAME_PUSH_TO_TALK_KEY)
    try:
        sd.play(data, fs, device=device_id)
        sd.wait()
    finally:
        # Always let go of the key, even when playback fails, so it is never
        # left stuck in the pressed state.
        if INGAME_PUSH_TO_TALK_KEY:
            keyboard.release(INGAME_PUSH_TO_TALK_KEY)
def on_press_key(_):
    """Keyboard hook for the record key going down: start a new recording.

    Does nothing while a recording is already in progress. Otherwise it clears
    the frame buffer, marks recording as active and opens a microphone input
    stream, all through module-level globals.

    Args:
        _: the keyboard event (unused).
    """
    global frames, recording, stream

    if recording:
        return

    print('\nRecording has started.')
    frames = []
    recording = True
    mic_settings = dict(
        format=FORMAT,
        channels=MIC_CHANNELS,
        rate=MIC_SAMPLING_RATE,
        input=True,
        frames_per_buffer=CHUNK,
        input_device_index=MICROPHONE_ID,
    )
    stream = p.open(**mic_settings)
def on_release_key(_):
    """Keyboard hook for the record key going up: stop recording, convert, play.

    Closes the microphone stream, writes the captured frames to
    INPUT_VOICE_PATH as WAV data, sends them through rvc_infer_colab() and then
    plays the result on the virtual cable and on the speakers in parallel.

    Args:
        _: the keyboard event (unused).
    """
    global recording, stream
    recording = False

    # on_press_key sets `recording` before opening the stream, so if the open
    # failed (or no press preceded this release) there is nothing to close.
    if stream is None:
        return
    stream.stop_stream()
    stream.close()
    stream = None

    # Fewer than 20 chunks (about half a second at CHUNK=1024 and 40000 Hz)
    # means the key was not held down long enough.
    if len(frames) < 20:
        print('No audio file to transcribe detected. Hold down the key for a longer time.')
        return

    print('Converting voice...')
    start_time = time()

    # Write the microphone audio to file; the context manager closes the file
    # even if writing fails.
    with wave.open(str(INPUT_VOICE_PATH), 'wb') as wf:
        wf.setnchannels(MIC_CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(MIC_SAMPLING_RATE)
        wf.writeframes(b''.join(frames))

    # Voice change. On a failed request, report it and skip playback rather
    # than playing whatever an earlier run left in the output file.
    try:
        rvc_infer_colab()
    except requests.RequestException as exc:
        print(f'RVC voice conversion request failed: {exc}')
        return
    print(f'Time taken for RVC voice conversion: {time() - start_time}s')

    # Play to both the app mic input (virtual cable) and the speakers.
    threads = [
        Thread(target=play_voice, args=[CABLE_INPUT_ID]),
        Thread(target=play_voice, args=[SPEAKERS_INPUT_ID]),
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
if __name__ == '__main__':
    # The input file is named '.mp3', but on_release_key writes WAV data into
    # it through the wave module.
    INPUT_VOICE_PATH = str(BASE_DIR / 'AniVoiceChanger' / 'audio' / 'input.mp3')
    OUTPUT_VOICE_PATH = str(BASE_DIR / 'AniVoiceChanger' / 'audio' / 'output.wav')
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    p = pyaudio.PyAudio()

    # Fall back to the system default devices when none are configured.
    if MICROPHONE_ID is None:
        MICROPHONE_ID = p.get_default_input_device_info()['index']

    if SPEAKERS_INPUT_ID is None:
        SPEAKERS_INPUT_ID = p.get_default_output_device_info()['index']

    # Locate the virtual audio cable by name among the sounddevice devices.
    CABLE_INPUT_ID = None
    for audio_device in sd.query_devices():
        if 'CABLE Input' in audio_device['name']:
            CABLE_INPUT_ID = audio_device['index']
            break

    # Compare against None explicitly: device index 0 is a valid match and
    # must not be mistaken for "not found".
    if CABLE_INPUT_ID is None:
        print('Virtual audio cable was not found. Please download and install it.')
        sys.exit()

    # get channels and sampling rate of mic
    mic_info = p.get_device_info_by_index(MICROPHONE_ID)
    MIC_CHANNELS = mic_info['maxInputChannels']
    MIC_SAMPLING_RATE = 40000

    print('Voice changer is booting up...')

    # Recording state shared with the on_press_key / on_release_key hooks.
    frames = []
    recording = False
    stream = None

    keyboard.on_press_key(MIC_RECORD_KEY, on_press_key)
    keyboard.on_release_key(MIC_RECORD_KEY, on_release_key)

    try:
        print('Voice changer is ready.')
        while True:
            # Pull audio from the microphone only while the record key is held;
            # otherwise idle without spinning the CPU.
            if recording and stream:
                data = stream.read(CHUNK)
                frames.append(data)
            else:
                sleep(0.2)

    except KeyboardInterrupt:
        print('Closing voice changer...')
    finally:
        # Release the PortAudio resources held by this PyAudio instance.
        p.terminate()
================================================
FILE: main_local.py
================================================
import sys
import wave
from pathlib import Path
from time import sleep, time
from os import getenv
import keyboard
from threading import Thread
BASE_DIR = Path(__file__).resolve().parent.parent
sys.path.append(str(BASE_DIR))
import torch
import pyaudio
import sounddevice as sd
import soundfile as sf
from multiprocessing import cpu_count
from dotenv import load_dotenv
from vc_infer_pipeline import VC
from infer_pack.models import (
SynthesizerTrnMs256NSFsid,
SynthesizerTrnMs256NSFsid_nono,
SynthesizerTrnMs768NSFsid,
SynthesizerTrnMs768NSFsid_nono,
)
from my_utils import load_audio
from fairseq import checkpoint_utils
from scipy.io import wavfile
# load environment variables
load_dotenv()


def _require_env(name):
    """Return the non-empty value of environment variable ``name``.

    Exits the program with a readable message when the variable is unset or
    empty, instead of failing with an unrelated AttributeError/TypeError
    (e.g. calling ``.endswith`` or ``int()`` on ``None``).
    """
    value = getenv(name)
    if not value:
        sys.exit(f'Environment variable {name} is missing or empty. Please set it in your .env file.')
    return value


# Model name is used without its '.pth' extension.
MODEL_NAME = _require_env('MODEL_NAME')
if MODEL_NAME.endswith('.pth'):
    MODEL_NAME = MODEL_NAME[:-4]

PITCH_CHANGE = int(_require_env('PITCH_CHANGE'))
VOLUME_ENVELOPE = float(_require_env('VOLUME_ENVELOPE'))
# Optional: falls back to 0 when unset or empty.
INDEX_RATE = float(getenv('INDEX_RATE')) if getenv('INDEX_RATE') else 0
PITCH_EXTRACTION_ALGO = getenv('PITCH_EXTRACTION_ALGO')
GPU_INDEX = getenv('GPU_INDEX')
MIC_RECORD_KEY = getenv('MIC_RECORD_KEY')
INGAME_PUSH_TO_TALK_KEY = getenv('INGAME_PUSH_TO_TALK_KEY')

# Optional device indices; None when unset or empty.
MICROPHONE_ID = int(getenv('MICROPHONE_ID')) if getenv('MICROPHONE_ID') else None
SPEAKERS_INPUT_ID = int(getenv('SPEAKERS_INPUT_ID')) if getenv('SPEAKERS_INPUT_ID') else None
class Config:
    """Device and precision settings for voice conversion.

    Attributes:
        device: torch device string such as "cuda:0"; replaced by "mps" or
            "cpu" when CUDA is not available.
        is_half: whether half precision is used; forced to False on the GPU
            models listed in device_config() and on the CPU fallback.
        n_cpu: number of CPUs reported by multiprocessing.cpu_count().
        gpu_name: CUDA device name when it matches the forced-single-precision
            list, otherwise None.
        gpu_mem: CUDA device memory in GiB (offset by 0.4 and truncated), or
            None without CUDA.
        x_pad, x_query, x_center, x_max: the four values returned by
            device_config().
    """

    def __init__(self, device, is_half):
        self.device = device
        self.is_half = is_half
        self.n_cpu = 0
        self.gpu_name = None
        self.gpu_mem = None
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    @staticmethod
    def _replace_in_file(path, old, new):
        """Rewrite the text file at ``path`` with every ``old`` replaced by ``new``."""
        with open(path, "r") as f:
            strr = f.read().replace(old, new)
        with open(path, "w") as f:
            f.write(strr)

    def device_config(self) -> tuple:
        """Inspect the available hardware and choose precision and x_* values.

        Side effects: may update ``device``, ``is_half``, ``gpu_name``,
        ``gpu_mem`` and ``n_cpu``. On the listed GPU models, and on GPUs with
        at most 4 GiB, it also rewrites files under ``configs/`` and
        ``trainset_preprocess_pipeline_print.py``, addressed relative to the
        current working directory.

        Returns:
            The tuple ``(x_pad, x_query, x_center, x_max)``.
        """
        if torch.cuda.is_available():
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)

            if (
                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
                or "P40" in self.gpu_name.upper()
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                print("16 series/10 series P40 forced single precision")
                self.is_half = False
                for config_file in ["32k.json", "40k.json", "48k.json"]:
                    self._replace_in_file(f"configs/{config_file}", "true", "false")
                self._replace_in_file("trainset_preprocess_pipeline_print.py", "3.7", "3.0")
            else:
                self.gpu_name = None

            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
            if self.gpu_mem <= 4:
                self._replace_in_file("trainset_preprocess_pipeline_print.py", "3.7", "3.0")

        elif torch.backends.mps.is_available():
            print("No supported N-card found, use MPS for inference")
            self.device = "mps"
        else:
            print("No supported N-card found, use CPU for inference")
            self.device = "cpu"
            # Fix: the CPU fallback used to force half precision on. Half
            # precision is poorly supported for CPU inference, so fall back
            # to single precision here.
            self.is_half = False

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()

        if self.is_half:
            # 6G memory config
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # 5G memory config
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        # Smallest settings for GPUs with at most 4 GiB of memory.
        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        return x_pad, x_query, x_center, x_max
def load_hubert():
    """Load the HuBERT model from ``hubert_base.pt`` and prepare it for inference.

    The checkpoint path is relative, so it is resolved against the current
    working directory. The model is moved to ``config.device`` and cast
    according to ``config.is_half`` - the values ``Config`` settled on - rather
    than the originally requested module-level ``device``/``is_half``, which
    ``Config`` may have overridden (CPU/MPS fallback, forced single precision).

    Returns:
        The first model of the loaded ensemble, in eval mode.
    """
    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(['hubert_base.pt'], suffix='')
    hubert = models[0].to(config.device)
    hubert = hubert.half() if config.is_half else hubert.float()
    hubert.eval()
    return hubert
def get_vc():
    """Load the voice model ``weights/<MODEL_NAME>.pth`` and build the VC pipeline.

    Device and precision come from ``config`` (the same object handed to
    ``VC``), so the synthesizer always matches the pipeline even when
    ``Config`` overrode the requested device or forced single precision.

    Returns:
        Tuple ``(cpt, version, net_g, tgt_sr, vc)``: the raw checkpoint dict,
        its version string, the synthesizer in eval mode, the value stored
        last in ``cpt["config"]`` (passed on as the output sample rate), and
        the ``VC`` instance.

    Raises:
        FileNotFoundError: if the model file does not exist.
        ValueError: if the checkpoint declares a version other than v1/v2.
    """
    model_path = BASE_DIR / 'weights' / f'{MODEL_NAME}.pth'
    if not model_path.exists():
        raise FileNotFoundError(
            f'The model {model_path} does not exist. Please ensure that you have filled in the proper MODEL_NAME in your .env file.'
        )

    model_path = str(model_path)
    print(f'loading pth {model_path}')
    # NOTE: torch.load unpickles the file; only load model files you trust.
    cpt = torch.load(model_path, map_location='cpu')
    tgt_sr = cpt["config"][-1]
    # Overwrite the third-from-last config entry with the first dimension of
    # the embedding table stored in the weights.
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    if_f0 = cpt.get("f0", 1)
    version = cpt.get("version", "v1")
    use_half = config.is_half

    if version == "v1":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=use_half)
        else:
            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    elif version == "v2":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=use_half)
        else:
            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
    else:
        # Without this branch net_g would be undefined and fail with NameError below.
        raise ValueError(f'Unsupported model version {version!r} in {model_path}.')

    # NOTE(review): enc_q is presumably only needed for training - confirm.
    del net_g.enc_q
    # strict=False tolerates key mismatches; the printed result lists them.
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(config.device)
    net_g = net_g.half() if use_half else net_g.float()

    vc = VC(tgt_sr, config)
    return cpt, version, net_g, tgt_sr, vc
def rvc_infer():
    """Convert the recording at INPUT_VOICE_PATH and write it to OUTPUT_VOICE_PATH.

    Uses the first ``.index`` file found in ``logs/<MODEL_NAME>``. When that
    directory is missing or holds no index file, an empty index path is passed
    to the pipeline instead of failing.
    """
    logs_dir = BASE_DIR / 'logs' / MODEL_NAME
    index_path = ''
    # The index is optional: a model without a logs directory must not crash here.
    if logs_dir.is_dir():
        index_path = next(
            (str(entry) for entry in logs_dir.iterdir() if entry.suffix == '.index'),
            '',
        )

    # vc single
    audio = load_audio(INPUT_VOICE_PATH, 16000)
    times = [0, 0, 0]
    if_f0 = cpt.get('f0', 1)
    # Positional arguments are kept exactly as before; the meaning of the bare
    # literals (0, 3, 0, 0.33) is defined by VC.pipeline - TODO confirm.
    audio_opt = vc.pipeline(
        hubert_model,
        net_g,
        0,
        audio,
        INPUT_VOICE_PATH,
        times,
        PITCH_CHANGE,
        PITCH_EXTRACTION_ALGO,
        index_path,
        INDEX_RATE,
        if_f0,
        3,
        tgt_sr,
        0,
        VOLUME_ENVELOPE,
        version,
        0.33,
        f0_file=None,
    )
    wavfile.write(OUTPUT_VOICE_PATH, tgt_sr, audio_opt)
def play_voice(device_id):
    """Play the converted audio file on the given output device and block until done.

    When INGAME_PUSH_TO_TALK_KEY is set, that key is held down for the duration
    of the playback and is released even if playback raises.

    Args:
        device_id: Output device passed to ``sd.play``.
    """
    data, fs = sf.read(OUTPUT_VOICE_PATH, dtype='float32')

    if INGAME_PUSH_TO_TALK_KEY:
        keyboard.press(INGAME_PUSH_TO_TALK_KEY)
    try:
        sd.play(data, fs, device=device_id)
        sd.wait()
    finally:
        # Never leave the push-to-talk key stuck down after a playback error.
        if INGAME_PUSH_TO_TALK_KEY:
            keyboard.release(INGAME_PUSH_TO_TALK_KEY)
def on_press_key(_):
    """Start a new recording when the record key is pressed.

    Does nothing while a recording is already in progress. Otherwise it clears
    the collected frames, flags the recording as active and opens the
    microphone input stream that the main loop reads from.
    """
    global frames, recording, stream

    if recording:
        return

    print('\nRecording has started.')
    frames = []
    recording = True

    mic_settings = {
        'format': FORMAT,
        'channels': MIC_CHANNELS,
        'rate': MIC_SAMPLING_RATE,
        'input': True,
        'frames_per_buffer': CHUNK,
        'input_device_index': MICROPHONE_ID,
    }
    stream = p.open(**mic_settings)
def on_release_key(_):
    """Stop recording, convert the captured audio and play the result.

    Steps: close the microphone stream, write the collected frames to
    INPUT_VOICE_PATH as WAV data, run ``rvc_infer`` and then play the output on
    both the virtual cable and the speakers at the same time.
    """
    global recording, stream

    recording = False
    if stream is None:
        # Release without a successfully opened stream: nothing to finalize.
        return

    stream.stop_stream()
    stream.close()
    stream = None

    # if key not held down for long enough
    if len(frames) < 20:
        print('No audio file to transcribe detected. Hold down the key for a longer time.')
        return

    print('Converting voice...')
    start_time = time()

    # write microphone audio to file; the context manager closes it on errors too
    with wave.open(str(INPUT_VOICE_PATH), 'wb') as wf:
        wf.setnchannels(MIC_CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(MIC_SAMPLING_RATE)
        wf.writeframes(b''.join(frames))

    # voice change
    rvc_infer()
    print(f'Time taken for RVC voice conversion: {time() - start_time}s')

    # play to both app mic input and speakers
    threads = [
        Thread(target=play_voice, args=[CABLE_INPUT_ID]),
        Thread(target=play_voice, args=[SPEAKERS_INPUT_ID]),
    ]
    for playback in threads:
        playback.start()
    for playback in threads:
        playback.join()
if __name__ == '__main__':
    # Requested device/precision. Config may override both (CPU/MPS fallback,
    # forced single precision), so keep the module-level values in sync with
    # what it settled on.
    device = f'cuda:{GPU_INDEX}'
    is_half = True
    config = Config(device, is_half)
    device, is_half = config.device, config.is_half

    # NOTE(review): the input file is written with the wave module, so it holds
    # WAV data despite the '.mp3' extension.
    INPUT_VOICE_PATH = str(BASE_DIR / 'AniVoiceChanger' / 'audio' / 'input.mp3')
    OUTPUT_VOICE_PATH = str(BASE_DIR / 'AniVoiceChanger' / 'audio' / 'output.wav')

    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    p = pyaudio.PyAudio()
    try:
        if MICROPHONE_ID is None:
            MICROPHONE_ID = p.get_default_input_device_info()['index']

        if SPEAKERS_INPUT_ID is None:
            SPEAKERS_INPUT_ID = p.get_default_output_device_info()['index']

        CABLE_INPUT_ID = None
        for audio_device in sd.query_devices():
            if 'CABLE Input' in audio_device['name']:
                CABLE_INPUT_ID = audio_device['index']
                break

        # Compare against None: device index 0 is a valid (but falsy) match.
        if CABLE_INPUT_ID is None:
            print('Virtual audio cable was not found. Please download and install it.')
            sys.exit(1)

        # get channel count of mic; the sampling rate is fixed
        mic_info = p.get_device_info_by_index(MICROPHONE_ID)
        MIC_CHANNELS = mic_info['maxInputChannels']
        MIC_SAMPLING_RATE = 40000

        print('Voice changer is booting up...')
        # load hubert model
        hubert_model = load_hubert()

        # get vc
        cpt, version, net_g, tgt_sr, vc = get_vc()

        frames = []
        recording = False
        stream = None

        keyboard.on_press_key(MIC_RECORD_KEY, on_press_key)
        keyboard.on_release_key(MIC_RECORD_KEY, on_release_key)

        try:
            print('Voice changer is ready.')
            while True:
                # Read the global once: the release handler sets it to None and
                # closes the stream, possibly between the check and the read.
                active_stream = stream
                if recording and active_stream is not None:
                    try:
                        frames.append(active_stream.read(CHUNK))
                    except OSError:
                        # A read that fails because the handler just closed the
                        # stream is expected; anything else is a real error.
                        if recording:
                            raise
                else:
                    sleep(0.2)

        except KeyboardInterrupt:
            print('Closing voice changer...')
    finally:
        # Release PortAudio resources on every exit path.
        p.terminate()
gitextract_wa9osrpx/ ├── .gitattributes ├── .gitignore ├── AniVoiceChanger_colab.ipynb ├── LICENSE ├── README.md ├── audio/ │ └── AUDIO.txt ├── extra_requirements.txt ├── get_audio_devices.py ├── main_colab.py └── main_local.py
SYMBOL INDEX (13 symbols across 2 files)
FILE: main_colab.py
function rvc_infer_colab (line 38) | def rvc_infer_colab():
function play_voice (line 49) | def play_voice(device_id):
function on_press_key (line 62) | def on_press_key(_):
function on_release_key (line 76) | def on_release_key(_):
FILE: main_local.py
class Config (line 48) | class Config:
method __init__ (line 49) | def __init__(self, device, is_half):
method device_config (line 57) | def device_config(self) -> tuple:
function load_hubert (line 126) | def load_hubert():
function get_vc (line 140) | def get_vc():
function rvc_infer (line 178) | def rvc_infer():
function play_voice (line 194) | def play_voice(device_id):
function on_press_key (line 207) | def on_press_key(_):
function on_release_key (line 221) | def on_release_key(_):
Condensed preview — 10 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (47K chars).
[
{
"path": ".gitattributes",
"chars": 66,
"preview": "# Auto detect text files and perform LF normalization\n* text=auto\n"
},
{
"path": ".gitignore",
"chars": 29,
"preview": ".idea\naudio/*.mp3\naudio/*.wav"
},
{
"path": "AniVoiceChanger_colab.ipynb",
"chars": 21872,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"source\": [\n \"[ 2023 SociallyIneptWeeb\n\nPermission is hereby granted, free of charge, to any person obtaining"
},
{
"path": "README.md",
"chars": 5996,
"preview": "# AniVoiceChanger\n\n\n\nAn \"extension\" for Retrieval-based Voice Conversion WebUI. Provides a wa"
},
{
"path": "audio/AUDIO.txt",
"chars": 57,
"preview": "Input and Output audio files are created and stored here."
},
{
"path": "extra_requirements.txt",
"chars": 30,
"preview": "keyboard\npyaudio\npython-dotenv"
},
{
"path": "get_audio_devices.py",
"chars": 145,
"preview": "import sounddevice as sd\n\n\nif __name__ == '__main__':\n for device in sd.query_devices():\n print(f\"{device['ind"
},
{
"path": "main_colab.py",
"chars": 4760,
"preview": "import sys\nimport wave\nfrom pathlib import Path\nfrom time import sleep, time\nfrom os import getenv\nfrom urllib.parse imp"
},
{
"path": "main_local.py",
"chars": 9819,
"preview": "import sys\nimport wave\nfrom pathlib import Path\nfrom time import sleep, time\nfrom os import getenv\n\nimport keyboard\nfrom"
}
]
About this extraction
This page contains the full source code of the SociallyIneptWeeb/AniVoiceChanger GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 10 files (42.8 KB), approximately 12.3k tokens, and a symbol index with 13 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.