Repository: SociallyIneptWeeb/AniVoiceChanger
Branch: main
Commit: fc16d0c18efd
Files: 10
Total size: 42.8 KB

Directory structure:
gitextract_wa9osrpx/

├── .gitattributes
├── .gitignore
├── AniVoiceChanger_colab.ipynb
├── LICENSE
├── README.md
├── audio/
│   └── AUDIO.txt
├── extra_requirements.txt
├── get_audio_devices.py
├── main_colab.py
└── main_local.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitattributes
================================================
# Auto detect text files and perform LF normalization
* text=auto


================================================
FILE: .gitignore
================================================
.idea
audio/*.mp3
audio/*.wav

================================================
FILE: AniVoiceChanger_colab.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/SociallyIneptWeeb/AniVoiceChanger/blob/main/AniVoiceChanger_colab.ipynb)"
      ],
      "metadata": {
        "id": "wEsZxTheMO8_"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "jwu07JgqoFON",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title Mount Drive\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ge_97mfpgqTm",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title Clone repository\n",
        "!git init\n",
        "!git remote add origin https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git\n",
        "!git fetch origin 195a14e5c51ec02774c4d1961d0a4b67755e25c8 --depth=1\n",
        "!git reset --hard FETCH_HEAD"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Set Mode and Parameters\n",
        "#@markdown ## Mode\n",
        "#@markdown To run the WebUI for training a voice model, set the mode to Training.\n",
        "\n",
        "#@markdown To run the voice changer server for main_colab.py to connect to, set the mode to Inference.\n",
        "MODE = 'Training' #@param ['Training', 'Inference']\n",
        "\n",
        "#@markdown If MODE: Training, specify the path to a zip file containing the voice clips in your google drive to be used for training. If MODE: Inference, ignore.\n",
        "DATASET = 'char_voice_lines.zip' #@param {type:\"string\"}\n",
        "DATASET = '/content/drive/MyDrive/' + DATASET\n",
        "\n",
        "if MODE == 'Training':\n",
        "  !mkdir -p dataset\n",
        "  !unzip -d dataset -B {DATASET}\n",
        "  # rename duplicate filenames in dataset\n",
        "  !ls -a /content/dataset/\n",
        "  !rename 's/(\\w+)\\.(\\w+)~(\\d*)/$1_$3.$2/' /content/dataset/*.*~*\n",
        "\n",
        "#@markdown ## Upload a trained model\n",
        "#@markdown Only fill the below fields if you would like to continue training a previously trained model or use it for inference. Specify the name and epoch number of the model to be used or trained. The folder containing the trained files in your google drive will be used.\n",
        "\n",
        "MODELNAME = \"\"  #@param {type:\"string\"}\n",
        "MODELEPOCH = 2333333  #@param {type:\"integer\"}\n",
        "if MODELNAME:\n",
        "  !mkdir -p /content/logs/{MODELNAME}\n",
        "  !cp /content/drive/MyDrive/{MODELNAME}_files/*.index /content/logs/{MODELNAME}/\n",
        "  !cp /content/drive/MyDrive/{MODELNAME}_files/{MODELNAME}.pth /content/weights/\n",
        "  if MODE == 'Training':\n",
        "    !cp /content/drive/MyDrive/{MODELNAME}_files/D_{MODELEPOCH}.pth /content/logs/{MODELNAME}/\n",
        "    !cp /content/drive/MyDrive/{MODELNAME}_files/G_{MODELEPOCH}.pth /content/logs/{MODELNAME}/\n",
        "    !cp /content/drive/MyDrive/{MODELNAME}_files/*.npy /content/logs/{MODELNAME}/"
      ],
      "metadata": {
        "id": "S8wlJabmgeBK",
        "cellView": "form"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "pqE0PrnuRqI2",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title Install requirements\n",
        "!pip install -r requirements.txt\n",
        "!apt -y install -qq aria2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UG3XpUwEomUz",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title Download pretrained models\n",
        "\n",
        "# v1\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d /content/pretrained -o D32k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d /content/pretrained -o D40k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d /content/pretrained -o D48k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d /content/pretrained -o G32k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d /content/pretrained -o G40k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d /content/pretrained -o G48k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth -d /content/pretrained -o f0D32k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth -d /content/pretrained -o f0D40k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth -d /content/pretrained -o f0D48k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth -d /content/pretrained -o f0G32k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth -d /content/pretrained -o f0G40k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth -d /content/pretrained -o f0G48k.pth\n",
        "\n",
        "# v2\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d /content/pretrained_v2 -o D40k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d /content/pretrained_v2 -o G40k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d /content/pretrained_v2 -o f0D40k.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d /content/pretrained_v2 -o f0G40k.pth"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "HugjmZqZRuiF",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title Download Vocal Separation model\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d /content/uvr5_weights -o HP2-人声vocals+非人声instrumentals.pth\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d /content/uvr5_weights -o HP5-主旋律人声vocals+其他instrumentals.pth"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2RCaT9FTR0ej",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title Download hubert_base model\n",
        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d /content -o hubert_base.pt"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "7vh6vphDwO0b",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title Run WebUI for Training, skip if Inference\n",
        "if MODE == 'Training':\n",
        "  # %load_ext tensorboard\n",
        "  # %tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs\n",
        "  !python3 infer-web.py --colab --pycmd python3"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "FgJuNeAwx5Y_",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title Manually back up the trained model files to Google Drive for Mode: \"Training\"\n",
        "#@markdown MODELNAME should be the EXPERIMENT_NAME that you typed.\n",
        "\n",
        "#@markdown If the name of the model is john, and in the logs/john folder is a file called D_2333333.pth, set the MODELNAME: john and MODELEPOCH: 2333333\n",
        "\n",
        "if MODE == 'Training':\n",
        "\n",
        "#@markdown Model name\n",
        "  MODELNAME = \"\"  #@param {type:\"string\"}\n",
        "  if MODELNAME:\n",
        "#@markdown Epoch number\n",
        "    MODELEPOCH = 2333333  #@param {type:\"integer\"}\n",
        "#@markdown Save intermediate models if you would like to continue training the model later\n",
        "    SAVE_INTERMEDIATE = True  #@param {type:\"boolean\"}\n",
        "\n",
        "    !mkdir -p /content/drive/MyDrive/{MODELNAME}_files\n",
        "\n",
        "    if SAVE_INTERMEDIATE:\n",
        "      !cp /content/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_files/\n",
        "      !cp /content/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_files/\n",
        "      !cp /content/logs/{MODELNAME}/total_*.npy /content/drive/MyDrive/{MODELNAME}_files/\n",
        "\n",
        "    !cp /content/logs/{MODELNAME}/added_*.index /content/drive/MyDrive/{MODELNAME}_files/\n",
        "    !cp /content/weights/{MODELNAME}.pth /content/drive/MyDrive/{MODELNAME}_files/"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Inference\n",
        "\n",
        "The code below is to be run for Mode: Inference\n",
        "\n",
        "When prompted `Proceed (Y/n)?`, click beside it, type `Y` and press `Enter`.\n",
        "\n",
        "If `WARNING: The following packages were previously imported in this runtime: [numpy] You must restart the runtime in order to use newly installed versions.` is seen in the output, click `Restart Runtime` and then continue running the next cell."
      ],
      "metadata": {
        "id": "0JTjkTndoHb6"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Install specific numpy version. If needed, click Restart Runtime before running the bottom two cells.\n",
        "if MODE == 'Inference':\n",
        "  !pip uninstall numpy\n",
        "  !pip install numpy==1.23.5"
      ],
      "metadata": {
        "id": "cfJV7kbKqtnw",
        "cellView": "form"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "try:\n",
        "  MODE\n",
        "except NameError:\n",
        "  MODE = 'Inference'\n",
        "\n",
        "#@title Set your NGROK_AUTH_TOKEN.\n",
        "if MODE == 'Inference':\n",
        "  !pip install flask-ngrok3 -q\n",
        "\n",
        "#@markdown Obtain your Ngrok auth token from [here](https://dashboard.ngrok.com/get-started/your-authtoken)\n",
        "  NGROK_AUTH_TOKEN = '' #@param {type:\"string\"}"
      ],
      "metadata": {
        "id": "OL-1YvcbOiWd",
        "cellView": "form"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "if MODE != 'Inference':\n",
        "  raise Exception('Mode is not set to Inference.')\n",
        "\n",
        "\n",
        "#@title Run RVC Inference server\n",
        "import json\n",
        "import sys\n",
        "import wave\n",
        "from pathlib import Path\n",
        "\n",
        "BASE_DIR = Path('/content')\n",
        "sys.path.append(str(BASE_DIR))\n",
        "\n",
        "import torch\n",
        "from multiprocessing import cpu_count\n",
        "from flask_ngrok3 import run_with_ngrok\n",
        "from flask import Flask, request, send_file\n",
        "\n",
        "from vc_infer_pipeline import VC\n",
        "from infer_pack.models import (\n",
        "    SynthesizerTrnMs256NSFsid,\n",
        "    SynthesizerTrnMs256NSFsid_nono,\n",
        "    SynthesizerTrnMs768NSFsid,\n",
        "    SynthesizerTrnMs768NSFsid_nono,\n",
        ")\n",
        "from my_utils import load_audio\n",
        "from fairseq import checkpoint_utils\n",
        "from scipy.io import wavfile\n",
        "\n",
        "\n",
        "INPUT_VOICE_PATH = 'input.mp3'\n",
        "OUTPUT_VOICE_PATH = 'output.wav'\n",
        "MODEL_NAME = ''\n",
        "DEVICE = 'cuda:0'\n",
        "cpt = None\n",
        "\n",
        "\n",
        "class Config:\n",
        "  def __init__(self, device, is_half):\n",
        "    self.device = device\n",
        "    self.is_half = is_half\n",
        "    self.n_cpu = 0\n",
        "    self.gpu_name = None\n",
        "    self.gpu_mem = None\n",
        "    self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()\n",
        "\n",
        "  def device_config(self) -> tuple:\n",
        "    if torch.cuda.is_available():\n",
        "      i_device = int(self.device.split(\":\")[-1])\n",
        "      self.gpu_name = torch.cuda.get_device_name(i_device)\n",
        "\n",
        "      if (\n",
        "        (\"16\" in self.gpu_name and \"V100\" not in self.gpu_name.upper())\n",
        "        or \"P40\" in self.gpu_name.upper()\n",
        "        or \"1060\" in self.gpu_name\n",
        "        or \"1070\" in self.gpu_name\n",
        "        or \"1080\" in self.gpu_name\n",
        "      ):\n",
        "        print(\"16 series/10 series P40 forced single precision\")\n",
        "        self.is_half = False\n",
        "        for config_file in [\"32k.json\", \"40k.json\", \"48k.json\"]:\n",
        "          with open(f\"configs/{config_file}\", \"r\") as f:\n",
        "            strr = f.read().replace(\"true\", \"false\")\n",
        "          with open(f\"configs/{config_file}\", \"w\") as f:\n",
        "            f.write(strr)\n",
        "        with open(\"trainset_preprocess_pipeline_print.py\", \"r\") as f:\n",
        "          strr = f.read().replace(\"3.7\", \"3.0\")\n",
        "        with open(\"trainset_preprocess_pipeline_print.py\", \"w\") as f:\n",
        "          f.write(strr)\n",
        "      else:\n",
        "        self.gpu_name = None\n",
        "\n",
        "      self.gpu_mem = int(\n",
        "        torch.cuda.get_device_properties(i_device).total_memory\n",
        "        / 1024\n",
        "        / 1024\n",
        "        / 1024\n",
        "        + 0.4\n",
        "      )\n",
        "      if self.gpu_mem <= 4:\n",
        "        with open(\"trainset_preprocess_pipeline_print.py\", \"r\") as f:\n",
        "          strr = f.read().replace(\"3.7\", \"3.0\")\n",
        "        with open(\"trainset_preprocess_pipeline_print.py\", \"w\") as f:\n",
        "          f.write(strr)\n",
        "\n",
        "    elif torch.backends.mps.is_available():\n",
        "      print(\"No supported N-card found, use MPS for inference\")\n",
        "      self.device = \"mps\"\n",
        "    else:\n",
        "      print(\"No supported N-card found, use CPU for inference\")\n",
        "      self.device = \"cpu\"\n",
        "      self.is_half = True\n",
        "\n",
        "    if self.n_cpu == 0:\n",
        "      self.n_cpu = cpu_count()\n",
        "\n",
        "    if self.is_half:\n",
        "      # 6G memory config\n",
        "      x_pad = 3\n",
        "      x_query = 10\n",
        "      x_center = 60\n",
        "      x_max = 65\n",
        "    else:\n",
        "      # 5G memory config\n",
        "      x_pad = 1\n",
        "      x_query = 6\n",
        "      x_center = 38\n",
        "      x_max = 41\n",
        "\n",
        "    if self.gpu_mem != None and self.gpu_mem <= 4:\n",
        "      x_pad = 1\n",
        "      x_query = 5\n",
        "      x_center = 30\n",
        "      x_max = 32\n",
        "\n",
        "    return x_pad, x_query, x_center, x_max\n",
        "\n",
        "\n",
        "CONFIG = Config(DEVICE, True)\n",
        "\n",
        "\n",
        "def load_hubert():\n",
        "  models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(['hubert_base.pt'], suffix='', )\n",
        "  hubert = models[0]\n",
        "  hubert = hubert.to(DEVICE)\n",
        "\n",
        "  if True:\n",
        "    hubert = hubert.half()\n",
        "  else:\n",
        "    hubert = hubert.float()\n",
        "\n",
        "  hubert.eval()\n",
        "  return hubert\n",
        "\n",
        "HUBERT_MODEL = load_hubert()\n",
        "\n",
        "\n",
        "def get_vc(device, is_half, config):\n",
        "  global cpt, version, net_g, tgt_sr, vc\n",
        "  model_path = BASE_DIR / 'weights' / f'{MODEL_NAME}.pth'\n",
        "  if not model_path.exists():\n",
        "    print(f'The model {model_path} does not exist. Please ensure that you have filled in the proper MODEL_NAME in your .env file.')\n",
        "    return None\n",
        "\n",
        "  model_path = str(model_path)\n",
        "  print(f'loading pth {model_path}')\n",
        "  cpt = torch.load(model_path, map_location='cpu')\n",
        "  tgt_sr = cpt[\"config\"][-1]\n",
        "  cpt[\"config\"][-3] = cpt[\"weight\"][\"emb_g.weight\"].shape[0]\n",
        "  if_f0 = cpt.get(\"f0\", 1)\n",
        "  version = cpt.get(\"version\", \"v1\")\n",
        "\n",
        "  if version == \"v1\":\n",
        "    if if_f0 == 1:\n",
        "      net_g = SynthesizerTrnMs256NSFsid(*cpt[\"config\"], is_half=is_half)\n",
        "    else:\n",
        "      net_g = SynthesizerTrnMs256NSFsid_nono(*cpt[\"config\"])\n",
        "  elif version == \"v2\":\n",
        "    if if_f0 == 1:\n",
        "      net_g = SynthesizerTrnMs768NSFsid(*cpt[\"config\"], is_half=is_half)\n",
        "    else:\n",
        "      net_g = SynthesizerTrnMs768NSFsid_nono(*cpt[\"config\"])\n",
        "\n",
        "  del net_g.enc_q\n",
        "  print(net_g.load_state_dict(cpt[\"weight\"], strict=False))\n",
        "  net_g.eval().to(device)\n",
        "\n",
        "  if is_half:\n",
        "    net_g = net_g.half()\n",
        "  else:\n",
        "    net_g = net_g.float()\n",
        "\n",
        "  vc = VC(tgt_sr, config)\n",
        "\n",
        "\n",
        "def rvc_infer(pitch_change, pitch_extraction_algo, volume_envelope, index_rate):\n",
        "  logs_dir = BASE_DIR / 'logs' / MODEL_NAME\n",
        "  index_path = ''\n",
        "  for file in logs_dir.iterdir():\n",
        "    if file.suffix == '.index':\n",
        "      index_path = str(logs_dir / file.name)\n",
        "      break\n",
        "\n",
        "  # vc single\n",
        "  audio = load_audio(INPUT_VOICE_PATH, 16000)\n",
        "  times = [0, 0, 0]\n",
        "  if_f0 = cpt.get('f0', 1)\n",
        "  audio_opt = vc.pipeline(HUBERT_MODEL, net_g, 0, audio, INPUT_VOICE_PATH, times, pitch_change, pitch_extraction_algo, index_path, index_rate, if_f0, 3, tgt_sr, 0, volume_envelope, version, 0.33, f0_file=None)\n",
        "  wavfile.write(OUTPUT_VOICE_PATH, tgt_sr, audio_opt)\n",
        "\n",
        "\n",
        "app = Flask(__name__)\n",
        "run_with_ngrok(app, auth_token=NGROK_AUTH_TOKEN)\n",
        "\n",
        "@app.route('/', methods=['GET'])\n",
        "def test():\n",
        "  response = {'status':'OK','message':'Test'}\n",
        "  return json.dumps(response)\n",
        "\n",
        "\n",
        "@app.route('/infer', methods=['POST'])\n",
        "def infer():\n",
        "  global MODEL_NAME, cpt\n",
        "  model_name = request.args.get('model')\n",
        "  if MODEL_NAME != model_name:\n",
        "    MODEL_NAME = model_name\n",
        "    if cpt:\n",
        "      del cpt\n",
        "    get_vc(DEVICE, True, CONFIG)\n",
        "\n",
        "  pitch_change = int(request.args.get('pitch'))\n",
        "  pitch_extraction_algo = request.args.get('algo')\n",
        "  volume_envelope = float(request.args.get('volume'))\n",
        "  index_rate = float(request.args.get('index_rate'))\n",
        "  audio_data = request.files['audio_file']\n",
        "  audio_data.save(INPUT_VOICE_PATH)\n",
        "  rvc_infer(pitch_change, pitch_extraction_algo, volume_envelope, index_rate)\n",
        "  return send_file(OUTPUT_VOICE_PATH, mimetype=\"audio/wav\")\n",
        "\n",
        "app.run()"
      ],
      "metadata": {
        "id": "3qcqylOdMDFg",
        "cellView": "form"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "private_outputs": true,
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2023 SociallyIneptWeeb

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# AniVoiceChanger

![](thumbnail.jpg?raw=true)

An "extension" for Retrieval-based Voice Conversion WebUI. Provides a way to record your voice, convert it using a trained voice model, and output it in voice-chat of any application without running the webui.

Showcase: https://www.youtube.com/watch?v=C-PqTbh0LxY

Setup Guide: https://www.youtube.com/watch?v=K4vVW7iA1w8

## Setup

### Prerequisites

#### Install Git

Follow the instructions [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) to install Git on your computer.

#### Install 7-Zip

Download and Install the 7-Zip application from [here](https://www.7-zip.org/download.html).

This is used to extract the zipped RVC WebUI application after it has been downloaded.

#### Install Virtual Audio Cable

Download the VB-CABLE Driver from [here](https://vb-audio.com/Cable/), extract all files, and run the setup program in administrator mode to install it. Reboot after installation.

This is used to pipe the converted voice audio into the audio input of apps.


### Install RVC WebUI

If you haven't installed the RVC WebUI, download the RVC-beta.7z file from [here](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/RVC-beta.7z) and extract it using 7-Zip into a folder of your choosing. It will take around 8GB of space, not including any voice models that you may train later on.


### Clone AniVoiceChanger repository

Within the extracted RVC-beta folder (Should have a bunch of folders and files), open a command line window and run this command to clone this entire repository and install the additional dependencies required for this extension.

```
git clone https://github.com/SociallyIneptWeeb/AniVoiceChanger
runtime\python.exe -m pip install -r AniVoiceChanger\extra_requirements.txt
```

### Filling in your Environment Variables in the .env file

Follow the instructions written in the .env file and fill in the appropriate values. If unsure, please refer to [this section](https://www.youtube.com/watch?v=K4vVW7iA1w8&t=728s) of the setup video.


## Usage

This program assumes that you have already trained a voice model, with the model file in the weights folder. If you have not done so, please refer to [this section](https://www.youtube.com/watch?v=K4vVW7iA1w8&t=457s) of the setup video.

Remember to change the audio input device of the game or application you are using to Cable Output (VB-Audio Virtual Cable).

There are 2 ways to run this program, either locally or using google colab. If you have around 5 GB of GPU VRAM to spare, feel free to run this locally while using the `crepe` pitch extraction algorithm. If you only have around 3 GB of GPU VRAM, you can also run this locally while using the `pm` pitch extraction algorithm. If none of these requirements are met, you should run this using Google Colab.

### Local

To start the program, open a command line window in the extracted RVC-beta folder (Should have a bunch of folders and files) and run this command.

```runtime\python.exe AniVoiceChanger\main_local.py```
> Note that every time a variable is updated in the `.env` file, you will have to rerun this command for the changes to take effect, e.g. when changing the model name.

Now, hold the `MIC_RECORD_KEY` as defined in your `.env` file on your keyboard and speak into your mic. The first time, this might take around 5 seconds to generate and play the voice. For consecutive uses, the time taken will be drastically reduced with caching. The voice will be played into the Cable Output audio device and your speakers as defined in the `.env` file. The generated voice will also be written into [this folder](audio/) as the `output.wav` file.

### Google Colab

Go to [AniVoiceChanger_colab.ipynb](AniVoiceChanger_colab.ipynb) file in Github and click on `Open in Colab` badge. This will open a Colab notebook. Follow the instructions in the notebook to either train a voice model, or run the RVC Inference server.

If you have already uploaded a trained voice model to the Colab runtime and it has started running the inference server, the output of the last cell should display an Ngrok public URL. Copy and paste this URL into the `COLAB_URL` environment variable in your `.env` file. After all your environment variables are properly set, open a command line window in the extracted RVC-beta folder (Should have a bunch of folders and files) and run this command. Note that every time a variable is updated in the `.env` file, you will have to rerun this command for the changes to take effect, e.g. when changing the model name.

```runtime\python.exe AniVoiceChanger\main_colab.py```
> Note that every time a variable is updated in the `.env` file, you will have to rerun this command for the changes to take effect, e.g. when changing the model name.

Now, hold the `MIC_RECORD_KEY` as defined in your `.env` file on your keyboard and speak into your mic. The first time, this might take around 10 seconds to generate and play the voice. For consecutive uses, the time taken will be drastically reduced with caching. The voice will be played into the Cable Output audio device and your speakers as defined in the `.env` file. The generated voice will also be written into [this folder](audio/) as the `output.wav` file.

## Terms of Use

The use of the converted voice for the following purposes is prohibited.

* Criticizing or attacking individuals.

* Advocating for or opposing specific political positions, religions, or ideologies.

* Publicly displaying strongly stimulating expressions without proper zoning.

* Selling of voice models and generated voice clips.

* Impersonation of the original owner of the voice with malicious intentions to harm/hurt others.

* Fraudulent purposes that lead to identity theft or fraudulent phone calls.

## Disclaimer

I am not liable for any direct, indirect, consequential, incidental, or special damages arising out of or in any way connected with the use/misuse or inability to use this software.


================================================
FILE: audio/AUDIO.txt
================================================
Input and Output audio files are created and stored here.

================================================
FILE: extra_requirements.txt
================================================
keyboard
pyaudio
python-dotenv

================================================
FILE: get_audio_devices.py
================================================
import sounddevice as sd


if __name__ == '__main__':
    # Print one line per audio device known to sounddevice, formatted as
    # "<index>: <name>".
    device_lines = (
        f"{info['index']}: {info['name']}" for info in sd.query_devices()
    )
    for device_line in device_lines:
        print(device_line)

================================================
FILE: main_colab.py
================================================
import sys
import wave
from pathlib import Path
from time import sleep, time
from os import getenv
from urllib.parse import urlencode

import keyboard
from threading import Thread

BASE_DIR = Path(__file__).resolve().parent.parent
sys.path.append(str(BASE_DIR))

import pyaudio
import requests
import sounddevice as sd
import soundfile as sf
from dotenv import load_dotenv


# Read the settings from the .env file / process environment.
load_dotenv()


def _env_or(name, convert, fallback):
    """Return ``convert(value)`` for a non-empty env variable, else ``fallback``."""
    raw = getenv(name)
    return convert(raw) if raw else fallback


COLAB_URL = getenv('COLAB_URL')
MODEL_NAME = getenv('MODEL_NAME')
# only the bare model name is sent to the server, so drop a '.pth' extension
if MODEL_NAME.endswith('.pth'):
    MODEL_NAME = MODEL_NAME[:-4]
# required numeric settings: these fail at startup if unset or malformed
PITCH_CHANGE = int(getenv('PITCH_CHANGE'))
VOLUME_ENVELOPE = float(getenv('VOLUME_ENVELOPE'))
# optional settings fall back to a default when unset or empty
INDEX_RATE = _env_or('INDEX_RATE', float, 0)
PITCH_EXTRACTION_ALGO = getenv('PITCH_EXTRACTION_ALGO')
GPU_INDEX = getenv('GPU_INDEX')
MIC_RECORD_KEY = getenv('MIC_RECORD_KEY')
INGAME_PUSH_TO_TALK_KEY = getenv('INGAME_PUSH_TO_TALK_KEY')
MICROPHONE_ID = _env_or('MICROPHONE_ID', int, None)
SPEAKERS_INPUT_ID = _env_or('SPEAKERS_INPUT_ID', int, None)


def rvc_infer_colab():
    """Send the recorded clip to the Colab server and save the converted audio.

    Uploads the file at INPUT_VOICE_PATH to ``{COLAB_URL}/infer`` with the
    conversion settings as query parameters, then writes the response body to
    OUTPUT_VOICE_PATH.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    params_encoded = urlencode({
        'model': MODEL_NAME,
        'pitch': PITCH_CHANGE,
        'algo': PITCH_EXTRACTION_ALGO,
        'volume': VOLUME_ENVELOPE,
        'index_rate': INDEX_RATE,
    })

    with open(INPUT_VOICE_PATH, 'rb') as infile:
        files = {'audio_file': infile}
        r = requests.post(f'{COLAB_URL}/infer?{params_encoded}', files=files)

    # Fail here instead of saving an error page as the output audio file,
    # which would only surface later as a confusing decode error.
    r.raise_for_status()

    with open(OUTPUT_VOICE_PATH, 'wb') as outfile:
        outfile.write(r.content)


def play_voice(device_id):
    """Play the converted audio on one output device and block until it ends.

    If INGAME_PUSH_TO_TALK_KEY is set, that key is held down for the duration
    of the playback.

    Args:
        device_id: Output device, passed to ``sd.play`` as ``device``.
    """
    data, fs = sf.read(OUTPUT_VOICE_PATH, dtype='float32')

    if INGAME_PUSH_TO_TALK_KEY:
        keyboard.press(INGAME_PUSH_TO_TALK_KEY)

    try:
        sd.play(data, fs, device=device_id)
        sd.wait()
    finally:
        # Always let go of the key, even if playback raised, so it is never
        # left stuck in the pressed state.
        if INGAME_PUSH_TO_TALK_KEY:
            keyboard.release(INGAME_PUSH_TO_TALK_KEY)


def on_press_key(_):
    """Start a new microphone recording when the record key is pressed.

    Ignored while a recording is already in progress. Resets ``frames`` and
    opens a fresh PyAudio input stream on MICROPHONE_ID.

    Args:
        _: Keyboard event supplied by the hook (unused).
    """
    global frames, recording, stream
    if recording:
        return

    print('\nRecording has started.')
    frames = []
    # Open the stream before raising the flag: if opening fails, `recording`
    # stays False and a later key press can try again.
    stream = p.open(format=FORMAT,
                    channels=MIC_CHANNELS,
                    rate=MIC_SAMPLING_RATE,
                    input=True,
                    frames_per_buffer=CHUNK,
                    input_device_index=MICROPHONE_ID)
    recording = True


def on_release_key(_):
    """Finish the recording, convert it, and play the result.

    Stops and closes the input stream, writes the captured frames to
    INPUT_VOICE_PATH, runs the voice conversion, then plays the output on the
    virtual cable and the speakers in parallel.

    Args:
        _: Keyboard event supplied by the hook (unused).
    """
    global recording, stream
    recording = False

    # A release can arrive without an open stream (e.g. the press handler
    # failed to open one); there is nothing to close or convert in that case.
    if stream is None:
        return

    stream.stop_stream()
    stream.close()
    stream = None

    # if key not held down for long enough (fewer than 20 captured chunks)
    if len(frames) < 20:
        print('No audio file to transcribe detected. Hold down the key for a longer time.')
        return

    print('Converting voice...')

    start_time = time()
    # write microphone audio to file; the context manager closes it on error too
    with wave.open(str(INPUT_VOICE_PATH), 'wb') as wf:
        wf.setnchannels(MIC_CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(MIC_SAMPLING_RATE)
        wf.writeframes(b''.join(frames))

    # voice change
    rvc_infer_colab()
    print(f'Time taken for RVC voice conversion: {time() - start_time}s')

    # play to both app mic input and speakers
    threads = [
        Thread(target=play_voice, args=[device_id])
        for device_id in (CABLE_INPUT_ID, SPEAKERS_INPUT_ID)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    # NOTE: despite the .mp3 suffix, on_release_key writes the input file with
    # the wave module.
    INPUT_VOICE_PATH = str(BASE_DIR / 'AniVoiceChanger' / 'audio' / 'input.mp3')
    OUTPUT_VOICE_PATH = str(BASE_DIR / 'AniVoiceChanger' / 'audio' / 'output.wav')
    CHUNK = 1024
    FORMAT = pyaudio.paInt16

    p = pyaudio.PyAudio()
    # Fall back to the system default devices when no ID was configured.
    if MICROPHONE_ID is None:
        MICROPHONE_ID = p.get_default_input_device_info()['index']

    if SPEAKERS_INPUT_ID is None:
        SPEAKERS_INPUT_ID = p.get_default_output_device_info()['index']

    # Find the virtual audio cable by name.
    CABLE_INPUT_ID = None
    for audio_device in sd.query_devices():
        if 'CABLE Input' in audio_device['name']:
            CABLE_INPUT_ID = audio_device['index']
            break

    # Compare against None explicitly: device index 0 is a valid match and
    # must not be reported as "not found".
    if CABLE_INPUT_ID is None:
        print('Virtual audio cable was not found. Please download and install it.')
        sys.exit()

    # channel count comes from the mic; the sampling rate is fixed
    mic_info = p.get_device_info_by_index(MICROPHONE_ID)
    MIC_CHANNELS = mic_info['maxInputChannels']
    MIC_SAMPLING_RATE = 40000

    print('Voice changer is booting up...')

    # Recording state shared with the keyboard callbacks.
    frames = []
    recording = False
    stream = None

    keyboard.on_press_key(MIC_RECORD_KEY, on_press_key)
    keyboard.on_release_key(MIC_RECORD_KEY, on_release_key)

    try:
        print('Voice changer is ready.')
        while True:
            if recording and stream:
                # Pull one chunk from the microphone while the key is held.
                data = stream.read(CHUNK)
                frames.append(data)
            else:
                # Idle: sleep instead of busy-waiting for the next key press.
                sleep(0.2)

    except KeyboardInterrupt:
        print('Closing voice changer...')


================================================
FILE: main_local.py
================================================
import sys
import wave
from pathlib import Path
from time import sleep, time
from os import getenv

import keyboard
from threading import Thread

BASE_DIR = Path(__file__).resolve().parent.parent
sys.path.append(str(BASE_DIR))

import torch
import pyaudio
import sounddevice as sd
import soundfile as sf
from multiprocessing import cpu_count
from dotenv import load_dotenv

from vc_infer_pipeline import VC
from infer_pack.models import (
    SynthesizerTrnMs256NSFsid,
    SynthesizerTrnMs256NSFsid_nono,
    SynthesizerTrnMs768NSFsid,
    SynthesizerTrnMs768NSFsid_nono,
)
from my_utils import load_audio
from fairseq import checkpoint_utils
from scipy.io import wavfile


# Read runtime settings from the environment (values from .env are loaded first).
load_dotenv()


def _env_or(name, cast, fallback):
    """Return ``cast(value)`` for a set, non-empty variable, else ``fallback``."""
    raw = getenv(name)
    return cast(raw) if raw else fallback


# The model is referred to by its bare name, so drop a trailing '.pth'.
MODEL_NAME = getenv('MODEL_NAME')
if MODEL_NAME.endswith('.pth'):
    MODEL_NAME = MODEL_NAME[:-len('.pth')]

# Required numeric settings: int()/float() raise if these are missing.
PITCH_CHANGE = int(getenv('PITCH_CHANGE'))
VOLUME_ENVELOPE = float(getenv('VOLUME_ENVELOPE'))

# Optional settings fall back to a default when unset or empty.
INDEX_RATE = _env_or('INDEX_RATE', float, 0)
PITCH_EXTRACTION_ALGO = getenv('PITCH_EXTRACTION_ALGO')
GPU_INDEX = getenv('GPU_INDEX')
MIC_RECORD_KEY = getenv('MIC_RECORD_KEY')
INGAME_PUSH_TO_TALK_KEY = getenv('INGAME_PUSH_TO_TALK_KEY')
MICROPHONE_ID = _env_or('MICROPHONE_ID', int, None)
SPEAKERS_INPUT_ID = _env_or('SPEAKERS_INPUT_ID', int, None)


class Config:
    """Device and precision settings handed to the VC inference pipeline.

    Construction probes the available hardware via ``device_config`` and may
    change ``device`` and ``is_half`` from the values passed in.

    Args:
        device: Requested torch device string, e.g. ``"cuda:0"``.
        is_half: Whether half precision is requested.
    """

    def __init__(self, device, is_half):
        self.device = device
        self.is_half = is_half
        self.n_cpu = 0
        self.gpu_name = None
        self.gpu_mem = None
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    @staticmethod
    def _replace_in_file(path, old, new):
        """Rewrite the text file at *path* with every *old* replaced by *new*."""
        with open(path, "r") as f:
            text = f.read().replace(old, new)
        with open(path, "w") as f:
            f.write(text)

    def device_config(self) -> tuple:
        """Pick device/precision for this machine and return the padding settings.

        Side effects: on some GPUs this rewrites ``configs/*.json`` and
        ``trainset_preprocess_pipeline_print.py`` relative to the current
        working directory.

        Returns:
            tuple: ``(x_pad, x_query, x_center, x_max)``.
        """
        if torch.cuda.is_available():
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            if (
                    ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
                    or "P40" in self.gpu_name.upper()
                    or "1060" in self.gpu_name
                    or "1070" in self.gpu_name
                    or "1080" in self.gpu_name
            ):
                print("16 series/10 series P40 forced single precision")
                self.is_half = False
                for config_file in ["32k.json", "40k.json", "48k.json"]:
                    self._replace_in_file(f"configs/{config_file}", "true", "false")
                self._replace_in_file("trainset_preprocess_pipeline_print.py", "3.7", "3.0")
            else:
                self.gpu_name = None
            # Total device memory in GiB, rounded via the +0.4 offset.
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
            if self.gpu_mem <= 4:
                self._replace_in_file("trainset_preprocess_pipeline_print.py", "3.7", "3.0")
        elif torch.backends.mps.is_available():
            print("No supported N-card found, use MPS for inference")
            self.device = "mps"
        else:
            print("No supported N-card found, use CPU for inference")
            self.device = "cpu"
            # NOTE(review): half precision is kept on for CPU here; confirm
            # that the models actually support it on this backend.
            self.is_half = True

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()

        if self.is_half:
            # 6G memory config
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # 5G memory config
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        # Small-memory GPUs override either of the presets above.
        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        return x_pad, x_query, x_center, x_max


def load_hubert():
    """Load ``hubert_base.pt`` and return it in eval mode.

    The model is moved to the module-level ``device`` and cast to half or
    full precision according to the module-level ``is_half``.
    """
    ensemble, _, _ = checkpoint_utils.load_model_ensemble_and_task(['hubert_base.pt'], suffix='')
    model = ensemble[0].to(device)
    model = model.half() if is_half else model.float()
    model.eval()
    return model


def get_vc():
    """Load the voice model named by MODEL_NAME and build the VC pipeline.

    Returns:
        tuple: ``(cpt, version, net_g, tgt_sr, vc)`` - the raw checkpoint, its
        version string, the synthesizer network, the sample rate read from the
        checkpoint config, and the ``VC`` instance.

    Raises:
        FileNotFoundError: If ``weights/<MODEL_NAME>.pth`` does not exist.
        ValueError: If the checkpoint declares an unsupported version.
    """
    model_path = BASE_DIR / 'weights' / f'{MODEL_NAME}.pth'
    if not model_path.exists():
        raise FileNotFoundError(
            f'The model {model_path} does not exist. Please ensure that you have filled in the proper MODEL_NAME in your .env file.'
        )

    model_path = str(model_path)
    print(f'loading pth {model_path}')
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    cpt = torch.load(model_path, map_location='cpu')
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    if_f0 = cpt.get("f0", 1)
    version = cpt.get("version", "v1")

    # (with-f0 class, without-f0 class) for each supported checkpoint version.
    synthesizers = {
        "v1": (SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono),
        "v2": (SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono),
    }
    if version not in synthesizers:
        # Previously this fell through and failed later with an unbound net_g.
        raise ValueError(f'Unsupported model version {version!r} in {model_path}')

    with_f0, without_f0 = synthesizers[version]
    if if_f0 == 1:
        net_g = with_f0(*cpt["config"], is_half=is_half)
    else:
        net_g = without_f0(*cpt["config"])

    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(device)

    if is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()

    vc = VC(tgt_sr, config)
    return cpt, version, net_g, tgt_sr, vc


def rvc_infer():
    """Convert the recorded clip with the local model and write the result.

    Loads INPUT_VOICE_PATH at 16 kHz, runs it through ``vc.pipeline`` and
    writes the output to OUTPUT_VOICE_PATH at ``tgt_sr``. The first ``.index``
    file found under ``logs/<MODEL_NAME>`` is passed along when present.
    """
    logs_dir = BASE_DIR / 'logs' / MODEL_NAME
    # The index is optional: use '' when the logs folder or an index file is
    # missing instead of crashing on iterdir().
    index_path = ''
    if logs_dir.is_dir():
        index_path = next((str(file) for file in logs_dir.iterdir() if file.suffix == '.index'), '')

    # vc single
    audio = load_audio(INPUT_VOICE_PATH, 16000)
    times = [0, 0, 0]
    if_f0 = cpt.get('f0', 1)
    # Positional arguments follow VC.pipeline's signature (defined outside this file).
    audio_opt = vc.pipeline(hubert_model, net_g, 0, audio, INPUT_VOICE_PATH, times, PITCH_CHANGE, PITCH_EXTRACTION_ALGO, index_path, INDEX_RATE, if_f0, 3, tgt_sr, 0, VOLUME_ENVELOPE, version, 0.33, f0_file=None)
    wavfile.write(OUTPUT_VOICE_PATH, tgt_sr, audio_opt)


def play_voice(device_id):
    """Play the converted audio on one output device and block until it ends.

    If INGAME_PUSH_TO_TALK_KEY is set, that key is held down for the duration
    of the playback.

    Args:
        device_id: Output device, passed to ``sd.play`` as ``device``.
    """
    data, fs = sf.read(OUTPUT_VOICE_PATH, dtype='float32')

    if INGAME_PUSH_TO_TALK_KEY:
        keyboard.press(INGAME_PUSH_TO_TALK_KEY)

    try:
        sd.play(data, fs, device=device_id)
        sd.wait()
    finally:
        # Always let go of the key, even if playback raised, so it is never
        # left stuck in the pressed state.
        if INGAME_PUSH_TO_TALK_KEY:
            keyboard.release(INGAME_PUSH_TO_TALK_KEY)


def on_press_key(_):
    """Start a new microphone recording when the record key is pressed.

    Ignored while a recording is already in progress. Resets ``frames`` and
    opens a fresh PyAudio input stream on MICROPHONE_ID.

    Args:
        _: Keyboard event supplied by the hook (unused).
    """
    global frames, recording, stream
    if recording:
        return

    print('\nRecording has started.')
    frames = []
    # Open the stream before raising the flag: if opening fails, `recording`
    # stays False and a later key press can try again.
    stream = p.open(format=FORMAT,
                    channels=MIC_CHANNELS,
                    rate=MIC_SAMPLING_RATE,
                    input=True,
                    frames_per_buffer=CHUNK,
                    input_device_index=MICROPHONE_ID)
    recording = True


def on_release_key(_):
    """Finish the recording, convert it, and play the result.

    Stops and closes the input stream, writes the captured frames to
    INPUT_VOICE_PATH, runs the voice conversion, then plays the output on the
    virtual cable and the speakers in parallel.

    Args:
        _: Keyboard event supplied by the hook (unused).
    """
    global recording, stream
    recording = False

    # A release can arrive without an open stream (e.g. the press handler
    # failed to open one); there is nothing to close or convert in that case.
    if stream is None:
        return

    stream.stop_stream()
    stream.close()
    stream = None

    # if key not held down for long enough (fewer than 20 captured chunks)
    if len(frames) < 20:
        print('No audio file to transcribe detected. Hold down the key for a longer time.')
        return

    print('Converting voice...')

    start_time = time()
    # write microphone audio to file; the context manager closes it on error too
    with wave.open(str(INPUT_VOICE_PATH), 'wb') as wf:
        wf.setnchannels(MIC_CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(MIC_SAMPLING_RATE)
        wf.writeframes(b''.join(frames))

    # voice change
    rvc_infer()
    print(f'Time taken for RVC voice conversion: {time() - start_time}s')

    # play to both app mic input and speakers
    threads = [
        Thread(target=play_voice, args=[device_id])
        for device_id in (CABLE_INPUT_ID, SPEAKERS_INPUT_ID)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    device = f'cuda:{GPU_INDEX}'
    is_half = True
    config = Config(device, is_half)
    # Config may switch the device or force single precision. Load the models
    # with the same settings that VC receives through `config`, otherwise the
    # models and the pipeline can disagree on device/dtype.
    device = config.device
    is_half = config.is_half

    # NOTE: despite the .mp3 suffix, on_release_key writes the input file with
    # the wave module.
    INPUT_VOICE_PATH = str(BASE_DIR / 'AniVoiceChanger' / 'audio' / 'input.mp3')
    OUTPUT_VOICE_PATH = str(BASE_DIR / 'AniVoiceChanger' / 'audio' / 'output.wav')
    CHUNK = 1024
    FORMAT = pyaudio.paInt16

    p = pyaudio.PyAudio()
    # Fall back to the system default devices when no ID was configured.
    if MICROPHONE_ID is None:
        MICROPHONE_ID = p.get_default_input_device_info()['index']

    if SPEAKERS_INPUT_ID is None:
        SPEAKERS_INPUT_ID = p.get_default_output_device_info()['index']

    # Find the virtual audio cable by name.
    CABLE_INPUT_ID = None
    for audio_device in sd.query_devices():
        if 'CABLE Input' in audio_device['name']:
            CABLE_INPUT_ID = audio_device['index']
            break

    # Compare against None explicitly: device index 0 is a valid match and
    # must not be reported as "not found".
    if CABLE_INPUT_ID is None:
        print('Virtual audio cable was not found. Please download and install it.')
        sys.exit()

    # channel count comes from the mic; the sampling rate is fixed
    mic_info = p.get_device_info_by_index(MICROPHONE_ID)
    MIC_CHANNELS = mic_info['maxInputChannels']
    MIC_SAMPLING_RATE = 40000

    print('Voice changer is booting up...')

    # load hubert model
    hubert_model = load_hubert()

    # get vc
    cpt, version, net_g, tgt_sr, vc = get_vc()

    # Recording state shared with the keyboard callbacks.
    frames = []
    recording = False
    stream = None

    keyboard.on_press_key(MIC_RECORD_KEY, on_press_key)
    keyboard.on_release_key(MIC_RECORD_KEY, on_release_key)

    try:
        print('Voice changer is ready.')
        while True:
            if recording and stream:
                # Pull one chunk from the microphone while the key is held.
                data = stream.read(CHUNK)
                frames.append(data)
            else:
                # Idle: sleep instead of busy-waiting for the next key press.
                sleep(0.2)

    except KeyboardInterrupt:
        print('Closing voice changer...')