Repository: ardha27/AI-Song-Cover-SOVITS Branch: main Commit: a1eda47fc900 Files: 3 Total size: 21.8 KB Directory structure: gitextract_7bhyr265/ ├── AI_Song_Cover_SOVITS.ipynb ├── FUNDING.yml └── README.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: AI_Song_Cover_SOVITS.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "gpuClass": "standard" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "source": [ "#@title 0. Check GPU\n", "!nvidia-smi" ], "metadata": { "id": "J5ES3aCa503T" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title Mount Google Drive\n", "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "id": "DM9Y_tjiA2Yf" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title 1. Install Library for Youtube WAV Download\n", "!pip install yt_dlp\n", "!pip install ffmpeg\n", "!mkdir youtubeaudio\n" ], "metadata": { "id": "td-Hhq5n6SFj" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "id": "doHo0Gzb5ePk" }, "outputs": [], "source": [ "#@title Download Youtube WAV\n", "from __future__ import unicode_literals\n", "import yt_dlp\n", "import ffmpeg\n", "import sys\n", "\n", "ydl_opts = {\n", " 'format': 'bestaudio/best',\n", "# 'outtmpl': 'output.%(ext)s',\n", " 'postprocessors': [{\n", " 'key': 'FFmpegExtractAudio',\n", " 'preferredcodec': 'wav',\n", " }],\n", " \"outtmpl\": 'youtubeaudio/audio', # this is where you can edit how you'd like the filenames to be formatted\n", "}\n", "def download_from_url(url):\n", " ydl.download([url])\n", " # stream = ffmpeg.input('output.m4a')\n", " # stream = ffmpeg.output(stream, 'output.wav')\n", "\n", "\n", "with yt_dlp.YoutubeDL(ydl_opts) as ydl:\n", " url = \"https://www.youtube.com/watch?v=cQGfLDnmWS8&pp=ygULYXNtYWxpYnJhc2k%3D\" #@param {type:\"string\"}\n", " download_from_url(url)\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "source": [ "#@title 2. Install Demucs for Separating Audio\n", "!python3 -m pip install -U demucs" ], "metadata": { "id": "-RGvSkYv9_aL" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title Separate Vocal and Instrument/Noise using Demucs\n", "import subprocess\n", "AUDIO_INPUT = \"/content/youtubeaudio/audio.wav\" #@param {type:\"string\"}\n", "\n", "command = f\"demucs --two-stems=vocals {AUDIO_INPUT}\"\n", "result = subprocess.run(command.split(), stdout=subprocess.PIPE)\n", "print(result.stdout.decode())" ], "metadata": { "id": "cZidoluC-EZ-" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title 3. Split The Audio into Smaller Duration Before Training\n", "#@markdown don't put space on speaker name\n", "SPEAKER_NAME = \"Hutao\" #@param {type:\"string\"}\n", "!mkdir -p dataset_raw/{SPEAKER_NAME}" ], "metadata": { "id": "texSOl1mEaNL" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "\n", "from scipy.io import wavfile\n", "import os\n", "import numpy as np\n", "import argparse\n", "from tqdm import tqdm\n", "import json\n", "\n", "from datetime import datetime, timedelta\n", "\n", "# Utility functions\n", "\n", "def GetTime(video_seconds):\n", "\n", " if (video_seconds < 0) :\n", " return 00\n", "\n", " else:\n", " sec = timedelta(seconds=float(video_seconds))\n", " d = datetime(1,1,1) + sec\n", "\n", " instant = str(d.hour).zfill(2) + ':' + str(d.minute).zfill(2) + ':' + str(d.second).zfill(2) + str('.001')\n", "\n", " return instant\n", "\n", "def GetTotalTime(video_seconds):\n", "\n", " sec = timedelta(seconds=float(video_seconds))\n", " d = datetime(1,1,1) + sec\n", " delta = str(d.hour) + ':' + str(d.minute) + \":\" + str(d.second)\n", "\n", " return delta\n", "\n", "def windows(signal, window_size, step_size):\n", " if type(window_size) is not int:\n", " raise AttributeError(\"Window size must be an integer.\")\n", " if type(step_size) is not int:\n", " raise AttributeError(\"Step size must be an integer.\")\n", " for i_start in range(0, len(signal), step_size):\n", " i_end = i_start + window_size\n", " if i_end >= len(signal):\n", " break\n", " yield signal[i_start:i_end]\n", "\n", "def energy(samples):\n", " return np.sum(np.power(samples, 2.)) / float(len(samples))\n", "\n", "def rising_edges(binary_signal):\n", " previous_value = 0\n", " index = 0\n", " for x in binary_signal:\n", " if x and not previous_value:\n", " yield index\n", " previous_value = x\n", " index += 1\n", "\n", "'''\n", "Last Acceptable Values\n", "\n", "min_silence_length = 0.3\n", "silence_threshold = 1e-3\n", "step_duration = 0.03/10\n", "\n", "'''\n", "# Change the arguments and the input file here\n", "input_file = \"/content/separated/htdemucs/audio/vocals.wav\" #@param {type:\"string\"}\n", "output_dir = f\"/content/dataset_raw/{SPEAKER_NAME}\"\n", "min_silence_length = 0.6 # The minimum length of silence at which a split may occur [seconds]. Defaults to 3 seconds.\n", "silence_threshold = 1e-4 # The energy level (between 0.0 and 1.0) below which the signal is regarded as silent.\n", "step_duration = 0.03/10 # The amount of time to step forward in the input file after calculating energy. Smaller value = slower, but more accurate silence detection. Larger value = faster, but might miss some split opportunities. Defaults to (min-silence-length / 10.).\n", "\n", "\n", "input_filename = input_file\n", "window_duration = min_silence_length\n", "if step_duration is None:\n", " step_duration = window_duration / 10.\n", "else:\n", " step_duration = step_duration\n", "\n", "output_filename_prefix = os.path.splitext(os.path.basename(input_filename))[0]\n", "dry_run = False\n", "\n", "print(\"Splitting {} where energy is below {}% for longer than {}s.\".format(\n", " input_filename,\n", " silence_threshold * 100.,\n", " window_duration\n", " )\n", ")\n", "\n", "# Read and split the file\n", "\n", "sample_rate, samples = input_data=wavfile.read(filename=input_filename, mmap=True)\n", "\n", "max_amplitude = np.iinfo(samples.dtype).max\n", "print(max_amplitude)\n", "\n", "max_energy = energy([max_amplitude])\n", "print(max_energy)\n", "\n", "window_size = int(window_duration * sample_rate)\n", "step_size = int(step_duration * sample_rate)\n", "\n", "signal_windows = windows(\n", " signal=samples,\n", " window_size=window_size,\n", " step_size=step_size\n", ")\n", "\n", "window_energy = (energy(w) / max_energy for w in tqdm(\n", " signal_windows,\n", " total=int(len(samples) / float(step_size))\n", "))\n", "\n", "window_silence = (e > silence_threshold for e in window_energy)\n", "\n", "cut_times = (r * step_duration for r in rising_edges(window_silence))\n", "\n", "# This is the step that takes long, since we force the generators to run.\n", "print(\"Finding silences...\")\n", "cut_samples = [int(t * sample_rate) for t in cut_times]\n", "cut_samples.append(-1)\n", "\n", "cut_ranges = [(i, cut_samples[i], cut_samples[i+1]) for i in range(len(cut_samples) - 1)]\n", "\n", "video_sub = {str(i) : [str(GetTime(((cut_samples[i])/sample_rate))),\n", " str(GetTime(((cut_samples[i+1])/sample_rate)))]\n", " for i in range(len(cut_samples) - 1)}\n", "\n", "for i, start, stop in tqdm(cut_ranges):\n", " output_file_path = \"{}_{:03d}.wav\".format(\n", " os.path.join(output_dir, output_filename_prefix),\n", " i\n", " )\n", " if not dry_run:\n", " print(\"Writing file {}\".format(output_file_path))\n", " wavfile.write(\n", " filename=output_file_path,\n", " rate=sample_rate,\n", " data=samples[start:stop]\n", " )\n", " else:\n", " print(\"Not writing file {}\".format(output_file_path))\n", "\n", "with open (output_dir+'\\\\'+output_filename_prefix+'.json', 'w') as output:\n", " json.dump(video_sub, output)" ], "metadata": { "id": "taNJV4l-_ZZt" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title 4. Install dependencies for Training\n", "#@markdown makesure there are no \"so-vits-svc-fork\" folder on your Drive before you run this\n", "!pip install pyworld==0.3.2\n", "!python -m pip install -U pip wheel\n", "%pip install -U ipython\n", "%pip install -U so-vits-svc-fork\n", "!mkdir drive/MyDrive/so-vits-svc-fork" ], "metadata": { "id": "RWk6EKflAvAS" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title Automatic preprocessing\n", "!svc pre-resample" ], "metadata": { "id": "sD6Rmv7SD24X" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!svc pre-config" ], "metadata": { "id": "IiM26DmFD5Of" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title Copy configs file\n", "!cp configs/44k/config.json drive/MyDrive/so-vits-svc-fork" ], "metadata": { "id": "S-2xib_3D7Oy" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "F0_METHOD = \"dio\" #@param [\"crepe\", \"crepe-tiny\", \"parselmouth\", \"dio\", \"harvest\"]\n", "!svc pre-hubert -fm {F0_METHOD}" ], "metadata": { "id": "odKQPmXiD8gz" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title Train\n", "%load_ext tensorboard\n", "%tensorboard --logdir drive/MyDrive/so-vits-svc-fork/logs/44k\n", "!svc train --model-path drive/MyDrive/so-vits-svc-fork/logs/44k" ], "metadata": { "id": "C1NukdvBEBNm" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title 5. Inference\n", "#@markdown remove \".wav\" on AUDIO. if you facing \"SVC Command not Found\", install step 4 depedencies first. Don't put space of folder/file name\n", "from IPython.display import Audio\n", "\n", "AUDIO = \"/content/separated/htdemucs/audio/vocals\" #@param {type:\"string\"}\n", "MODEL = \"/content/drive/MyDrive/so-vits-svc-fork/logs/44k/G_178.pth\" #@param {type:\"string\"}\n", "CONFIG = \"/content/drive/MyDrive/so-vits-svc-fork/logs/44k/config.json\" #@param {type:\"string\"}\n", "#@markdown Change According to Your Voice Tone. 12 = 1 Octave | -12 = -1 Octave\n", "PITCH = 0 #@param {type:\"integer\"}\n", "\n", "!svc infer {AUDIO}.wav -c {CONFIG} -m {MODEL} -na -t {PITCH}\n", "# Try comment this line below if you got Runtime Error\n", "try:\n", " display(Audio(f\"{AUDIO}.out.wav\", autoplay=True))\n", "except Exception as e: print(\"Error:\", str(e))\n" ], "metadata": { "id": "dLDnmZ9pH0mG" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title 5.1 Inference Using Pretrained Model" ], "metadata": { "id": "pHwuY3NH73n7" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title Credit - https://huggingface.co/spaces/zomehwh/sovits-models (Diganti dengan model Alice karena Model Hololive di Private oleh yang punya)\n", "!mkdir so-vits-test\n", "!wget -N \"https://huggingface.co/spaces/zomehwh/sovits-models/resolve/main/models/alice/alice.pth\" -P so-vits-test/\n", "!wget -N \"https://huggingface.co/spaces/zomehwh/sovits-models/resolve/main/models/alice/config.json\" -P so-vits-test/" ], "metadata": { "id": "MAEujsmx7-U8" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title 5.3 Combine Vocal and Instrument (Song Cover)\n", "!pip install pydub\n", "from pydub import AudioSegment\n", "\n", "VOCAL = \"/content/separated/htdemucs/audio/vocals.out.wav\" #@param {type:\"string\"}\n", "INSTRUMENT = \"/content/separated/htdemucs/audio/no_vocals.wav\" #@param {type:\"string\"}\n", "\n", "sound1 = AudioSegment.from_file(VOCAL)\n", "sound2 = AudioSegment.from_file(INSTRUMENT)\n", "\n", "combined = sound1.overlay(sound2)\n", "\n", "combined.export(\"/content/FinalCover.wav\", format='wav')\n", "try:\n", " display(Audio(f\"/content/FinalCover.wav\", autoplay=True))\n", "except Exception as e: print(\"Error:\", str(e))\n" ], "metadata": { "id": "w781LsPh_2Xf" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title 6. Additional : Audio Recording\n", "!pip install ffmpeg-python" ], "metadata": { "id": "In4WzV9iMVp4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "\"\"\"\n", "To write this piece of code I took inspiration/code from a lot of places.\n", "It was late night, so I'm not sure how much I created or just copied o.O\n", "Here are some of the possible references:\n", "https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/\n", "https://stackoverflow.com/a/18650249\n", "https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/\n", "https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/\n", "https://stackoverflow.com/a/49019356\n", "\"\"\"\n", "from IPython.display import HTML, Audio\n", "from google.colab.output import eval_js\n", "from base64 import b64decode\n", "import numpy as np\n", "from scipy.io.wavfile import read as wav_read\n", "import io\n", "import ffmpeg\n", "\n", "AUDIO_HTML = \"\"\"\n", "\n", "\"\"\"\n", "\n", "def get_audio():\n", " display(HTML(AUDIO_HTML))\n", " data = eval_js(\"data\")\n", " binary = b64decode(data.split(',')[1])\n", "\n", " process = (ffmpeg\n", " .input('pipe:0')\n", " .output('pipe:1', format='wav')\n", " .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)\n", " )\n", " output, err = process.communicate(input=binary)\n", "\n", " riff_chunk_size = len(output) - 8\n", " # Break up the chunk size into four bytes, held in b.\n", " q = riff_chunk_size\n", " b = []\n", " for i in range(4):\n", " q, r = divmod(q, 256)\n", " b.append(r)\n", "\n", " # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.\n", " riff = output[:4] + bytes(b) + output[8:]\n", "\n", " sr, audio = wav_read(io.BytesIO(riff))\n", "\n", " return audio, sr" ], "metadata": { "id": "0pFo47QTMMX8" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "audio, sr = get_audio()" ], "metadata": { "id": "hqcCW1h-NUB4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from scipy.io import wavfile\n", "wavfile.write(\"my_audio.wav\", sr, audio)" ], "metadata": { "id": "IakSo65tOPme" }, "execution_count": null, "outputs": [] } ] } ================================================ FILE: FUNDING.yml ================================================ ko_fi: ardhach ================================================ FILE: README.md ================================================ # AI-Song-Cover-SOVITS All in One Version : Youtube WAV Download, Separating Vocal, Splitting Audio, Training, and Inference Using Google Colab. ## Leave A Star if This Repo Was Helpful [![ko-fi](https://ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/R6R7AH1FA) Trakteer ### Tutorial (Indonesian) https://youtu.be/v5MwAqQTc6Q ### Google Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ardha27/AI-Song-Cover-SOVITS/blob/main/AI_Song_Cover_SOVITS.ipynb)