Repository: ardha27/AI-Song-Cover-SOVITS
Branch: main
Commit: a1eda47fc900
Files: 3
Total size: 21.8 KB
Directory structure:
gitextract_7bhyr265/
├── AI_Song_Cover_SOVITS.ipynb
├── FUNDING.yml
└── README.md
================================================
FILE CONTENTS
================================================
================================================
FILE: AI_Song_Cover_SOVITS.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"gpuClass": "standard"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"
"
]
},
{
"cell_type": "code",
"source": [
"#@title 0. Check GPU\n",
"!nvidia-smi"
],
"metadata": {
"id": "J5ES3aCa503T"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Mount Google Drive\n",
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"metadata": {
"id": "DM9Y_tjiA2Yf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title 1. Install Library for Youtube WAV Download\n",
"!pip install yt_dlp\n",
"!pip install ffmpeg\n",
"!mkdir youtubeaudio\n"
],
"metadata": {
"id": "td-Hhq5n6SFj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "doHo0Gzb5ePk"
},
"outputs": [],
"source": [
"#@title Download Youtube WAV\n",
"from __future__ import unicode_literals\n",
"import yt_dlp\n",
"import ffmpeg\n",
"import sys\n",
"\n",
"ydl_opts = {\n",
" 'format': 'bestaudio/best',\n",
"# 'outtmpl': 'output.%(ext)s',\n",
" 'postprocessors': [{\n",
" 'key': 'FFmpegExtractAudio',\n",
" 'preferredcodec': 'wav',\n",
" }],\n",
" \"outtmpl\": 'youtubeaudio/audio', # this is where you can edit how you'd like the filenames to be formatted\n",
"}\n",
"def download_from_url(url):\n",
" ydl.download([url])\n",
" # stream = ffmpeg.input('output.m4a')\n",
" # stream = ffmpeg.output(stream, 'output.wav')\n",
"\n",
"\n",
"with yt_dlp.YoutubeDL(ydl_opts) as ydl:\n",
" url = \"https://www.youtube.com/watch?v=cQGfLDnmWS8&pp=ygULYXNtYWxpYnJhc2k%3D\" #@param {type:\"string\"}\n",
" download_from_url(url)\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"source": [
"#@title 2. Install Demucs for Separating Audio\n",
"!python3 -m pip install -U demucs"
],
"metadata": {
"id": "-RGvSkYv9_aL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Separate Vocal and Instrument/Noise using Demucs\n",
"import subprocess\n",
"AUDIO_INPUT = \"/content/youtubeaudio/audio.wav\" #@param {type:\"string\"}\n",
"\n",
"command = f\"demucs --two-stems=vocals {AUDIO_INPUT}\"\n",
"result = subprocess.run(command.split(), stdout=subprocess.PIPE)\n",
"print(result.stdout.decode())"
],
"metadata": {
"id": "cZidoluC-EZ-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title 3. Split The Audio into Smaller Duration Before Training\n",
"#@markdown don't put space on speaker name\n",
"SPEAKER_NAME = \"Hutao\" #@param {type:\"string\"}\n",
"!mkdir -p dataset_raw/{SPEAKER_NAME}"
],
"metadata": {
"id": "texSOl1mEaNL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"\n",
"from scipy.io import wavfile\n",
"import os\n",
"import numpy as np\n",
"import argparse\n",
"from tqdm import tqdm\n",
"import json\n",
"\n",
"from datetime import datetime, timedelta\n",
"\n",
"# Utility functions\n",
"\n",
"def GetTime(video_seconds):\n",
"\n",
" if (video_seconds < 0) :\n",
" return 00\n",
"\n",
" else:\n",
" sec = timedelta(seconds=float(video_seconds))\n",
" d = datetime(1,1,1) + sec\n",
"\n",
" instant = str(d.hour).zfill(2) + ':' + str(d.minute).zfill(2) + ':' + str(d.second).zfill(2) + str('.001')\n",
"\n",
" return instant\n",
"\n",
"def GetTotalTime(video_seconds):\n",
"\n",
" sec = timedelta(seconds=float(video_seconds))\n",
" d = datetime(1,1,1) + sec\n",
" delta = str(d.hour) + ':' + str(d.minute) + \":\" + str(d.second)\n",
"\n",
" return delta\n",
"\n",
"def windows(signal, window_size, step_size):\n",
" if type(window_size) is not int:\n",
" raise AttributeError(\"Window size must be an integer.\")\n",
" if type(step_size) is not int:\n",
" raise AttributeError(\"Step size must be an integer.\")\n",
" for i_start in range(0, len(signal), step_size):\n",
" i_end = i_start + window_size\n",
" if i_end >= len(signal):\n",
" break\n",
" yield signal[i_start:i_end]\n",
"\n",
"def energy(samples):\n",
" return np.sum(np.power(samples, 2.)) / float(len(samples))\n",
"\n",
"def rising_edges(binary_signal):\n",
" previous_value = 0\n",
" index = 0\n",
" for x in binary_signal:\n",
" if x and not previous_value:\n",
" yield index\n",
" previous_value = x\n",
" index += 1\n",
"\n",
"'''\n",
"Last Acceptable Values\n",
"\n",
"min_silence_length = 0.3\n",
"silence_threshold = 1e-3\n",
"step_duration = 0.03/10\n",
"\n",
"'''\n",
"# Change the arguments and the input file here\n",
"input_file = \"/content/separated/htdemucs/audio/vocals.wav\" #@param {type:\"string\"}\n",
"output_dir = f\"/content/dataset_raw/{SPEAKER_NAME}\"\n",
"min_silence_length = 0.6 # The minimum length of silence at which a split may occur [seconds]. Defaults to 3 seconds.\n",
"silence_threshold = 1e-4 # The energy level (between 0.0 and 1.0) below which the signal is regarded as silent.\n",
"step_duration = 0.03/10 # The amount of time to step forward in the input file after calculating energy. Smaller value = slower, but more accurate silence detection. Larger value = faster, but might miss some split opportunities. Defaults to (min-silence-length / 10.).\n",
"\n",
"\n",
"input_filename = input_file\n",
"window_duration = min_silence_length\n",
"if step_duration is None:\n",
" step_duration = window_duration / 10.\n",
"else:\n",
" step_duration = step_duration\n",
"\n",
"output_filename_prefix = os.path.splitext(os.path.basename(input_filename))[0]\n",
"dry_run = False\n",
"\n",
"print(\"Splitting {} where energy is below {}% for longer than {}s.\".format(\n",
" input_filename,\n",
" silence_threshold * 100.,\n",
" window_duration\n",
" )\n",
")\n",
"\n",
"# Read and split the file\n",
"\n",
"sample_rate, samples = input_data=wavfile.read(filename=input_filename, mmap=True)\n",
"\n",
"max_amplitude = np.iinfo(samples.dtype).max\n",
"print(max_amplitude)\n",
"\n",
"max_energy = energy([max_amplitude])\n",
"print(max_energy)\n",
"\n",
"window_size = int(window_duration * sample_rate)\n",
"step_size = int(step_duration * sample_rate)\n",
"\n",
"signal_windows = windows(\n",
" signal=samples,\n",
" window_size=window_size,\n",
" step_size=step_size\n",
")\n",
"\n",
"window_energy = (energy(w) / max_energy for w in tqdm(\n",
" signal_windows,\n",
" total=int(len(samples) / float(step_size))\n",
"))\n",
"\n",
"window_silence = (e > silence_threshold for e in window_energy)\n",
"\n",
"cut_times = (r * step_duration for r in rising_edges(window_silence))\n",
"\n",
"# This is the step that takes long, since we force the generators to run.\n",
"print(\"Finding silences...\")\n",
"cut_samples = [int(t * sample_rate) for t in cut_times]\n",
"cut_samples.append(-1)\n",
"\n",
"cut_ranges = [(i, cut_samples[i], cut_samples[i+1]) for i in range(len(cut_samples) - 1)]\n",
"\n",
"video_sub = {str(i) : [str(GetTime(((cut_samples[i])/sample_rate))),\n",
" str(GetTime(((cut_samples[i+1])/sample_rate)))]\n",
" for i in range(len(cut_samples) - 1)}\n",
"\n",
"for i, start, stop in tqdm(cut_ranges):\n",
" output_file_path = \"{}_{:03d}.wav\".format(\n",
" os.path.join(output_dir, output_filename_prefix),\n",
" i\n",
" )\n",
" if not dry_run:\n",
" print(\"Writing file {}\".format(output_file_path))\n",
" wavfile.write(\n",
" filename=output_file_path,\n",
" rate=sample_rate,\n",
" data=samples[start:stop]\n",
" )\n",
" else:\n",
" print(\"Not writing file {}\".format(output_file_path))\n",
"\n",
"with open (output_dir+'\\\\'+output_filename_prefix+'.json', 'w') as output:\n",
" json.dump(video_sub, output)"
],
"metadata": {
"id": "taNJV4l-_ZZt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title 4. Install dependencies for Training\n",
"#@markdown makesure there are no \"so-vits-svc-fork\" folder on your Drive before you run this\n",
"!pip install pyworld==0.3.2\n",
"!python -m pip install -U pip wheel\n",
"%pip install -U ipython\n",
"%pip install -U so-vits-svc-fork\n",
"!mkdir drive/MyDrive/so-vits-svc-fork"
],
"metadata": {
"id": "RWk6EKflAvAS"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Automatic preprocessing\n",
"!svc pre-resample"
],
"metadata": {
"id": "sD6Rmv7SD24X"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!svc pre-config"
],
"metadata": {
"id": "IiM26DmFD5Of"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Copy configs file\n",
"!cp configs/44k/config.json drive/MyDrive/so-vits-svc-fork"
],
"metadata": {
"id": "S-2xib_3D7Oy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"F0_METHOD = \"dio\" #@param [\"crepe\", \"crepe-tiny\", \"parselmouth\", \"dio\", \"harvest\"]\n",
"!svc pre-hubert -fm {F0_METHOD}"
],
"metadata": {
"id": "odKQPmXiD8gz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Train\n",
"%load_ext tensorboard\n",
"%tensorboard --logdir drive/MyDrive/so-vits-svc-fork/logs/44k\n",
"!svc train --model-path drive/MyDrive/so-vits-svc-fork/logs/44k"
],
"metadata": {
"id": "C1NukdvBEBNm"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title 5. Inference\n",
"#@markdown remove \".wav\" on AUDIO. if you facing \"SVC Command not Found\", install step 4 depedencies first. Don't put space of folder/file name\n",
"from IPython.display import Audio\n",
"\n",
"AUDIO = \"/content/separated/htdemucs/audio/vocals\" #@param {type:\"string\"}\n",
"MODEL = \"/content/drive/MyDrive/so-vits-svc-fork/logs/44k/G_178.pth\" #@param {type:\"string\"}\n",
"CONFIG = \"/content/drive/MyDrive/so-vits-svc-fork/logs/44k/config.json\" #@param {type:\"string\"}\n",
"#@markdown Change According to Your Voice Tone. 12 = 1 Octave | -12 = -1 Octave\n",
"PITCH = 0 #@param {type:\"integer\"}\n",
"\n",
"!svc infer {AUDIO}.wav -c {CONFIG} -m {MODEL} -na -t {PITCH}\n",
"# Try comment this line below if you got Runtime Error\n",
"try:\n",
" display(Audio(f\"{AUDIO}.out.wav\", autoplay=True))\n",
"except Exception as e: print(\"Error:\", str(e))\n"
],
"metadata": {
"id": "dLDnmZ9pH0mG"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title 5.1 Inference Using Pretrained Model"
],
"metadata": {
"id": "pHwuY3NH73n7"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Credit - https://huggingface.co/spaces/zomehwh/sovits-models (Diganti dengan model Alice karena Model Hololive di Private oleh yang punya)\n",
"!mkdir so-vits-test\n",
"!wget -N \"https://huggingface.co/spaces/zomehwh/sovits-models/resolve/main/models/alice/alice.pth\" -P so-vits-test/\n",
"!wget -N \"https://huggingface.co/spaces/zomehwh/sovits-models/resolve/main/models/alice/config.json\" -P so-vits-test/"
],
"metadata": {
"id": "MAEujsmx7-U8"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title 5.3 Combine Vocal and Instrument (Song Cover)\n",
"!pip install pydub\n",
"from pydub import AudioSegment\n",
"\n",
"VOCAL = \"/content/separated/htdemucs/audio/vocals.out.wav\" #@param {type:\"string\"}\n",
"INSTRUMENT = \"/content/separated/htdemucs/audio/no_vocals.wav\" #@param {type:\"string\"}\n",
"\n",
"sound1 = AudioSegment.from_file(VOCAL)\n",
"sound2 = AudioSegment.from_file(INSTRUMENT)\n",
"\n",
"combined = sound1.overlay(sound2)\n",
"\n",
"combined.export(\"/content/FinalCover.wav\", format='wav')\n",
"try:\n",
" display(Audio(f\"/content/FinalCover.wav\", autoplay=True))\n",
"except Exception as e: print(\"Error:\", str(e))\n"
],
"metadata": {
"id": "w781LsPh_2Xf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title 6. Additional : Audio Recording\n",
"!pip install ffmpeg-python"
],
"metadata": {
"id": "In4WzV9iMVp4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"\"\"\"\n",
"To write this piece of code I took inspiration/code from a lot of places.\n",
"It was late night, so I'm not sure how much I created or just copied o.O\n",
"Here are some of the possible references:\n",
"https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/\n",
"https://stackoverflow.com/a/18650249\n",
"https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/\n",
"https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/\n",
"https://stackoverflow.com/a/49019356\n",
"\"\"\"\n",
"from IPython.display import HTML, Audio\n",
"from google.colab.output import eval_js\n",
"from base64 import b64decode\n",
"import numpy as np\n",
"from scipy.io.wavfile import read as wav_read\n",
"import io\n",
"import ffmpeg\n",
"\n",
"AUDIO_HTML = \"\"\"\n",
"\n",
"\"\"\"\n",
"\n",
"def get_audio():\n",
" display(HTML(AUDIO_HTML))\n",
" data = eval_js(\"data\")\n",
" binary = b64decode(data.split(',')[1])\n",
"\n",
" process = (ffmpeg\n",
" .input('pipe:0')\n",
" .output('pipe:1', format='wav')\n",
" .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)\n",
" )\n",
" output, err = process.communicate(input=binary)\n",
"\n",
" riff_chunk_size = len(output) - 8\n",
" # Break up the chunk size into four bytes, held in b.\n",
" q = riff_chunk_size\n",
" b = []\n",
" for i in range(4):\n",
" q, r = divmod(q, 256)\n",
" b.append(r)\n",
"\n",
" # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.\n",
" riff = output[:4] + bytes(b) + output[8:]\n",
"\n",
" sr, audio = wav_read(io.BytesIO(riff))\n",
"\n",
" return audio, sr"
],
"metadata": {
"id": "0pFo47QTMMX8"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"audio, sr = get_audio()"
],
"metadata": {
"id": "hqcCW1h-NUB4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from scipy.io import wavfile\n",
"wavfile.write(\"my_audio.wav\", sr, audio)"
],
"metadata": {
"id": "IakSo65tOPme"
},
"execution_count": null,
"outputs": []
}
]
}
================================================
FILE: FUNDING.yml
================================================
ko_fi: ardhach
================================================
FILE: README.md
================================================
# AI-Song-Cover-SOVITS
All-in-one version: YouTube WAV download, vocal separation, audio splitting, training, and inference using Google Colab.
## Leave A Star if This Repo Was Helpful
[![ko-fi](https://ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/R6R7AH1FA)
### Tutorial (Indonesian)
https://youtu.be/v5MwAqQTc6Q
### Google Colab
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ardha27/AI-Song-Cover-SOVITS/blob/main/AI_Song_Cover_SOVITS.ipynb)