Full Code of gitmylo/bark-voice-cloning-HuBERT-quantizer for AI

master 4f42e44480fb

22 files

39.3 KB

10.6k tokens

Repository: gitmylo/bark-voice-cloning-HuBERT-quantizer
Branch: master
Commit: 4f42e44480fb
Files: 22
Total size: 39.3 KB

Directory structure:
gitextract_i_oo11tb/

├── .gitignore
├── .idea/
│   ├── .gitignore
│   ├── bark.iml
│   ├── inspectionProfiles/
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── LICENSE
├── args.py
├── bark_hubert_quantizer/
│   ├── __init__.py
│   ├── customtokenizer.py
│   ├── hubert_manager.py
│   └── pre_kmeans_hubert.py
├── colab_notebook.ipynb
├── notebook.ipynb
├── prepare.py
├── process.py
├── readme.md
├── requirements.txt
├── setup.py
└── test_hubert.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
/model
/Literature/semantic/*
/Literature/wav/*
**/speaker.wav
**/speaker.npz

__pycache__


================================================
FILE: .idea/.gitignore
================================================
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml


================================================
FILE: .idea/bark.iml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.10 (bark)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

================================================
FILE: .idea/inspectionProfiles/Project_Default.xml
================================================
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="8">
            <item index="0" class="java.lang.String" itemvalue="opencv-contrib-python" />
            <item index="1" class="java.lang.String" itemvalue="transformers" />
            <item index="2" class="java.lang.String" itemvalue="pytorch_lightning" />
            <item index="3" class="java.lang.String" itemvalue="invisible-watermark" />
            <item index="4" class="java.lang.String" itemvalue="timm" />
            <item index="5" class="java.lang.String" itemvalue="diffusers" />
            <item index="6" class="java.lang.String" itemvalue="tomesd" />
            <item index="7" class="java.lang.String" itemvalue="Pillow" />
          </list>
        </value>
      </option>
    </inspection_tool>
    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="E501" />
          <option value="E303" />
          <option value="E402" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="N812" />
          <option value="N806" />
          <option value="N803" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PyTypeCheckerInspection" enabled="false" level="WARNING" enabled_by_default="false" />
  </profile>
</component>

================================================
FILE: .idea/inspectionProfiles/profiles_settings.xml
================================================
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

================================================
FILE: .idea/misc.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (bark)" project-jdk-type="Python SDK" />
</project>

================================================
FILE: .idea/modules.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/bark.iml" filepath="$PROJECT_DIR$/.idea/bark.iml" />
    </modules>
  </component>
</project>

================================================
FILE: .idea/vcs.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2023 Mylo

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: args.py
================================================
from argparse import ArgumentParser

parser = ArgumentParser()

parser.add_argument('--path', required=True, help='The path containing your semantic tokens and wavs')
parser.add_argument('--mode', required=True, help='The mode to use', choices=['prepare', 'prepare2', 'train', 'test'])
parser.add_argument('--hubert-model', default='model/hubert/hubert_base_ls960.pt', help='The hubert model to use for preparing the data and later creation of semantic tokens.')
parser.add_argument('--train-save-epochs', default=1, type=int, help='The amount of epochs to train before saving')

args = parser.parse_args()


================================================
FILE: bark_hubert_quantizer/__init__.py
================================================


================================================
FILE: bark_hubert_quantizer/customtokenizer.py
================================================
"""
Custom tokenizer model.
Author: https://www.github.com/gitmylo/
License: MIT
"""

import json
import os.path
from zipfile import ZipFile

import numpy
import torch
from torch import nn, optim
from torch.serialization import MAP_LOCATION


class CustomTokenizer(nn.Module):
    def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
        super(CustomTokenizer, self).__init__()
        next_size = input_size
        if version == 0:
            self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
            next_size = hidden_size
        if version == 1:
            self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
            self.intermediate = nn.Linear(hidden_size, 4096)
            next_size = 4096

        self.fc = nn.Linear(next_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.optimizer: optim.Optimizer = None
        self.lossfunc = nn.CrossEntropyLoss()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.version = version

    def forward(self, x):
        x, _ = self.lstm(x)
        if self.version == 1:
            x = self.intermediate(x)
        x = self.fc(x)
        x = self.softmax(x)
        return x

    @torch.no_grad()
    def get_token(self, x):
        """
        Used to get the token for the first
        :param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
        :return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
        """
        return torch.argmax(self(x), dim=1)

    def prepare_training(self):
        self.optimizer = optim.Adam(self.parameters(), 0.001)

    def train_step(self, x_train, y_train, log_loss=False):
        # y_train = y_train[:-1]
        # y_train = y_train[1:]

        optimizer = self.optimizer
        lossfunc = self.lossfunc
        # Zero the gradients
        self.zero_grad()

        # Forward pass
        y_pred = self(x_train)

        y_train_len = len(y_train)
        y_pred_len = y_pred.shape[0]

        if y_train_len > y_pred_len:
            diff = y_train_len - y_pred_len
            y_train = y_train[diff:]
        elif y_train_len < y_pred_len:
            diff = y_pred_len - y_train_len
            y_pred = y_pred[:-diff, :]

        y_train_hot = torch.zeros(len(y_train), self.output_size)
        y_train_hot[range(len(y_train)), y_train] = 1
        y_train_hot = y_train_hot.to('cuda')

        # Calculate the loss
        loss = lossfunc(y_pred, y_train_hot)

        # Print loss
        if log_loss:
            print('Loss', loss.item())

        # Backward pass
        loss.backward()

        # Update the weights
        optimizer.step()

    def save(self, path):
        info_path = '.'.join(os.path.basename(path).split('.')[:-1]) + '/.info'
        torch.save(self.state_dict(), path)
        data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
        with ZipFile(path, 'a') as model_zip:
            model_zip.writestr(info_path, data_from_model.save())
            model_zip.close()

    @staticmethod
    def load_from_checkpoint(path, map_location: MAP_LOCATION = None):
        old = True
        with ZipFile(path) as model_zip:
            filesMatch = [file for file in model_zip.namelist() if file.endswith('/.info')]
            file = filesMatch[0] if filesMatch else None
            if file:
                old = False
                data_from_model = Data.load(model_zip.read(file).decode('utf-8'))
            model_zip.close()
        if old:
            model = CustomTokenizer()
        else:
            model = CustomTokenizer(data_from_model.hidden_size, data_from_model.input_size, data_from_model.output_size, data_from_model.version)
        model.load_state_dict(torch.load(path, map_location=map_location))
        if map_location:
            model = model.to(map_location)
        return model



class Data:
    input_size: int
    hidden_size: int
    output_size: int
    version: int

    def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.version = version

    @staticmethod
    def load(string):
        data = json.loads(string)
        return Data(data['input_size'], data['hidden_size'], data['output_size'], data['version'])

    def save(self):
        data = {
            'input_size': self.input_size,
            'hidden_size': self.hidden_size,
            'output_size': self.output_size,
            'version': self.version,
        }
        return json.dumps(data)


def auto_train(data_path, save_path='model.pth', load_model: str | None = None, save_epochs=1):
    data_x, data_y = {}, {}

    if load_model and os.path.isfile(load_model):
        print('Loading model from', load_model)
        model_training = CustomTokenizer.load_from_checkpoint(load_model, 'cuda')
    else:
        print('Creating new model.')
        model_training = CustomTokenizer(version=1).to('cuda')
    save_path = os.path.join(data_path, save_path)
    base_save_path = '.'.join(save_path.split('.')[:-1])

    sem_string = '_semantic.npy'
    feat_string = '_semantic_features.npy'

    ready = os.path.join(data_path, 'ready')
    for input_file in os.listdir(ready):
        full_path = os.path.join(ready, input_file)        
        try:
            prefix = input_file.split("_")[0]
            number = int(prefix)
        except ValueError as e:            
            raise e
        if input_file.endswith(sem_string):
            data_y[number] = numpy.load(full_path)
        elif input_file.endswith(feat_string):
            data_x[number] = numpy.load(full_path)
    
    model_training.prepare_training()
    epoch = 1

    while 1:
        for i in range(save_epochs):
            j = 0
            for i in range(max(len(data_x), len(data_y))):
                x = data_x.get(i)
                y = data_y.get(i)
                if x is None or y is None:
                    print(f'The training data does not match. key={i}')
                    continue
                model_training.train_step(torch.tensor(x).to('cuda'), torch.tensor(y).to('cuda'), j % 50 == 0)  # Print loss every 50 steps
                j += 1
        save_p = save_path
        save_p_2 = f'{base_save_path}_epoch_{epoch}.pth'
        model_training.save(save_p)
        model_training.save(save_p_2)
        print(f'Epoch {epoch} completed')
        epoch += 1


================================================
FILE: bark_hubert_quantizer/hubert_manager.py
================================================
import os.path
import shutil
import urllib.request

import huggingface_hub


class HuBERTManager:
    @staticmethod
    def make_sure_hubert_installed(download_url: str = 'https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt', file_name: str = 'hubert.pt'):
        install_dir = os.path.join('data', 'models', 'hubert')
        if not os.path.isdir(install_dir):
            os.makedirs(install_dir, exist_ok=True)
        install_file = os.path.join(install_dir, file_name)
        if not os.path.isfile(install_file):
            print('Downloading HuBERT base model')
            urllib.request.urlretrieve(download_url, install_file)
            print('Downloaded HuBERT')
        return install_file


    @staticmethod
    def make_sure_tokenizer_installed(model: str = 'quantifier_hubert_base_ls960_14.pth', repo: str = 'GitMylo/bark-voice-cloning', local_file: str = 'tokenizer.pth'):
        install_dir = os.path.join('data', 'models', 'hubert')
        if not os.path.isdir(install_dir):
            os.makedirs(install_dir, exist_ok=True)
        install_file = os.path.join(install_dir, local_file)
        if not os.path.isfile(install_file):
            print('Downloading HuBERT custom tokenizer')
            huggingface_hub.hf_hub_download(repo, model, local_dir=install_dir, local_dir_use_symlinks=False)
            shutil.move(os.path.join(install_dir, model), install_file)
            print('Downloaded tokenizer')
        return install_file


================================================
FILE: bark_hubert_quantizer/pre_kmeans_hubert.py
================================================
"""
Modified HuBERT model without kmeans.
Original author: https://github.com/lucidrains/
Modified by: https://www.github.com/gitmylo/
License: MIT
"""

# Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py

from pathlib import Path

import torch
from torch import nn
from einops import pack, unpack

import fairseq

from torchaudio.functional import resample

from audiolm_pytorch.utils import curtail_to_multiple

import logging
logging.root.setLevel(logging.ERROR)


def exists(val):
    return val is not None


def default(val, d):
    return val if exists(val) else d


class CustomHubert(nn.Module):
    """
    checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
    or you can train your own
    """

    def __init__(
        self,
        checkpoint_path,
        target_sample_hz=16000,
        seq_len_multiple_of=None,
        output_layer=9,
        device=None
    ):
        super().__init__()
        self.target_sample_hz = target_sample_hz
        self.seq_len_multiple_of = seq_len_multiple_of
        self.output_layer = output_layer

        if device is not None:
            self.to(device)

        model_path = Path(checkpoint_path)

        assert model_path.exists(), f'path {checkpoint_path} does not exist'

        checkpoint = torch.load(checkpoint_path, map_location=device)
        load_model_input = {checkpoint_path: checkpoint}
        model, *_ = fairseq.checkpoint_utils.load_model_ensemble_and_task(load_model_input)

        if device is not None:
            model[0].to(device)

        self.model = model[0]
        self.model.eval()

    @property
    def groups(self):
        return 1

    @torch.no_grad()
    def forward(
        self,
        wav_input,
        flatten=True,
        input_sample_hz=None
    ):
        device = wav_input.device

        if exists(input_sample_hz):
            wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)

        if exists(self.seq_len_multiple_of):
            wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)

        embed = self.model(
            wav_input,
            features_only=True,
            mask=False,  # thanks to @maitycyrus for noticing that mask is defaulted to True in the fairseq code
            output_layer=self.output_layer
        )

        embed, packed_shape = pack([embed['x']], '* d')

        # codebook_indices = self.kmeans.predict(embed.cpu().detach().numpy())

        codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)  # .long()

        if flatten:
            return codebook_indices

        codebook_indices, = unpack(codebook_indices, packed_shape, '*')
        return codebook_indices


================================================
FILE: colab_notebook.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# Bark text-to-speech voice cloning.\n",
    "Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark).\n",
    "(This version of the notebook is made to work on Google Colab, make sure your runtime hardware accelerator is set to GPU)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# Google Colab: Clone the repository"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!git clone https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/\n",
    "%cd bark-voice-cloning-HuBERT-quantizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Install packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%pip install -r requirements.txt\n",
    "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Load models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-05-26 21:27:49 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading HuBERT...\n",
      "Loading Quantizer...\n",
      "Loading Encodec...\n",
      "Downloaded and loaded models!\n"
     ]
    }
   ],
   "source": [
    "large_quant_model = False  # Use the larger pretrained model\n",
    "device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')\n",
    "\n",
    "import numpy as np\n",
    "import torch\n",
    "import torchaudio\n",
    "from encodec import EncodecModel\n",
    "from encodec.utils import convert_audio\n",
    "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n",
    "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n",
    "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n",
    "\n",
    "model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')\n",
    "\n",
    "print('Loading HuBERT...')\n",
    "hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)\n",
    "print('Loading Quantizer...')\n",
    "quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)\n",
    "print('Loading Encodec...')\n",
    "encodec_model = EncodecModel.encodec_model_24khz()\n",
    "encodec_model.set_target_bandwidth(6.0)\n",
    "encodec_model.to(device)\n",
    "\n",
    "print('Downloaded and loaded models!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Load wav and create speaker history prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting semantics...\n",
      "Tokenizing semantics...\n",
      "Creating coarse and fine prompts...\n",
      "Done!\n"
     ]
    }
   ],
   "source": [
    "wav_file = 'speaker.wav'  # Put the path of the speaker you want to use here.\n",
    "out_file = 'speaker.npz'  # Put the path to save the cloned speaker to here.\n",
    "\n",
    "wav, sr = torchaudio.load(wav_file)\n",
    "\n",
    "wav_hubert = wav.to(device)\n",
    "\n",
    "if wav_hubert.shape[0] == 2:  # Stereo to mono if needed\n",
    "    wav_hubert = wav_hubert.mean(0, keepdim=True)\n",
    "\n",
    "print('Extracting semantics...')\n",
    "semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)\n",
    "print('Tokenizing semantics...')\n",
    "semantic_tokens = quant_model.get_token(semantic_vectors)\n",
    "print('Creating coarse and fine prompts...')\n",
    "wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)\n",
    "\n",
    "wav = wav.to(device)\n",
    "\n",
    "with torch.no_grad():\n",
    "    encoded_frames = encodec_model.encode(wav)\n",
    "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()\n",
    "\n",
    "codes = codes.cpu()\n",
    "semantic_tokens = semantic_tokens.cpu()\n",
    "\n",
    "np.savez(out_file,\n",
    "         semantic_prompt=semantic_tokens,\n",
    "         fine_prompt=codes,\n",
    "         coarse_prompt=codes[:2, :]\n",
    "         )\n",
    "\n",
    "print('Done!')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}


================================================
FILE: notebook.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# Bark text-to-speech voice cloning.\n",
    "Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Install packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%pip install -r requirements.txt\n",
    "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Load models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-05-26 21:27:49 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading HuBERT...\n",
      "Loading Quantizer...\n",
      "Loading Encodec...\n",
      "Downloaded and loaded models!\n"
     ]
    }
   ],
   "source": [
    "large_quant_model = False  # Use the larger pretrained model\n",
    "device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')\n",
    "\n",
    "import numpy as np\n",
    "import torch\n",
    "import torchaudio\n",
    "from encodec import EncodecModel\n",
    "from encodec.utils import convert_audio\n",
    "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n",
    "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n",
    "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n",
    "\n",
    "model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')\n",
    "\n",
    "print('Loading HuBERT...')\n",
    "hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)\n",
    "print('Loading Quantizer...')\n",
    "quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)\n",
    "print('Loading Encodec...')\n",
    "encodec_model = EncodecModel.encodec_model_24khz()\n",
    "encodec_model.set_target_bandwidth(6.0)\n",
    "encodec_model.to(device)\n",
    "\n",
    "print('Downloaded and loaded models!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Load wav and create speaker history prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting semantics...\n",
      "Tokenizing semantics...\n",
      "Creating coarse and fine prompts...\n",
      "Done!\n"
     ]
    }
   ],
   "source": [
    "wav_file = 'speaker.wav'  # Put the path of the speaker you want to use here.\n",
    "out_file = 'speaker.npz'  # Put the path to save the cloned speaker to here.\n",
    "\n",
    "wav, sr = torchaudio.load(wav_file)\n",
    "\n",
    "wav_hubert = wav.to(device)\n",
    "\n",
    "if wav_hubert.shape[0] == 2:  # Stereo to mono if needed\n",
    "    wav_hubert = wav_hubert.mean(0, keepdim=True)\n",
    "\n",
    "print('Extracting semantics...')\n",
    "semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)\n",
    "print('Tokenizing semantics...')\n",
    "semantic_tokens = quant_model.get_token(semantic_vectors)\n",
    "print('Creating coarse and fine prompts...')\n",
    "wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)\n",
    "\n",
    "wav = wav.to(device)\n",
    "\n",
    "with torch.no_grad():\n",
    "    encoded_frames = encodec_model.encode(wav)\n",
    "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()\n",
    "\n",
    "codes = codes.cpu()\n",
    "semantic_tokens = semantic_tokens.cpu()\n",
    "\n",
    "np.savez(out_file,\n",
    "         semantic_prompt=semantic_tokens,\n",
    "         fine_prompt=codes,\n",
    "         coarse_prompt=codes[:2, :]\n",
    "         )\n",
    "\n",
    "print('Done!')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}


================================================
FILE: prepare.py
================================================
import os
import shutil
import zipfile

import numpy
import torchaudio

from hubert.pre_kmeans_hubert import CustomHubert

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def prepare(path):
    """
    Put all the training data in one folder
    :param path: The path to the training data, with 2 subdirectories with zips, "semantic" and "wav", with equal pairs in both directories
    """
    path = os.path.abspath(path)
    raw_data_paths = {
        'semantic': os.path.join(path, 'semantic'),
        'wav': os.path.join(path, 'wav')
    }
    prepared_path = os.path.join(path, 'prepared')

    if not os.path.isdir(prepared_path):
        os.mkdir(prepared_path)

    offset = 0

    for zip_file in os.listdir(raw_data_paths['semantic']):
        print(f'Extracting {os.path.basename(zip_file)}')
        offset = extract_files({
            'semantic': os.path.join(raw_data_paths['semantic'], zip_file),
            'wav': os.path.join(raw_data_paths['wav'], zip_file)
        }, prepared_path, offset)


def extract_files(zip_files: dict[str, str], out: str, start_offset: int = 0) -> int:
    new_offset = start_offset
    with zipfile.ZipFile(zip_files['semantic'], 'r') as semantic_zip:
        with zipfile.ZipFile(zip_files['wav'], 'r') as wav_zip:
            for file in semantic_zip.infolist():
                for file2 in wav_zip.infolist():
                    if ''.join(file.filename.split('.')[:-1]).lower() == ''.join(file2.filename.split('.')[:-1]):
                        semantic_zip.extract(file, out)
                        shutil.move(os.path.join(out, file.filename), os.path.join(out, f'{new_offset}_semantic.npy'))
                        wav_zip.extract(file2, out)
                        shutil.move(os.path.join(out, file2.filename), os.path.join(out, f'{new_offset}_wav.wav'))
                        new_offset += 1
            wav_zip.close()
        semantic_zip.close()

    return new_offset


def prepare2(path, model):
    prepared = os.path.join(path, 'prepared')
    ready = os.path.join(path, 'ready')
    hubert_model = CustomHubert(checkpoint_path=model, device=device)
    if not os.path.isdir(ready):
        os.mkdir(ready)

    wav_string = '_wav.wav'
    sem_string = '_semantic.npy'

    for input_file in os.listdir(prepared):
        input_path = os.path.join(prepared, input_file)
        if input_file.endswith(wav_string):
            file_num = int(input_file[:-len(wav_string)])
            fname = f'{file_num}_semantic_features.npy'
            print('Processing', input_file)
            if os.path.isfile(fname):
                continue
            wav, sr = torchaudio.load(input_path)
            wav = wav.to(device)

            if wav.shape[0] == 2:  # Stereo to mono if needed
                wav = wav.mean(0, keepdim=True)

            output = hubert_model.forward(wav, input_sample_hz=sr)
            out_array = output.cpu().numpy()
            numpy.save(os.path.join(ready, fname), out_array)
        elif input_file.endswith(sem_string):
            fname = os.path.join(ready, input_file)
            if os.path.isfile(fname):
                continue
            shutil.copy(input_path, fname)
    print('All set! We\'re ready to train!')


================================================
FILE: process.py
================================================
import os.path

from args import args
from prepare import prepare, prepare2
from test_hubert import test_hubert
from hubert.customtokenizer import auto_train

path = args.path
mode = args.mode
model = args.hubert_model

if mode == 'prepare':
    prepare(path)

elif mode == 'prepare2':
    prepare2(path, model)

elif mode == 'train':
    auto_train(path, load_model=os.path.join(path, 'model.pth'), save_epochs=args.train_save_epochs)

elif mode == 'test':
    test_hubert(path, model)


================================================
FILE: readme.md
================================================
# Bark voice cloning

## Please read
This code works on python 3.10, i have not tested it on other versions. Some older versions will have issues.

## Voice cloning with bark in high quality?
It's possible now.

https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/assets/36931363/516375e2-d699-44fe-a928-cd0411982049



## How do I clone a voice?
For developers:
* [code examples on huggingface model page](https://huggingface.co/GitMylo/bark-voice-cloning)

For everyone:
* [audio-webui with bark and voice cloning](https://github.com/gitmylo/audio-webui)
* [online huggingface voice cloning space](https://huggingface.co/spaces/GitMylo/bark-voice-cloning)
* [interactive python notebook](notebook.ipynb)

## Voices cloned aren't very convincing, why are other people's cloned voices better than mine?
Make sure these things are **NOT** in your voice input: (in no particular order)
* Noise (You can use a noise remover before)
* Music (There are also music remover tools) (Unless you want music in the background)
* A cut-off at the end (This will cause it to try and continue on the generation)
* Under 1 second of training data (i personally suggest around 10 seconds for good potential, but i've had great results with 5 seconds as well.)

What makes for good prompt audio? (in no particular order)
* Clearly spoken
* No weird background noises
* Only one speaker
* Audio which ends after a sentence ends
* Regular/common voice (They usually have more success, it's still capable of cloning complex voices, but not as good at it)
* Around 10 seconds of data

## Pretrained models
### Official

| Name                                                                                                                                         | HuBERT Model                                                              | Quantizer Version | Epoch | Language | Dataset                                                                                          |
|----------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|-------------------|-------|----------|--------------------------------------------------------------------------------------------------|
| [quantifier_hubert_base_ls960.pth](https://huggingface.co/GitMylo/bark-voice-cloning/blob/main/quantifier_hubert_base_ls960.pth)             | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 0                 | 3     | ENG      | [GitMylo/bark-semantic-training](https://huggingface.co/datasets/GitMylo/bark-semantic-training) |
| [quantifier_hubert_base_ls960_14.pth](https://huggingface.co/GitMylo/bark-voice-cloning/blob/main/quantifier_hubert_base_ls960_14.pth)       | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 0                 | 14    | ENG      | [GitMylo/bark-semantic-training](https://huggingface.co/datasets/GitMylo/bark-semantic-training) |
| [quantifier_V1_hubert_base_ls960_23.pth](https://huggingface.co/GitMylo/bark-voice-cloning/blob/main/quantifier_V1_hubert_base_ls960_23.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 1                 | 23    | ENG      | [GitMylo/bark-semantic-training](https://huggingface.co/datasets/GitMylo/bark-semantic-training) |

### Community

| Author                                      | Name                                                                                                                                                                 | HuBERT Model                                                              | Quantizer Version | Epoch | Language | Dataset                                                                                                                      |
|---------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|-------------------|-------|----------|------------------------------------------------------------------------------------------------------------------------------|
| [HobisPL](https://github.com/HobisPL)       | [polish-HuBERT-quantizer_8_epoch.pth](https://huggingface.co/Hobis/bark-voice-cloning-polish-HuBERT-quantizer/blob/main/polish-HuBERT-quantizer_8_epoch.pth)         | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 1                 | 8     | POL      | [Hobis/bark-polish-semantic-wav-training](https://huggingface.co/datasets/Hobis/bark-polish-semantic-wav-training)           |
| [C0untFloyd](https://github.com/C0untFloyd) | [ german-HuBERT-quantizer_14_epoch.pth](https://huggingface.co/CountFloyd/bark-voice-cloning-german-HuBERT-quantizer/blob/main/german-HuBERT-quantizer_14_epoch.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 1                 | 14    | GER      | [CountFloyd/bark-german-semantic-wav-training](https://huggingface.co/datasets/CountFloyd/bark-german-semantic-wav-training) |


## For developers: Implementing voice cloning in your bark projects
* Simply copy the files from [this directory](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/tree/master/bark_hubert_quantizer) into your project.
* The [hubert manager](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/blob/master/hubert/hubert_manager.py) contains methods to download HuBERT and the custom Quantizer model.
* Loading the [CustomHuBERT](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/blob/master/hubert/pre_kmeans_hubert.py) should be pretty straightforward
* The [notebook](notebook.ipynb) contains code to use on cuda or cpu. Instead of just cpu.
```python
from hubert.pre_kmeans_hubert import CustomHubert
import torchaudio

# Load the HuBERT model,
# checkpoint_path should work fine with data/models/hubert/hubert.pt for the default config
hubert_model = CustomHubert(checkpoint_path='path/to/checkpoint')

# Run the model to extract semantic features from an audio file, where wav is your audio file
wav, sr = torchaudio.load('path/to/wav') # This is where you load your wav, with soundfile or torchaudio for example

if wav.shape[0] == 2:  # Stereo to mono if needed
    wav = wav.mean(0, keepdim=True)

semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
```
* Loading and running the [custom kmeans](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer)

```python
import torch
from hubert.customtokenizer import CustomTokenizer

# Load the CustomTokenizer model from a checkpoint
# With default config, you can use the pretrained model from huggingface
# With the default setup from HuBERTManager, this will be in data/models/hubert/tokenizer.pth
tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth')  # Automatically uses the right layers

# Process the semantic vectors from the previous HuBERT run (This works in batches, so you can send the entire HuBERT output)
semantic_tokens = tokenizer.get_token(semantic_vectors)

# Congratulations! You now have semantic tokens which can be used inside of a speaker prompt file.
```

## How do I train it myself?
Simply run the training commands.

A simple way to create semantic data and wavs for training, is with my script: [bark-data-gen](https://github.com/gitmylo/bark-data-gen). But remember that the creation of the wavs will take around the same time if not longer than the creation of the semantics. This can take a while to generate because of that.

For example, if you have a dataset with zips containing audio files, one zip for semantics, and one for the wav files. Inside of a folder called "Literature"

You should run `process.py --path Literature --mode prepare` for extracting all the data to one directory

You should run `process.py --path Literature --mode prepare2` for creating HuBERT semantic vectors, ready for training

You should run `process.py --path Literature --mode train` for training

And when your model has trained enough, you can run `process.py --path Literature --mode test` to test the latest model.

## Disclaimer
I am not responsible for audio generated using semantics created by this model. Just don't use it for illegal purposes.


================================================
FILE: requirements.txt
================================================
audiolm-pytorch==1.1.4
fairseq
huggingface-hub
sentencepiece
transformers
encodec
soundfile; platform_system == "Windows"
sox; platform_system != "Windows"


================================================
FILE: setup.py
================================================
from setuptools import setup

setup(
    name='bark_hubert_quantizer',
    version='0.0.4',
    packages=['bark_hubert_quantizer'],
    install_requires=[
        'audiolm-pytorch==1.1.4',
        'fairseq',
        'huggingface-hub',
        'sentencepiece',
        'transformers',
        'encodec',
        'soundfile; platform_system == "Windows"',
        'sox; platform_system != "Windows"'
    ],
)

================================================
FILE: test_hubert.py
================================================
import os

import numpy
import torch
import torchaudio

from hubert.customtokenizer import CustomTokenizer
from hubert.pre_kmeans_hubert import CustomHubert


def test_hubert(path: str, model: str = 'model/hubert/hubert_base_ls960.pt', tokenizer: str = 'model.pth'):
    hubert_model = CustomHubert(checkpoint_path=model)
    customtokenizer = CustomTokenizer.load_from_checkpoint(os.path.join(path, tokenizer))

    wav, sr = torchaudio.load(os.path.join(path, 'test', 'wav.wav'))
    original = numpy.load(os.path.join(path, 'test', 'semantic.npy'))

    out = hubert_model.forward(wav, input_sample_hz=sr)
    out_tokenized = customtokenizer.get_token(out)

    # print(out.shape, out_tokenized.shape)
    print(original[:-1], out_tokenized)
    numpy.save(os.path.join(path, 'test', 'gen_semantic.npy'), out_tokenized)

gitextract_i_oo11tb/

├── .gitignore
├── .idea/
│   ├── .gitignore
│   ├── bark.iml
│   ├── inspectionProfiles/
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── LICENSE
├── args.py
├── bark_hubert_quantizer/
│   ├── __init__.py
│   ├── customtokenizer.py
│   ├── hubert_manager.py
│   └── pre_kmeans_hubert.py
├── colab_notebook.ipynb
├── notebook.ipynb
├── prepare.py
├── process.py
├── readme.md
├── requirements.txt
├── setup.py
└── test_hubert.py

Condensed preview — 22 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (44K chars).

[
  {
    "path": ".gitignore",
    "chars": 91,
    "preview": "/model\n/Literature/semantic/*\n/Literature/wav/*\n**/speaker.wav\n**/speaker.npz\n\n__pycache__\n"
  },
  {
    "path": ".idea/.gitignore",
    "chars": 176,
    "preview": "# Default ignored files\n/shelf/\n/workspace.xml\n# Editor-based HTTP Client requests\n/httpRequests/\n# Datasource local sto..."
  },
  {
    "path": ".idea/bark.iml",
    "chars": 393,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<module type=\"PYTHON_MODULE\" version=\"4\">\n  <component name=\"NewModuleRootManager..."
  },
  {
    "path": ".idea/inspectionProfiles/Project_Default.xml",
    "chars": 1925,
    "preview": "<component name=\"InspectionProjectProfileManager\">\n  <profile version=\"1.0\">\n    <option name=\"myName\" value=\"Project De..."
  },
  {
    "path": ".idea/inspectionProfiles/profiles_settings.xml",
    "chars": 174,
    "preview": "<component name=\"InspectionProjectProfileManager\">\n  <settings>\n    <option name=\"USE_PROJECT_PROFILE\" value=\"false\" />..."
  },
  {
    "path": ".idea/misc.xml",
    "chars": 193,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"ProjectRootManager\" version=\"2\" project-..."
  },
  {
    "path": ".idea/modules.xml",
    "chars": 260,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"ProjectModuleManager\">\n    <modules>..."
  },
  {
    "path": ".idea/vcs.xml",
    "chars": 180,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"VcsDirectoryMappings\">\n    <mapping dire..."
  },
  {
    "path": "LICENSE",
    "chars": 1061,
    "preview": "MIT License\n\nCopyright (c) 2023 Mylo\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th..."
  },
  {
    "path": "args.py",
    "chars": 607,
    "preview": "from argparse import ArgumentParser\n\nparser = ArgumentParser()\n\nparser.add_argument('--path', required=True, help='The p..."
  },
  {
    "path": "bark_hubert_quantizer/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "bark_hubert_quantizer/customtokenizer.py",
    "chars": 6853,
    "preview": "\"\"\"\nCustom tokenizer model.\nAuthor: https://www.github.com/gitmylo/\nLicense: MIT\n\"\"\"\n\nimport json\nimport os.path\nfrom zi..."
  },
  {
    "path": "bark_hubert_quantizer/hubert_manager.py",
    "chars": 1473,
    "preview": "import os.path\nimport shutil\nimport urllib.request\n\nimport huggingface_hub\n\n\nclass HuBERTManager:\n    @staticmethod..."
  },
  {
    "path": "bark_hubert_quantizer/pre_kmeans_hubert.py",
    "chars": 2808,
    "preview": "\"\"\"\nModified HuBERT model without kmeans.\nOriginal author: https://github.com/lucidrains/\nModified by: https://www.githu..."
  },
  {
    "path": "colab_notebook.ipynb",
    "chars": 5517,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"source\": [\n    \"# Bark t..."
  },
  {
    "path": "notebook.ipynb",
    "chars": 4965,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"source\": [\n    \"# Bark t..."
  },
  {
    "path": "prepare.py",
    "chars": 3263,
    "preview": "import os\nimport shutil\nimport zipfile\n\nimport numpy\nimport torchaudio\n\nfrom hubert.pre_kmeans_hubert import CustomHuber..."
  },
  {
    "path": "process.py",
    "chars": 487,
    "preview": "import os.path\n\nfrom args import args\nfrom prepare import prepare, prepare2\nfrom test_hubert import test_hubert\nfrom hub..."
  },
  {
    "path": "readme.md",
    "chars": 8467,
    "preview": "# Bark voice cloning\n\n## Please read\nThis code works on python 3.10, i have not tested it on other versions. Some older..."
  },
  {
    "path": "requirements.txt",
    "chars": 156,
    "preview": "audiolm-pytorch==1.1.4\nfairseq\nhuggingface-hub\nsentencepiece\ntransformers\nencodec\nsoundfile; platform_system == \"Windows..."
  },
  {
    "path": "setup.py",
    "chars": 422,
    "preview": "from setuptools import setup\r\n\r\nsetup(\r\n    name='bark_hubert_quantizer',\r\n    version='0.0.4',\r\n    packages=['bark_hub..."
  },
  {
    "path": "test_hubert.py",
    "chars": 823,
    "preview": "import os\n\nimport numpy\nimport torch\nimport torchaudio\n\nfrom hubert.customtokenizer import CustomTokenizer\nfrom hubert.p..."
  }
]

About this extraction

This page contains the full source code of the gitmylo/bark-voice-cloning-HuBERT-quantizer GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 22 files (39.3 KB), approximately 10.6k tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo