Repository: liusongxiang/StarGAN-Voice-Conversion
Branch: master
Commit: a4633d8b4888
Files: 10
Total size: 51.6 KB
Directory structure:
gitextract_6c839ap3/
├── README.md
├── convert.py
├── converted_samples/
│ └── Readme
├── data_loader.py
├── logger.py
├── main.py
├── model.py
├── preprocess.py
├── solver.py
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: README.md
================================================
# StarGAN-Voice-Conversion
This is a pytorch implementation of the paper: StarGAN-VC: Non-parallel many-to-many voice conversion with star generative adversarial networks https://arxiv.org/abs/1806.02169 .
Note that the model architecture is a little different from that of the original paper.
# Dependencies
* Python 3.6 (or 3.5)
* Pytorch 0.4.0
* pyworld
* tqdm
* librosa
* tensorboardX and tensorboard
# Usage
## Download Dataset
Download and unzip [VCTK](https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html) corpus to designated directories.
```bash
mkdir ./data
wget "https://datashare.is.ed.ac.uk/bitstream/handle/10283/2651/VCTK-Corpus.zip?sequence=2&isAllowed=y"
unzip VCTK-Corpus.zip -d ./data
```
If the downloaded VCTK is in tar.gz, run this:
```bash
tar -xzvf VCTK-Corpus.tar.gz -C ./data
```
Preprocess data
We will use Mel-cepstral coefficients(MCEPs) here.
```bash
python preprocess.py --sample_rate 16000 \
--origin_wavpath data/VCTK-Corpus/wav48 \
--target_wavpath data/VCTK-Corpus/wav16 \
--mc_dir_train data/mc/train \
--mc_dir_test data/mc/test
```
Train model
Note: you may need to stop training early once the training-time test samples sound good, or you can inspect the training loss curves to decide when to stop.
```
python main.py
```
Convert
For example: restore model at step 200000 and specify the source speaker and target speaker to `p262` and `p272`, respectively.
```
python convert.py --resume_iters 200000 --src_spk p262 --trg_spk p272
```
## To-Do list
- [x] Post some converted samples (Please find some converted samples in the `converted_samples` folder).
## Papers that use this repo:
1. [AUTOVC: Zero-Shot Voice Style Transfer with Only Autoencoder Loss (ICML2019)](https://arxiv.org/pdf/1905.05879v2.pdf)
2. [Blow: a single-scale hyperconditioned flow for non-parallel raw-audio voice conversion (NeurIPS 2019)](https://arxiv.org/pdf/1906.00794.pdf)
3. [ADAGAN: ADAPTIVE GAN FOR MANY-TO-MANY NON-PARALLEL VOICE CONVERSION (under review for ICLR 2020)](https://openreview.net/pdf?id=HJlk-eHFwH)
================================================
FILE: convert.py
================================================
import argparse
from model import Generator
from torch.autograd import Variable
import torch
import torch.nn.functional as F
import numpy as np
import os
from os.path import join, basename, dirname, split
import time
import datetime
from data_loader import to_categorical
import librosa
from utils import *
import glob
# Below is the accent info for the used 10 speakers.
spk2acc = {'262': 'Edinburgh', #F
'272': 'Edinburgh', #M
'229': 'SouthEngland', #F
'232': 'SouthEngland', #M
'292': 'NorthernIrishBelfast', #M
'293': 'NorthernIrishBelfast', #F
'360': 'AmericanNewJersey', #M
'361': 'AmericanNewJersey', #F
'248': 'India', #F
'251': 'India'} #M
speakers = ['p262', 'p272', 'p229', 'p232', 'p292', 'p293', 'p360', 'p361', 'p248', 'p251']
spk2idx = dict(zip(speakers, range(len(speakers))))
class TestDataset(object):
    """Test-time dataset for voice conversion.

    Loads the per-speaker log-F0 and MCEP statistics saved by preprocess.py
    (from the *training* split) and builds the one-hot target-speaker
    condition vector fed to the generator.
    """
    def __init__(self, config):
        # Bug fix: the original f-string referenced the undefined name
        # `trg_spk`, so a bad --trg_spk raised NameError instead of the
        # intended AssertionError message.
        assert config.trg_spk in speakers, \
            f'The trg_spk should be chosen from {speakers}, but you choose {config.trg_spk}.'
        # Source / target speaker ids.
        self.src_spk = config.src_spk
        self.trg_spk = config.trg_spk
        # All test MCEP files belonging to the source speaker.
        self.mc_files = sorted(glob.glob(join(config.test_data_dir, f'{config.src_spk}*.npy')))
        # Statistics are computed on the training split, hence train_data_dir.
        self.src_spk_stats = np.load(join(config.train_data_dir, f'{config.src_spk}_stats.npz'))
        self.src_wav_dir = f'{config.wav_dir}/{config.src_spk}'
        self.trg_spk_stats = np.load(join(config.train_data_dir, f'{config.trg_spk}_stats.npz'))
        # Gaussian log-F0 statistics for pitch conversion.
        self.logf0s_mean_src = self.src_spk_stats['log_f0s_mean']
        self.logf0s_std_src = self.src_spk_stats['log_f0s_std']
        self.logf0s_mean_trg = self.trg_spk_stats['log_f0s_mean']
        self.logf0s_std_trg = self.trg_spk_stats['log_f0s_std']
        # MCEP normalisation statistics.
        self.mcep_mean_src = self.src_spk_stats['coded_sps_mean']
        self.mcep_std_src = self.src_spk_stats['coded_sps_std']
        self.mcep_mean_trg = self.trg_spk_stats['coded_sps_mean']
        self.mcep_std_trg = self.trg_spk_stats['coded_sps_std']
        # One-hot target-speaker condition.
        self.spk_idx = spk2idx[config.trg_spk]
        spk_cat = to_categorical([self.spk_idx], num_classes=len(speakers))
        self.spk_c_trg = spk_cat

    def get_batch_test_data(self, batch_size=4):
        """Return wav paths for the first `batch_size` source-speaker test files."""
        batch_data = []
        for i in range(batch_size):
            mcfile = self.mc_files[i]
            # Filenames look like '<spk>-<utt>.npy'; the wav shares the stem.
            filename = basename(mcfile).split('-')[-1]
            wavfile_path = join(self.src_wav_dir, filename.replace('npy', 'wav'))
            batch_data.append(wavfile_path)
        return batch_data
def load_wav(wavfile, sr=16000):
    """Load a mono wav at sampling rate `sr`, padded to a multiple of 4 frames."""
    audio, _ = librosa.load(wavfile, sr=sr, mono=True)
    # Pad so the frame count divides evenly through the generator's two
    # stride-2 down-sampling stages.
    return wav_padding(audio, sr=sr, frame_period=5, multiple=4)
def test(config):
    """Convert a batch of source-speaker test utterances to the target speaker.

    Restores the generator at step `config.resume_iters`, performs WORLD
    analysis on each test wav, converts the pitch statistics and MCEPs,
    then re-synthesises and writes the converted wavs (plus optional
    copy-synthesis references) under `config.convert_dir`.
    """
    os.makedirs(join(config.convert_dir, str(config.resume_iters)), exist_ok=True)
    sampling_rate, num_mcep, frame_period = 16000, 36, 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    G = Generator().to(device)
    test_loader = TestDataset(config)
    # Restore the trained generator.
    print(f'Loading the trained models from step {config.resume_iters}...')
    G_path = join(config.model_save_dir, f'{config.resume_iters}-G.ckpt')
    G.load_state_dict(torch.load(G_path, map_location=lambda storage, loc: storage))
    # Whether to also write a copy-synthesis (analysis/synthesis only) wav
    # for reference.  Replaces the original cryptic `if [True, False][0]:`.
    cpsyn_flag = True
    # Read a batch of test data.
    test_wavfiles = test_loader.get_batch_test_data(batch_size=config.num_converted_wavs)
    test_wavs = [load_wav(wavfile, sampling_rate) for wavfile in test_wavfiles]
    with torch.no_grad():
        for idx, wav in enumerate(test_wavs):
            print(len(wav))
            wav_name = basename(test_wavfiles[idx])
            # WORLD analysis: F0, spectral envelope, aperiodicity.
            f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate, frame_period=frame_period)
            # Gaussian-normalised log-F0 transformation: source -> target.
            f0_converted = pitch_conversion(f0=f0,
                mean_log_src=test_loader.logf0s_mean_src, std_log_src=test_loader.logf0s_std_src,
                mean_log_target=test_loader.logf0s_mean_trg, std_log_target=test_loader.logf0s_std_trg)
            coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_mcep)
            print("Before being fed into G: ", coded_sp.shape)
            # Normalise with source stats, convert, then denormalise with
            # target stats.
            coded_sp_norm = (coded_sp - test_loader.mcep_mean_src) / test_loader.mcep_std_src
            coded_sp_norm_tensor = torch.FloatTensor(coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)
            spk_conds = torch.FloatTensor(test_loader.spk_c_trg).to(device)
            coded_sp_converted_norm = G(coded_sp_norm_tensor, spk_conds).data.cpu().numpy()
            coded_sp_converted = np.squeeze(coded_sp_converted_norm).T * test_loader.mcep_std_trg + test_loader.mcep_mean_trg
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
            print("After being fed into G: ", coded_sp_converted.shape)
            # WORLD synthesis with the converted F0 and MCEPs.
            wav_transformed = world_speech_synthesis(f0=f0_converted, coded_sp=coded_sp_converted,
                                                     ap=ap, fs=sampling_rate, frame_period=frame_period)
            wav_id = wav_name.split('.')[0]
            librosa.output.write_wav(join(config.convert_dir, str(config.resume_iters),
                                     f'{wav_id}-vcto-{test_loader.trg_spk}.wav'), wav_transformed, sampling_rate)
            if cpsyn_flag:
                wav_cpsyn = world_speech_synthesis(f0=f0, coded_sp=coded_sp,
                                                   ap=ap, fs=sampling_rate, frame_period=frame_period)
                librosa.output.write_wav(join(config.convert_dir, str(config.resume_iters), f'cpsyn-{wav_name}'), wav_cpsyn, sampling_rate)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Model configuration.
    parser.add_argument('--num_speakers', type=int, default=10, help='dimension of speaker labels')
    parser.add_argument('--num_converted_wavs', type=int, default=8, help='number of wavs to convert.')
    parser.add_argument('--resume_iters', type=int, default=None, help='step to resume for testing.')
    # Bug fix: the --src_spk help text wrongly said 'target speaker.'.
    parser.add_argument('--src_spk', type=str, default='p262', help='source speaker.')
    parser.add_argument('--trg_spk', type=str, default='p272', help='target speaker.')
    # Directories.
    parser.add_argument('--train_data_dir', type=str, default='./data/mc/train')
    parser.add_argument('--test_data_dir', type=str, default='./data/mc/test')
    parser.add_argument('--wav_dir', type=str, default="./data/VCTK-Corpus/wav16")
    parser.add_argument('--log_dir', type=str, default='./logs')
    parser.add_argument('--model_save_dir', type=str, default='./models')
    parser.add_argument('--convert_dir', type=str, default='./converted')
    config = parser.parse_args()
    print(config)
    # Conversion is meaningless without a checkpoint to restore.
    if config.resume_iters is None:
        raise RuntimeError("Please specify the step number for resuming.")
    test(config)
================================================
FILE: converted_samples/Readme
================================================
cpsyn stands for copy-synthesis for references.
These converted samples were obtained from a model that was not well fine-tuned. If you want to get better results, please tune the hyper-parameters carefully.
================================================
FILE: data_loader.py
================================================
from torch.utils import data
import torch
import os
import random
import glob
from os.path import join, basename, dirname, split
import numpy as np
# Below is the accent info for the used 10 speakers.
spk2acc = {'262': 'Edinburgh', #F
'272': 'Edinburgh', #M
'229': 'SouthEngland', #F
'232': 'SouthEngland', #M
'292': 'NorthernIrishBelfast', #M
'293': 'NorthernIrishBelfast', #F
'360': 'AmericanNewJersey', #M
'361': 'AmericanNewJersey', #F
'248': 'India', #F
'251': 'India'} #M
min_length = 256 # Since we slice 256 frames from each utterance when training.
# Build a dict useful when we want to get one-hot representation of speakers.
speakers = ['p262', 'p272', 'p229', 'p232', 'p292', 'p293', 'p360', 'p361', 'p248', 'p251']
spk2idx = dict(zip(speakers, range(len(speakers))))
def to_categorical(y, num_classes=None):
    """Convert integer class labels into a one-hot (binary) matrix.

    # Arguments
        y: class vector to be converted into a matrix
            (integers from 0 to num_classes).
        num_classes: total number of classes; inferred as max(y)+1 when
            omitted.
    # Returns
        A float32 one-hot matrix with the classes axis placed last.
    (Adapted from Keras np_utils.)
    """
    y = np.array(y, dtype='int')
    shape = y.shape
    # Collapse a trailing singleton dimension, e.g. (n, 1) -> (n,).
    if shape and shape[-1] == 1 and len(shape) > 1:
        shape = tuple(shape[:-1])
    flat = y.ravel()
    if not num_classes:
        num_classes = np.max(flat) + 1
    count = flat.shape[0]
    onehot = np.zeros((count, num_classes), dtype=np.float32)
    onehot[np.arange(count), flat] = 1
    # Restore the original leading shape with the class axis appended.
    return np.reshape(onehot, shape + (num_classes,))
class MyDataset(data.Dataset):
    """Dataset of MCEP feature files with speaker labels.

    Each item is (mc, spk_idx, spk_one_hot) where `mc` is a fixed-length
    (D, T) MCEP segment randomly sliced from one utterance.
    """
    def __init__(self, data_dir):
        mc_files = glob.glob(join(data_dir, '*.npy'))
        # Keep only utterances of the 10 selected speakers ('pXXX' filename prefix).
        mc_files = [f for f in mc_files if basename(f)[:4] in speakers]
        # Drop utterances shorter than the training slice length.
        # Perf/bug fix: the original then re-loaded every surviving file a
        # second time to re-check the same condition, which could never fail
        # after filtering; that redundant pass is removed.
        self.mc_files = self.rm_too_short_utt(mc_files)
        self.num_files = len(self.mc_files)
        print("\t Number of training samples: ", self.num_files)

    def rm_too_short_utt(self, mc_files, min_length=min_length):
        """Return only the files with more than `min_length` frames."""
        new_mc_files = []
        for mcfile in mc_files:
            if np.load(mcfile).shape[0] > min_length:
                new_mc_files.append(mcfile)
        return new_mc_files

    def sample_seg(self, feat, sample_len=min_length):
        """Randomly slice `sample_len` consecutive frames from (T, D) features."""
        assert feat.shape[0] - sample_len >= 0
        s = np.random.randint(0, feat.shape[0] - sample_len + 1)
        return feat[s:s + sample_len, :]

    def __len__(self):
        return self.num_files

    def __getitem__(self, index):
        filename = self.mc_files[index]
        # Filenames look like '<spk>_<utt>.npy'.
        spk = basename(filename).split('_')[0]
        spk_idx = spk2idx[spk]
        mc = np.load(filename)
        mc = self.sample_seg(mc)
        mc = np.transpose(mc, (1, 0))  # (T, D) -> (D, T) for conv layers
        # One-hot speaker condition.
        spk_cat = np.squeeze(to_categorical([spk_idx], num_classes=len(speakers)))
        return torch.FloatTensor(mc), torch.LongTensor([spk_idx]).squeeze_(), torch.FloatTensor(spk_cat)
class TestDataset(object):
    """Test-time dataset: source-speaker wav paths plus src/trg statistics."""
    def __init__(self, data_dir, wav_dir, src_spk='p262', trg_spk='p272'):
        self.src_spk = src_spk
        self.trg_spk = trg_spk
        self.mc_files = sorted(glob.glob(join(data_dir, '{}*.npy'.format(self.src_spk))))
        # Speaker statistics live next to the *training* features.
        stats_dir = data_dir.replace('test', 'train')
        self.src_spk_stats = np.load(join(stats_dir, '{}_stats.npz'.format(src_spk)))
        self.trg_spk_stats = np.load(join(stats_dir, '{}_stats.npz'.format(trg_spk)))
        # Gaussian log-F0 statistics for pitch conversion.
        self.logf0s_mean_src = self.src_spk_stats['log_f0s_mean']
        self.logf0s_std_src = self.src_spk_stats['log_f0s_std']
        self.logf0s_mean_trg = self.trg_spk_stats['log_f0s_mean']
        self.logf0s_std_trg = self.trg_spk_stats['log_f0s_std']
        # MCEP normalisation statistics.
        self.mcep_mean_src = self.src_spk_stats['coded_sps_mean']
        self.mcep_std_src = self.src_spk_stats['coded_sps_std']
        self.mcep_mean_trg = self.trg_spk_stats['coded_sps_mean']
        self.mcep_std_trg = self.trg_spk_stats['coded_sps_std']
        self.src_wav_dir = f'{wav_dir}/{src_spk}'
        # One-hot condition for the target speaker.
        self.spk_idx = spk2idx[trg_spk]
        self.spk_c_trg = to_categorical([self.spk_idx], num_classes=len(speakers))

    def get_batch_test_data(self, batch_size=8):
        """Return wav paths matching the first `batch_size` MCEP files."""
        batch_data = []
        for i in range(batch_size):
            # Filenames look like '<spk>-<utt>.npy'; the wav shares the stem.
            stem = basename(self.mc_files[i]).split('-')[-1]
            batch_data.append(join(self.src_wav_dir, stem.replace('npy', 'wav')))
        return batch_data
def get_loader(data_dir, batch_size=32, mode='train', num_workers=1):
    """Build a DataLoader over the MCEP dataset (shuffling only in train mode)."""
    dataset = MyDataset(data_dir)
    # drop_last keeps every batch at full size for fixed-shape conv inputs.
    return data.DataLoader(dataset=dataset,
                           batch_size=batch_size,
                           shuffle=(mode == 'train'),
                           num_workers=num_workers,
                           drop_last=True)
if __name__ == '__main__':
    # Smoke test: iterate a few batches and print their shapes.
    loader = get_loader('./data/mc/train')
    data_iter = iter(loader)
    for i in range(10):
        # Bug fix: MyDataset.__getitem__ yields 3 items (mc, spk_idx,
        # spk_cat); the original unpacked 4 and raised ValueError.
        mc, spk_idx, spk_cat = next(data_iter)
        print('-' * 50)
        print(mc.size())
        print(spk_idx.size())
        print(spk_cat.size())
        print(spk_idx.squeeze_())
        print(spk_cat)
        print('-' * 50)
================================================
FILE: logger.py
================================================
# import tensorflow as tf
from tensorboardX import SummaryWriter
# class Logger(object):
# """Tensorflow Tensorboard logger."""
# def __init__(self, log_dir):
# """Initialize summary writer."""
# self.writer = tf.summary.FileWriter(log_dir)
# def scalar_summary(self, tag, value, step):
# """Add scalar summary."""
# summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
# self.writer.add_summary(summary, step)
class Logger(object):
    """TensorBoard logger backed by tensorboardX (no TensorFlow dependency)."""
    def __init__(self, log_dir):
        """Initialize summary writer writing event files under `log_dir`."""
        self.writer = SummaryWriter(log_dir)
    def scalar_summary(self, tag, value, step):
        """Log scalar `value` under `tag` at training step `step`."""
        self.writer.add_scalar(tag, value, step)
================================================
FILE: main.py
================================================
import os
import argparse
from solver import Solver
from data_loader import get_loader, TestDataset
from torch.backends import cudnn
def str2bool(v):
    """Parse a command-line flag string into a bool.

    Bug fix: the original used `v.lower() in ('true')`, where `('true')` is
    just the string 'true', so any substring of it (e.g. '', 't', 'rue')
    parsed as True.  Compare against a proper one-element tuple instead.
    """
    return v.lower() in ('true',)
def main(config):
    """Create output directories, build the data loaders, and run the solver."""
    # For fast training (cuDNN autotune for fixed-size inputs).
    cudnn.benchmark = True
    # Create directories if they do not exist — exist_ok avoids the
    # check-then-create race of the original exists()/makedirs() pairs.
    os.makedirs(config.log_dir, exist_ok=True)
    os.makedirs(config.model_save_dir, exist_ok=True)
    os.makedirs(config.sample_dir, exist_ok=True)
    # Data loaders: training batches plus a fixed test pair for
    # training-time conversion checks.
    train_loader = get_loader(config.train_data_dir, config.batch_size, 'train', num_workers=config.num_workers)
    test_loader = TestDataset(config.test_data_dir, config.wav_dir, src_spk='p262', trg_spk='p272')
    # Solver for training and testing StarGAN.
    solver = Solver(train_loader, test_loader, config)
    if config.mode == 'train':
        solver.train()
    # elif config.mode == 'test':
    #     solver.test()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    arg = parser.add_argument  # shorthand for the long option list below

    # Model configuration.
    arg('--num_speakers', type=int, default=10, help='dimension of speaker labels')
    arg('--lambda_cls', type=float, default=10, help='weight for domain classification loss')
    arg('--lambda_rec', type=float, default=10, help='weight for reconstruction loss')
    arg('--lambda_gp', type=float, default=10, help='weight for gradient penalty')
    arg('--sampling_rate', type=int, default=16000, help='sampling rate')

    # Training configuration.
    arg('--batch_size', type=int, default=32, help='mini-batch size')
    arg('--num_iters', type=int, default=200000, help='number of total iterations for training D')
    arg('--num_iters_decay', type=int, default=100000, help='number of iterations for decaying lr')
    arg('--g_lr', type=float, default=0.0001, help='learning rate for G')
    arg('--d_lr', type=float, default=0.0001, help='learning rate for D')
    arg('--n_critic', type=int, default=5, help='number of D updates per each G update')
    arg('--beta1', type=float, default=0.5, help='beta1 for Adam optimizer')
    arg('--beta2', type=float, default=0.999, help='beta2 for Adam optimizer')
    arg('--resume_iters', type=int, default=None, help='resume training from this step')

    # Test configuration.
    arg('--test_iters', type=int, default=100000, help='test model from this step')

    # Miscellaneous.
    arg('--num_workers', type=int, default=1)
    arg('--mode', type=str, default='train', choices=['train', 'test'])
    arg('--use_tensorboard', type=str2bool, default=True)

    # Directories.
    arg('--train_data_dir', type=str, default='./data/mc/train')
    arg('--test_data_dir', type=str, default='./data/mc/test')
    arg('--wav_dir', type=str, default="./data/VCTK-Corpus/wav16")
    arg('--log_dir', type=str, default='./logs')
    arg('--model_save_dir', type=str, default='./models')
    arg('--sample_dir', type=str, default='./samples')

    # Step sizes (in iterations) for logging, sampling, saving and lr decay.
    arg('--log_step', type=int, default=10)
    arg('--sample_step', type=int, default=1000)
    arg('--model_save_step', type=int, default=1000)
    arg('--lr_update_step', type=int, default=1000)

    config = parser.parse_args()
    print(config)
    main(config)
================================================
FILE: model.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# from data_loader import get_loader
class ResidualBlock(nn.Module):
    """Residual block: two 3x3 conv + instance-norm layers with a skip connection."""
    def __init__(self, dim_in, dim_out):
        super(ResidualBlock, self).__init__()
        body = [
            nn.Conv2d(dim_in, dim_out, kernel_size=3, stride=1, padding=1, bias=False),
            nn.InstanceNorm2d(dim_out, affine=True, track_running_stats=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(dim_out, dim_out, kernel_size=3, stride=1, padding=1, bias=False),
            nn.InstanceNorm2d(dim_out, affine=True, track_running_stats=True),
        ]
        self.main = nn.Sequential(*body)

    def forward(self, x):
        # Identity shortcut added to the transformed features.
        return x + self.main(x)
class Generator(nn.Module):
    """Generator network.

    Maps a (B, 1, D, T) MCEP map plus a (B, num_speakers) one-hot speaker
    condition to a converted (B, 1, D, T) MCEP map.
    """
    def __init__(self, conv_dim=64, num_speakers=10, repeat_num=6):
        super(Generator, self).__init__()
        # Input stem: the speaker condition is concatenated as extra channels.
        layers = [
            nn.Conv2d(1 + num_speakers, conv_dim, kernel_size=(3, 9), padding=(1, 4), bias=False),
            nn.InstanceNorm2d(conv_dim, affine=True, track_running_stats=True),
            nn.ReLU(inplace=True),
        ]
        # Two down-sampling stages: halve both axes, double the channels.
        width = conv_dim
        for _ in range(2):
            layers += [
                nn.Conv2d(width, width * 2, kernel_size=(4, 8), stride=(2, 2), padding=(1, 3), bias=False),
                nn.InstanceNorm2d(width * 2, affine=True, track_running_stats=True),
                nn.ReLU(inplace=True),
            ]
            width *= 2
        # Bottleneck residual stack.
        layers += [ResidualBlock(dim_in=width, dim_out=width) for _ in range(repeat_num)]
        # Two up-sampling stages mirroring the down-sampling path.
        for _ in range(2):
            layers += [
                nn.ConvTranspose2d(width, width // 2, kernel_size=4, stride=2, padding=1, bias=False),
                nn.InstanceNorm2d(width // 2, affine=True, track_running_stats=True),
                nn.ReLU(inplace=True),
            ]
            width //= 2
        # Output projection back to a single channel.
        layers.append(nn.Conv2d(width, 1, kernel_size=7, stride=1, padding=3, bias=False))
        self.main = nn.Sequential(*layers)

    def forward(self, x, c):
        # Tile the speaker condition over the spatial grid and concatenate
        # it with the input as extra channels.
        cond = c.view(c.size(0), c.size(1), 1, 1)
        cond = cond.repeat(1, 1, x.size(2), x.size(3))
        return self.main(torch.cat([x, cond], dim=1))
class Discriminator(nn.Module):
    """PatchGAN discriminator with an auxiliary speaker-classification head."""
    def __init__(self, input_size=(36, 256), conv_dim=64, repeat_num=5, num_speakers=10):
        super(Discriminator, self).__init__()
        blocks = [
            nn.Conv2d(1, conv_dim, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.01),
        ]
        channels = conv_dim
        for _ in range(1, repeat_num):
            blocks.append(nn.Conv2d(channels, channels * 2, kernel_size=4, stride=2, padding=1))
            blocks.append(nn.LeakyReLU(0.01))
            channels *= 2
        # After `repeat_num` stride-2 convs each axis shrinks by 2**repeat_num;
        # the head kernels collapse the remaining map to 1x1.
        k0 = int(input_size[0] / np.power(2, repeat_num))  # 1 for the default 36
        k1 = int(input_size[1] / np.power(2, repeat_num))  # 8 for the default 256
        self.main = nn.Sequential(*blocks)
        # Real/fake head — padding must stay 0 so the kernel spans the full map.
        self.conv_dis = nn.Conv2d(channels, 1, kernel_size=(k0, k1), stride=1, padding=0, bias=False)
        # Speaker-classification head.
        self.conv_clf_spks = nn.Conv2d(channels, num_speakers, kernel_size=(k0, k1), stride=1, padding=0, bias=False)

    def forward(self, x):
        feats = self.main(x)
        out_src = self.conv_dis(feats)
        out_cls_spks = self.conv_clf_spks(feats)
        # Flatten the classifier output to (B, num_speakers).
        return out_src, out_cls_spks.view(out_cls_spks.size(0), out_cls_spks.size(1))
if __name__ == '__main__':
    # Smoke test: run a few batches through G and D.
    # Bug fixes vs. the original:
    #  * `get_loader` was never in scope (its import is commented out at the
    #    top of the file), so the script crashed with NameError — import it
    #    locally here.
    #  * MyDataset yields 3 items per sample, not 4.
    #  * Discriminator.forward returns 2 outputs, not 3.
    from data_loader import get_loader
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_loader = get_loader('/scratch/sxliu/data_exp/VCTK-Corpus-22.05k/mc/train', 16, 'train', num_workers=1)
    data_iter = iter(train_loader)
    G = Generator().to(device)
    D = Discriminator().to(device)
    for i in range(10):
        mc_real, spk_label_org, spk_c_org = next(data_iter)
        mc_real.unsqueeze_(1)  # (B, D, T) -> (B, 1, D, T) for conv2d
        mc_real = mc_real.to(device)              # Input mc.
        spk_label_org = spk_label_org.to(device)  # Original spk label indices.
        spk_c_org = spk_c_org.to(device)          # Original spk one-hot conditioning.
        mc_fake = G(mc_real, spk_c_org)
        print(mc_fake.size())
        out_src, out_cls_spks = D(mc_fake)
================================================
FILE: preprocess.py
================================================
import librosa
import numpy as np
import os, sys
import argparse
import pyworld
from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from utils import *
from tqdm import tqdm
from collections import defaultdict
from collections import namedtuple
from sklearn.model_selection import train_test_split
import glob
from os.path import join, basename
import subprocess
def resample(spk, origin_wavpath, target_wavpath):
    """Resample every wav of speaker `spk` to 16 kHz with sox.

    Writes the resampled files under `target_wavpath/<spk>/`.
    Returns 0 so the parallel driver can collect a result per speaker.
    """
    # Perf fix: the output folder is loop-invariant — create it once,
    # not once per file as the original did.
    folder_to = join(target_wavpath, spk)
    os.makedirs(folder_to, exist_ok=True)
    wavfiles = [i for i in os.listdir(join(origin_wavpath, spk)) if i.endswith(".wav")]
    for wav in wavfiles:
        wav_to = join(folder_to, wav)
        wav_from = join(origin_wavpath, spk, wav)
        # List-form argv (shell=False): safe against shell metacharacters in paths.
        subprocess.call(['sox', wav_from, "-r", "16000", wav_to])
    return 0
def resample_to_16k(origin_wavpath, target_wavpath, num_workers=1):
    """Resample every speaker folder under `origin_wavpath` to 16 kHz in parallel."""
    os.makedirs(target_wavpath, exist_ok=True)
    spk_folders = os.listdir(origin_wavpath)
    print(f"> Using {num_workers} workers!")
    # Bug fix: the original never shut the executor down; the context
    # manager guarantees worker processes are reaped even on error.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(partial(resample, spk, origin_wavpath, target_wavpath))
                   for spk in spk_folders]
        result_list = [future.result() for future in tqdm(futures)]
    print(result_list)
def split_data(paths):
    """Deterministically split wav paths into 90% train / 10% test lists."""
    indices = np.arange(len(paths))
    # Fixed random_state keeps the split reproducible across runs.
    train_idx, test_idx = train_test_split(indices, test_size=0.1, random_state=1234)
    path_arr = np.array(paths)
    return list(path_arr[train_idx]), list(path_arr[test_idx])
def get_spk_world_feats(spk_fold_path, mc_dir_train, mc_dir_test, sample_rate=16000):
    """Extract WORLD features for one speaker and save normalised MCEPs.

    Computes the speaker's log-F0 and MCEP statistics on the training split,
    saves them as '<spk>_stats.npz' under `mc_dir_train`, then writes the
    per-utterance normalised MCEP arrays for both splits.  Returns 0 so the
    parallel driver can collect a result per speaker.
    """
    wav_paths = glob.glob(join(spk_fold_path, '*.wav'))
    spk_name = basename(spk_fold_path)
    train_paths, test_paths = split_data(wav_paths)

    # First pass over the training split: gather F0 and MCEP trajectories
    # to compute the speaker-level normalisation statistics.
    f0s, coded_sps = [], []
    for wav_file in train_paths:
        f0, _, _, _, coded_sp = world_encode_wav(wav_file, fs=sample_rate)
        f0s.append(f0)
        coded_sps.append(coded_sp)
    log_f0s_mean, log_f0s_std = logf0_statistics(f0s)
    coded_sps_mean, coded_sps_std = coded_sp_statistics(coded_sps)
    np.savez(join(mc_dir_train, spk_name + '_stats.npz'),
             log_f0s_mean=log_f0s_mean,
             log_f0s_std=log_f0s_std,
             coded_sps_mean=coded_sps_mean,
             coded_sps_std=coded_sps_std)

    # Second pass: save normalised MCEPs for the train then the test split.
    for paths, out_dir in ((train_paths, mc_dir_train), (test_paths, mc_dir_test)):
        for wav_file in tqdm(paths):
            wav_nam = basename(wav_file)
            f0, timeaxis, sp, ap, coded_sp = world_encode_wav(wav_file, fs=sample_rate)
            normed_coded_sp = normalize_coded_sp(coded_sp, coded_sps_mean, coded_sps_std)
            np.save(join(out_dir, wav_nam.replace('.wav', '.npy')), normed_coded_sp, allow_pickle=False)
    return 0
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    sample_rate_default = 16000
    origin_wavpath_default = "./data/VCTK-Corpus/wav48"
    target_wavpath_default = "./data/VCTK-Corpus/wav16"
    mc_dir_train_default = './data/mc/train'
    mc_dir_test_default = './data/mc/test'
    # Consistency fix: use sample_rate_default (same value) like the other options.
    parser.add_argument("--sample_rate", type=int, default=sample_rate_default, help="Sample rate.")
    parser.add_argument("--origin_wavpath", type=str, default=origin_wavpath_default, help="The original wav path to resample.")
    # Bug fix: the help text was copy-pasted from --origin_wavpath.
    parser.add_argument("--target_wavpath", type=str, default=target_wavpath_default, help="The target wav path to store resampled audio.")
    parser.add_argument("--mc_dir_train", type=str, default=mc_dir_train_default, help="The directory to store the training features.")
    parser.add_argument("--mc_dir_test", type=str, default=mc_dir_test_default, help="The directory to store the testing features.")
    parser.add_argument("--num_workers", type=int, default=None, help="The number of cpus to use.")
    argv = parser.parse_args()
    sample_rate = argv.sample_rate
    origin_wavpath = argv.origin_wavpath
    target_wavpath = argv.target_wavpath
    mc_dir_train = argv.mc_dir_train
    mc_dir_test = argv.mc_dir_test
    num_workers = argv.num_workers if argv.num_workers is not None else cpu_count()
    # The original VCTK wavs are 48 kHz; resample everything to 16 kHz first.
    resample_to_16k(origin_wavpath, target_wavpath, num_workers=num_workers)
    # Only these 10 speakers are used for this experiment.
    speaker_used = ['262', '272', '229', '232', '292', '293', '360', '361', '248', '251']
    speaker_used = ['p' + i for i in speaker_used]
    # Extract the acoustic features (MCEPs, log-F0) and the per-speaker
    # statistics (means, stds).
    os.makedirs(mc_dir_train, exist_ok=True)
    os.makedirs(mc_dir_test, exist_ok=True)
    num_workers = len(speaker_used)  # one worker per speaker
    print("number of workers: ", num_workers)
    work_dir = target_wavpath
    # Bug fix: the original never shut the executor down; the context
    # manager guarantees worker processes are reaped even on error.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        for spk in speaker_used:
            spk_path = os.path.join(work_dir, spk)
            futures.append(executor.submit(partial(get_spk_world_feats, spk_path, mc_dir_train, mc_dir_test, sample_rate)))
        result_list = [future.result() for future in tqdm(futures)]
    print(result_list)
    sys.exit(0)
================================================
FILE: solver.py
================================================
from model import Generator
from model import Discriminator
from torch.autograd import Variable
from torchvision.utils import save_image
import torch
import torch.nn.functional as F
import numpy as np
import os
from os.path import join, basename, dirname, split
import time
import datetime
from data_loader import to_categorical
import librosa
from utils import *
from tqdm import tqdm
class Solver(object):
"""Solver for training and testing StarGAN."""
def __init__(self, train_loader, test_loader, config):
    """Initialize configurations.

    Copies every needed hyper-parameter and path off the argparse `config`
    namespace onto the solver, then builds the models (and, optionally,
    the tensorboard logger).
    """
    # Data loader.
    self.train_loader = train_loader
    self.test_loader = test_loader
    self.sampling_rate = config.sampling_rate
    # Model configurations.
    self.num_speakers = config.num_speakers
    self.lambda_cls = config.lambda_cls  # weight of the domain-classification loss
    self.lambda_rec = config.lambda_rec  # weight of the cycle-reconstruction loss
    self.lambda_gp = config.lambda_gp    # weight of the WGAN gradient penalty
    # Training configurations.
    self.batch_size = config.batch_size
    self.num_iters = config.num_iters
    self.num_iters_decay = config.num_iters_decay
    self.g_lr = config.g_lr
    self.d_lr = config.d_lr
    self.n_critic = config.n_critic      # D updates per G update
    self.beta1 = config.beta1
    self.beta2 = config.beta2
    self.resume_iters = config.resume_iters
    # Test configurations.
    self.test_iters = config.test_iters
    # Miscellaneous.
    self.use_tensorboard = config.use_tensorboard
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Directories.
    self.log_dir = config.log_dir
    self.sample_dir = config.sample_dir
    self.model_save_dir = config.model_save_dir
    # Step size.
    self.log_step = config.log_step
    self.sample_step = config.sample_step
    self.model_save_step = config.model_save_step
    self.lr_update_step = config.lr_update_step
    # Build the model and tensorboard.
    self.build_model()
    if self.use_tensorboard:
        self.build_tensorboard()
def build_model(self):
    """Instantiate G and D with their Adam optimizers and move them to the device."""
    self.G = Generator(num_speakers=self.num_speakers)
    self.D = Discriminator(num_speakers=self.num_speakers)
    betas = [self.beta1, self.beta2]
    self.g_optimizer = torch.optim.Adam(self.G.parameters(), self.g_lr, betas)
    self.d_optimizer = torch.optim.Adam(self.D.parameters(), self.d_lr, betas)
    self.print_network(self.G, 'G')
    self.print_network(self.D, 'D')
    self.G.to(self.device)
    self.D.to(self.device)
def print_network(self, model, name):
"""Print out the network information."""
num_params = 0
for p in model.parameters():
num_params += p.numel()
print(model)
print(name)
print("The number of parameters: {}".format(num_params))
def restore_model(self, resume_iters):
    """Load G/D weights saved at training step `resume_iters`."""
    print('Loading the trained models from step {}...'.format(resume_iters))
    g_ckpt = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(resume_iters))
    d_ckpt = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(resume_iters))
    # map_location keeps CPU-only machines able to load GPU-saved checkpoints.
    self.G.load_state_dict(torch.load(g_ckpt, map_location=lambda storage, loc: storage))
    self.D.load_state_dict(torch.load(d_ckpt, map_location=lambda storage, loc: storage))
def build_tensorboard(self):
    """Build a tensorboard logger.

    The import is local so tensorboardX is only required when
    `use_tensorboard` is enabled.
    """
    from logger import Logger
    self.logger = Logger(self.log_dir)
def update_lr(self, g_lr, d_lr):
"""Decay learning rates of the generator and discriminator."""
for param_group in self.g_optimizer.param_groups:
param_group['lr'] = g_lr
for param_group in self.d_optimizer.param_groups:
param_group['lr'] = d_lr
def reset_grad(self):
"""Reset the gradient buffers."""
self.g_optimizer.zero_grad()
self.d_optimizer.zero_grad()
def denorm(self, x):
"""Convert the range from [-1, 1] to [0, 1]."""
out = (x + 1) / 2
return out.clamp_(0, 1)
def gradient_penalty(self, y, x):
"""Compute gradient penalty: (L2_norm(dy/dx) - 1)**2."""
weight = torch.ones(y.size()).to(self.device)
dydx = torch.autograd.grad(outputs=y,
inputs=x,
grad_outputs=weight,
retain_graph=True,
create_graph=True,
only_inputs=True)[0]
dydx = dydx.view(dydx.size(0), -1)
dydx_l2norm = torch.sqrt(torch.sum(dydx**2, dim=1))
return torch.mean((dydx_l2norm-1)**2)
def label2onehot(self, labels, dim):
    """Turn a batch of integer labels into one-hot float vectors of width `dim`."""
    onehot = torch.zeros(labels.size(0), dim)
    onehot.scatter_(1, labels.long().unsqueeze(1), 1)
    return onehot
def sample_spk_c(self, size):
    """Draw `size` random speaker ids; return (LongTensor ids, FloatTensor one-hots)."""
    ids = np.random.randint(0, self.num_speakers, size=size)
    onehots = to_categorical(ids, self.num_speakers)
    return torch.LongTensor(ids), torch.FloatTensor(onehots)
def classification_loss(self, logit, target):
    """Softmax cross-entropy between logits and integer class targets."""
    return torch.nn.functional.cross_entropy(logit, target)
def load_wav(self, wavfile, sr=16000):
    """Load `wavfile` as mono audio at `sr` Hz, padded for WORLD framing.

    Fix: forward `sr` to `wav_padding` instead of the previously hard-coded
    16000 (the original's own TODO), so non-default sample rates pad with the
    matching frame size. Behavior is unchanged for the default sr=16000.
    """
    wav, _ = librosa.load(wavfile, sr=sr, mono=True)
    return wav_padding(wav, sr=sr, frame_period=5, multiple=4)
def train(self):
    """Train StarGAN-VC.

    Each iteration updates the discriminator (adversarial + speaker
    classification + gradient-penalty losses); every `n_critic` iterations
    the generator is updated (adversarial + cycle-reconstruction + speaker
    classification losses). Periodically logs losses, synthesizes
    training-time conversion samples from the test set, saves checkpoints,
    and linearly decays the learning rates.
    """
    # Set data loader.
    train_loader = self.train_loader
    data_iter = iter(train_loader)

    # Read a batch of test data for training-time conversion checks.
    test_wavfiles = self.test_loader.get_batch_test_data(batch_size=4)
    test_wavs = [self.load_wav(wavfile) for wavfile in test_wavfiles]

    # Copy-synthesize reference wavs once, the first time samples are made.
    cpsyn_flag = [True, False][0]

    # Learning rate cache for decaying.
    g_lr = self.g_lr
    d_lr = self.d_lr

    # Start training from scratch or resume training.
    start_iters = 0
    if self.resume_iters:
        print("resuming step %d ..." % self.resume_iters)
        start_iters = self.resume_iters
        self.restore_model(self.resume_iters)

    # Start training.
    print('Start training...')
    start_time = time.time()
    for i in range(start_iters, self.num_iters):

        # =================================================================== #
        #                     1. Preprocess input data                        #
        # =================================================================== #

        # Fetch real mc features and speaker labels; restart the iterator at
        # the end of an epoch. Bug fix: catch StopIteration specifically —
        # the original bare `except:` also swallowed KeyboardInterrupt and
        # genuine data-loading errors.
        try:
            mc_real, spk_label_org, spk_c_org = next(data_iter)
        except StopIteration:
            data_iter = iter(train_loader)
            mc_real, spk_label_org, spk_c_org = next(data_iter)

        mc_real.unsqueeze_(1)  # (B, D, T) -> (B, 1, D, T) for conv2d

        # Generate target domain labels randomly.
        # spk_label_trg: int, spk_c_trg: one-hot representation
        spk_label_trg, spk_c_trg = self.sample_spk_c(mc_real.size(0))

        mc_real = mc_real.to(self.device)              # Input mc.
        spk_label_org = spk_label_org.to(self.device)  # Original spk labels.
        spk_c_org = spk_c_org.to(self.device)          # Original spk conditioning.
        spk_label_trg = spk_label_trg.to(self.device)  # Target spk labels for G's cls loss.
        spk_c_trg = spk_c_trg.to(self.device)          # Target spk conditioning.

        # =================================================================== #
        #                     2. Train the discriminator                      #
        # =================================================================== #

        # Compute loss with real mc feats.
        out_src, out_cls_spks = self.D(mc_real)
        d_loss_real = - torch.mean(out_src)
        d_loss_cls_spks = self.classification_loss(out_cls_spks, spk_label_org)

        # Compute loss with fake mc feats (detached: only D updates here).
        mc_fake = self.G(mc_real, spk_c_trg)
        out_src, out_cls_spks = self.D(mc_fake.detach())
        d_loss_fake = torch.mean(out_src)

        # Gradient penalty on a random interpolate of real and fake (WGAN-GP).
        alpha = torch.rand(mc_real.size(0), 1, 1, 1).to(self.device)
        x_hat = (alpha * mc_real.data + (1 - alpha) * mc_fake.data).requires_grad_(True)
        out_src, _ = self.D(x_hat)
        d_loss_gp = self.gradient_penalty(out_src, x_hat)

        # Backward and optimize.
        d_loss = d_loss_real + d_loss_fake + self.lambda_cls * d_loss_cls_spks + self.lambda_gp * d_loss_gp
        self.reset_grad()
        d_loss.backward()
        self.d_optimizer.step()

        # Logging.
        loss = {}
        loss['D/loss_real'] = d_loss_real.item()
        loss['D/loss_fake'] = d_loss_fake.item()
        loss['D/loss_cls_spks'] = d_loss_cls_spks.item()
        loss['D/loss_gp'] = d_loss_gp.item()

        # =================================================================== #
        #                       3. Train the generator                        #
        # =================================================================== #

        if (i+1) % self.n_critic == 0:
            # Original-to-target domain.
            mc_fake = self.G(mc_real, spk_c_trg)
            out_src, out_cls_spks = self.D(mc_fake)
            g_loss_fake = - torch.mean(out_src)
            g_loss_cls_spks = self.classification_loss(out_cls_spks, spk_label_trg)

            # Target-to-original domain (cycle-consistency reconstruction).
            mc_reconst = self.G(mc_fake, spk_c_org)
            g_loss_rec = torch.mean(torch.abs(mc_real - mc_reconst))

            # Backward and optimize.
            g_loss = g_loss_fake + self.lambda_rec * g_loss_rec + self.lambda_cls * g_loss_cls_spks
            self.reset_grad()
            g_loss.backward()
            self.g_optimizer.step()

            # Logging.
            loss['G/loss_fake'] = g_loss_fake.item()
            loss['G/loss_rec'] = g_loss_rec.item()
            loss['G/loss_cls_spks'] = g_loss_cls_spks.item()

        # =================================================================== #
        #                         4. Miscellaneous                            #
        # =================================================================== #

        # Print out training information.
        if (i+1) % self.log_step == 0:
            et = time.time() - start_time
            et = str(datetime.timedelta(seconds=et))[:-7]
            log = "Elapsed [{}], Iteration [{}/{}]".format(et, i+1, self.num_iters)
            for tag, value in loss.items():
                log += ", {}: {:.4f}".format(tag, value)
            print(log)

            if self.use_tensorboard:
                for tag, value in loss.items():
                    self.logger.scalar_summary(tag, value, i+1)

        # Convert the held-out test wavs with the current G and write samples.
        if (i+1) % self.sample_step == 0:
            sampling_rate = 16000
            num_mcep = 36
            frame_period = 5
            with torch.no_grad():
                for idx, wav in tqdm(enumerate(test_wavs)):
                    wav_name = basename(test_wavfiles[idx])
                    f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate, frame_period=frame_period)
                    # Log-Gaussian f0 transform from source to target speaker stats.
                    f0_converted = pitch_conversion(f0=f0,
                        mean_log_src=self.test_loader.logf0s_mean_src, std_log_src=self.test_loader.logf0s_std_src,
                        mean_log_target=self.test_loader.logf0s_mean_trg, std_log_target=self.test_loader.logf0s_std_trg)
                    coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_mcep)
                    # Normalize with source stats, convert, denormalize with target stats.
                    coded_sp_norm = (coded_sp - self.test_loader.mcep_mean_src) / self.test_loader.mcep_std_src
                    coded_sp_norm_tensor = torch.FloatTensor(coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(self.device)
                    conds = torch.FloatTensor(self.test_loader.spk_c_trg).to(self.device)
                    coded_sp_converted_norm = self.G(coded_sp_norm_tensor, conds).data.cpu().numpy()
                    coded_sp_converted = np.squeeze(coded_sp_converted_norm).T * self.test_loader.mcep_std_trg + self.test_loader.mcep_mean_trg
                    # pyworld requires C-contiguous float arrays.
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    wav_transformed = world_speech_synthesis(f0=f0_converted, coded_sp=coded_sp_converted,
                                                             ap=ap, fs=sampling_rate, frame_period=frame_period)
                    librosa.output.write_wav(
                        join(self.sample_dir, str(i+1)+'-'+wav_name.split('.')[0]+'-vcto-{}'.format(self.test_loader.trg_spk)+'.wav'), wav_transformed, sampling_rate)
                    if cpsyn_flag:
                        # One-off copy-synthesis references for quality comparison.
                        wav_cpsyn = world_speech_synthesis(f0=f0, coded_sp=coded_sp,
                                                           ap=ap, fs=sampling_rate, frame_period=frame_period)
                        librosa.output.write_wav(join(self.sample_dir, 'cpsyn-'+wav_name), wav_cpsyn, sampling_rate)
                cpsyn_flag = False

        # Save model checkpoints.
        if (i+1) % self.model_save_step == 0:
            G_path = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(i+1))
            D_path = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(i+1))
            torch.save(self.G.state_dict(), G_path)
            torch.save(self.D.state_dict(), D_path)
            print('Saved model checkpoints into {}...'.format(self.model_save_dir))

        # Decay learning rates (linear decay over the final num_iters_decay steps).
        if (i+1) % self.lr_update_step == 0 and (i+1) > (self.num_iters - self.num_iters_decay):
            g_lr -= (self.g_lr / float(self.num_iters_decay))
            d_lr -= (self.d_lr / float(self.num_iters_decay))
            self.update_lr(g_lr, d_lr)
            print('Decayed learning rates, g_lr: {}, d_lr: {}.'.format(g_lr, d_lr))
================================================
FILE: utils.py
================================================
import librosa
import numpy as np
import os
import pyworld
def load_wav(wav_file, sr):
    """Read `wav_file` as mono audio resampled to `sr`; return the samples."""
    samples, _ = librosa.load(wav_file, sr=sr, mono=True)
    return samples
def world_decompose(wav, fs, frame_period=5.0):
    """Decompose speech into (f0, timeaxis, sp, ap) with the WORLD vocoder."""
    samples = wav.astype(np.float64)  # pyworld operates on float64 audio
    f0, timeaxis = pyworld.harvest(samples, fs, frame_period=frame_period,
                                   f0_floor=71.0, f0_ceil=800.0)
    sp = pyworld.cheaptrick(samples, f0, timeaxis, fs)
    ap = pyworld.d4c(samples, f0, timeaxis, fs)
    return f0, timeaxis, sp, ap
def world_encode_spectral_envelop(sp, fs, dim=36):
    """Compress a WORLD spectral envelope into `dim` MCEP coefficients."""
    return pyworld.code_spectral_envelope(sp, fs, dim)
def world_decode_spectral_envelop(coded_sp, fs):
    """Expand MCEP coefficients back to a full spectral envelope."""
    fft_size = pyworld.get_cheaptrick_fft_size(fs)
    return pyworld.decode_spectral_envelope(coded_sp, fs, fft_size)
def world_encode_wav(wav_file, fs, frame_period=5.0, coded_dim=36):
    """Load a wav and return its WORLD features plus `coded_dim`-dim MCEPs."""
    audio = load_wav(wav_file, sr=fs)
    f0, timeaxis, sp, ap = world_decompose(wav=audio, fs=fs, frame_period=frame_period)
    coded_sp = world_encode_spectral_envelop(sp=sp, fs=fs, dim=coded_dim)
    return f0, timeaxis, sp, ap, coded_sp
def world_speech_synthesis(f0, coded_sp, ap, fs, frame_period):
    """Synthesize a waveform from WORLD features (f0, MCEPs, aperiodicity).

    Bug fix: the original trimmed `coded_sp` to the common length, but
    `coded_sp` is never used after decoding — the UNtrimmed `decoded_sp`
    was handed to pyworld.synthesize, so the streams could be misaligned
    whenever their frame counts differed. The decoded envelope is trimmed
    instead; f0/ap trimming is unchanged.
    """
    decoded_sp = world_decode_spectral_envelop(coded_sp, fs)
    # Align all three feature streams to the shortest frame count.
    min_len = min(len(f0), len(decoded_sp), len(ap))
    f0 = f0[:min_len]
    decoded_sp = decoded_sp[:min_len]
    ap = ap[:min_len]
    wav = pyworld.synthesize(f0, decoded_sp, ap, fs, frame_period)
    # Librosa could not save wav if not doing so
    wav = wav.astype(np.float32)
    return wav
def world_synthesis_data(f0s, coded_sps, aps, fs, frame_period):
    """Synthesize one waveform per (f0, coded_sp, ap) triple.

    Bug fix: the original loop bound the variable as `decoded_sp` but the
    call passed the undefined name `coded_sp`, raising NameError on first
    use. The loop variable is now named `coded_sp` to match the call and
    the actual content (MCEPs, decoded inside world_speech_synthesis).
    """
    wavs = list()
    for f0, coded_sp, ap in zip(f0s, coded_sps, aps):
        wav = world_speech_synthesis(f0, coded_sp, ap, fs, frame_period)
        wavs.append(wav)
    return wavs
def coded_sps_normalization_fit_transoform(coded_sps):
    """Z-normalize each (D, T) MCEP array using stats pooled over all frames.

    Returns (normalized list, per-dimension mean, per-dimension std).
    """
    pooled = np.concatenate(coded_sps, axis=1)
    mean = np.mean(pooled, axis=1, keepdims=True)
    std = np.std(pooled, axis=1, keepdims=True)
    normalized = [(sp - mean) / std for sp in coded_sps]
    return normalized, mean, std
def coded_sp_statistics(coded_sps):
    """Per-dimension mean and std over a list of (T, D) MCEP arrays."""
    stacked = np.concatenate(coded_sps, axis=0)
    mean = np.mean(stacked, axis=0, keepdims=False)
    std = np.std(stacked, axis=0, keepdims=False)
    return mean, std
def normalize_coded_sp(coded_sp, coded_sp_mean, coded_sp_std):
    """Apply z-normalization with precomputed statistics."""
    return (coded_sp - coded_sp_mean) / coded_sp_std
def coded_sps_normalization_transoform(coded_sps, coded_sps_mean, coded_sps_std):
    """Normalize each array with the given (already fitted) mean and std."""
    return [(sp - coded_sps_mean) / coded_sps_std for sp in coded_sps]
def coded_sps_normalization_inverse_transoform(normalized_coded_sps, coded_sps_mean, coded_sps_std):
    """Undo z-normalization: x * std + mean for each array in the list."""
    return [normed * coded_sps_std + coded_sps_mean for normed in normalized_coded_sps]
def coded_sp_padding(coded_sp, multiple=4):
    """Zero-pad a (D, T) array along time so T becomes a multiple of `multiple`.

    Padding is split as evenly as possible between the two ends (extra
    frame goes on the right).
    """
    num_frames = coded_sp.shape[1]
    target = int(np.ceil(num_frames / multiple)) * multiple
    diff = target - num_frames
    left = diff // 2
    right = diff - left
    return np.pad(coded_sp, ((0, 0), (left, right)), 'constant', constant_values=0)
def wav_padding(wav, sr, frame_period, multiple=4):
    """Pad a 1-D waveform so WORLD analysis yields a frame count divisible
    by `multiple`, splitting the zeros evenly between both ends."""
    assert wav.ndim == 1
    samples_per_frame = sr * frame_period / 1000
    n = len(wav)
    # Same target-length formula as the original, factored for readability.
    padded_len = int((np.ceil((np.floor(n / samples_per_frame) + 1) / multiple + 1)
                      * multiple - 1) * samples_per_frame)
    diff = padded_len - n
    left = diff // 2
    return np.pad(wav, (left, diff - left), 'constant', constant_values=0)
def logf0_statistics(f0s):
    """Mean and std of log-f0 pooled over all frames of all utterances.

    np.ma.log masks non-positive entries, so unvoiced frames (f0 == 0)
    are excluded from the statistics.
    """
    stacked = np.ma.log(np.concatenate(f0s))
    return stacked.mean(), stacked.std()
def pitch_conversion(f0, mean_log_src, std_log_src, mean_log_target, std_log_target):
    """Map f0 from source to target speaker by log-Gaussian normalization."""
    z = (np.ma.log(f0) - mean_log_src) / std_log_src
    return np.exp(z * std_log_target + mean_log_target)
def wavs_to_specs(wavs, n_fft=1024, hop_length=None):
    """Compute an STFT for each waveform in `wavs`."""
    return [librosa.stft(w, n_fft=n_fft, hop_length=hop_length) for w in wavs]
def wavs_to_mfccs(wavs, sr, n_fft=1024, hop_length=None, n_mels=128, n_mfcc=24):
    """Compute an MFCC matrix for each waveform in `wavs`."""
    return [librosa.feature.mfcc(y=w, sr=sr, n_fft=n_fft, hop_length=hop_length,
                                 n_mels=n_mels, n_mfcc=n_mfcc) for w in wavs]
def mfccs_normalization(mfccs):
    """Z-normalize MFCC matrices using stats pooled over all frames.

    Returns (normalized list, per-coefficient mean, per-coefficient std).
    """
    pooled = np.concatenate(mfccs, axis=1)
    mean = np.mean(pooled, axis=1, keepdims=True)
    std = np.std(pooled, axis=1, keepdims=True)
    return [(m - mean) / std for m in mfccs], mean, std
def sample_train_data(dataset_A, dataset_B, n_frames=128):
    """Randomly pair utterances from two datasets and crop `n_frames` segments.

    Each dataset is shuffled independently; for every pair a random
    time-window of `n_frames` columns is cut from each (D, T) array.
    Returns two stacked arrays of shape (num_pairs, D, n_frames).
    """
    num_samples = min(len(dataset_A), len(dataset_B))
    idx_A = np.arange(len(dataset_A))
    idx_B = np.arange(len(dataset_B))
    np.random.shuffle(idx_A)
    np.random.shuffle(idx_B)
    segments_A = list()
    segments_B = list()
    for ia, ib in zip(idx_A[:num_samples], idx_B[:num_samples]):
        data_A = dataset_A[ia]
        total_A = data_A.shape[1]
        assert total_A >= n_frames
        start_A = np.random.randint(total_A - n_frames + 1)
        segments_A.append(data_A[:, start_A:start_A + n_frames])
        data_B = dataset_B[ib]
        total_B = data_B.shape[1]
        assert total_B >= n_frames
        start_B = np.random.randint(total_B - n_frames + 1)
        segments_B.append(data_B[:, start_B:start_B + n_frames])
    return np.array(segments_A), np.array(segments_B)
gitextract_6c839ap3/ ├── README.md ├── convert.py ├── converted_samples/ │ └── Readme ├── data_loader.py ├── logger.py ├── main.py ├── model.py ├── preprocess.py ├── solver.py └── utils.py
SYMBOL INDEX (69 symbols across 8 files)
FILE: convert.py
class TestDataset (line 31) | class TestDataset(object):
method __init__ (line 33) | def __init__(self, config):
method get_batch_test_data (line 60) | def get_batch_test_data(self, batch_size=4):
function load_wav (line 70) | def load_wav(wavfile, sr=16000):
function test (line 75) | def test(config):
FILE: data_loader.py
function to_categorical (line 25) | def to_categorical(y, num_classes=None):
class MyDataset (line 51) | class MyDataset(data.Dataset):
method __init__ (line 53) | def __init__(self, data_dir):
method rm_too_short_utt (line 65) | def rm_too_short_utt(self, mc_files, min_length=min_length):
method sample_seg (line 73) | def sample_seg(self, feat, sample_len=min_length):
method __len__ (line 78) | def __len__(self):
method __getitem__ (line 81) | def __getitem__(self, index):
class TestDataset (line 94) | class TestDataset(object):
method __init__ (line 96) | def __init__(self, data_dir, wav_dir, src_spk='p262', trg_spk='p272'):
method get_batch_test_data (line 117) | def get_batch_test_data(self, batch_size=8):
function get_loader (line 126) | def get_loader(data_dir, batch_size=32, mode='train', num_workers=1):
FILE: logger.py
class Logger (line 16) | class Logger(object):
method __init__ (line 19) | def __init__(self, log_dir):
method scalar_summary (line 23) | def scalar_summary(self, tag, value, step):
FILE: main.py
function str2bool (line 8) | def str2bool(v):
function main (line 11) | def main(config):
FILE: model.py
class ResidualBlock (line 8) | class ResidualBlock(nn.Module):
method __init__ (line 10) | def __init__(self, dim_in, dim_out):
method forward (line 19) | def forward(self, x):
class Generator (line 22) | class Generator(nn.Module):
method __init__ (line 24) | def __init__(self, conv_dim=64, num_speakers=10, repeat_num=6):
method forward (line 54) | def forward(self, x, c):
class Discriminator (line 61) | class Discriminator(nn.Module):
method __init__ (line 63) | def __init__(self, input_size=(36, 256), conv_dim=64, repeat_num=5, nu...
method forward (line 81) | def forward(self, x):
FILE: preprocess.py
function resample (line 18) | def resample(spk, origin_wavpath, target_wavpath):
function resample_to_16k (line 28) | def resample_to_16k(origin_wavpath, target_wavpath, num_workers=1):
function split_data (line 39) | def split_data(paths):
function get_spk_world_feats (line 47) | def get_spk_world_feats(spk_fold_path, mc_dir_train, mc_dir_test, sample...
FILE: solver.py
class Solver (line 18) | class Solver(object):
method __init__ (line 21) | def __init__(self, train_loader, test_loader, config):
method build_model (line 69) | def build_model(self):
method print_network (line 82) | def print_network(self, model, name):
method restore_model (line 91) | def restore_model(self, resume_iters):
method build_tensorboard (line 99) | def build_tensorboard(self):
method update_lr (line 104) | def update_lr(self, g_lr, d_lr):
method reset_grad (line 111) | def reset_grad(self):
method denorm (line 116) | def denorm(self, x):
method gradient_penalty (line 121) | def gradient_penalty(self, y, x):
method label2onehot (line 135) | def label2onehot(self, labels, dim):
method sample_spk_c (line 142) | def sample_spk_c(self, size):
method classification_loss (line 147) | def classification_loss(self, logit, target):
method load_wav (line 151) | def load_wav(self, wavfile, sr=16000):
method train (line 155) | def train(self):
FILE: utils.py
function load_wav (line 7) | def load_wav(wav_file, sr):
function world_decompose (line 11) | def world_decompose(wav, fs, frame_period = 5.0):
function world_encode_spectral_envelop (line 19) | def world_encode_spectral_envelop(sp, fs, dim=36):
function world_decode_spectral_envelop (line 25) | def world_decode_spectral_envelop(coded_sp, fs):
function world_encode_wav (line 31) | def world_encode_wav(wav_file, fs, frame_period=5.0, coded_dim=36):
function world_speech_synthesis (line 37) | def world_speech_synthesis(f0, coded_sp, ap, fs, frame_period):
function world_synthesis_data (line 49) | def world_synthesis_data(f0s, coded_sps, aps, fs, frame_period):
function coded_sps_normalization_fit_transoform (line 56) | def coded_sps_normalization_fit_transoform(coded_sps):
function coded_sp_statistics (line 65) | def coded_sp_statistics(coded_sps):
function normalize_coded_sp (line 72) | def normalize_coded_sp(coded_sp, coded_sp_mean, coded_sp_std):
function coded_sps_normalization_transoform (line 76) | def coded_sps_normalization_transoform(coded_sps, coded_sps_mean, coded_...
function coded_sps_normalization_inverse_transoform (line 84) | def coded_sps_normalization_inverse_transoform(normalized_coded_sps, cod...
function coded_sp_padding (line 92) | def coded_sp_padding(coded_sp, multiple = 4):
function wav_padding (line 102) | def wav_padding(wav, sr, frame_period, multiple = 4):
function logf0_statistics (line 114) | def logf0_statistics(f0s):
function pitch_conversion (line 121) | def pitch_conversion(f0, mean_log_src, std_log_src, mean_log_target, std...
function wavs_to_specs (line 128) | def wavs_to_specs(wavs, n_fft = 1024, hop_length = None):
function wavs_to_mfccs (line 138) | def wavs_to_mfccs(wavs, sr, n_fft = 1024, hop_length = None, n_mels = 12...
function mfccs_normalization (line 148) | def mfccs_normalization(mfccs):
function sample_train_data (line 161) | def sample_train_data(dataset_A, dataset_B, n_frames = 128):
Condensed preview — 10 files, each entry showing the file's path, character count, and a short content snippet. Download the .json file, or copy the text, to obtain the full structured content (55K chars).
[
{
"path": "README.md",
"chars": 2162,
"preview": "# StarGAN-Voice-Conversion\nThis is a pytorch implementation of the paper: StarGAN-VC: Non-parallel many-to-many voice co"
},
{
"path": "convert.py",
"chars": 6952,
"preview": "import argparse\nfrom model import Generator\nfrom torch.autograd import Variable\nimport torch\nimport torch.nn.functional "
},
{
"path": "converted_samples/Readme",
"chars": 198,
"preview": "cpsyn stands for copy-synthesis for references.\n\n\nThese converted samples were obtained from not-well-fine-tuned model. "
},
{
"path": "data_loader.py",
"chars": 5997,
"preview": "from torch.utils import data\nimport torch\nimport os\nimport random\nimport glob\nfrom os.path import join, basename, dirnam"
},
{
"path": "logger.py",
"chars": 802,
"preview": "# import tensorflow as tf\nfrom tensorboardX import SummaryWriter\n\n# class Logger(object):\n# \"\"\"Tensorflow Tensorboar"
},
{
"path": "main.py",
"chars": 3729,
"preview": "import os\nimport argparse\nfrom solver import Solver\nfrom data_loader import get_loader, TestDataset\nfrom torch.backends "
},
{
"path": "model.py",
"chars": 4783,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport numpy as np\n# from data_loader import get_load"
},
{
"path": "preprocess.py",
"chars": 5756,
"preview": "import librosa\nimport numpy as np\nimport os, sys\nimport argparse\nimport pyworld\nfrom multiprocessing import cpu_count\nfr"
},
{
"path": "solver.py",
"chars": 15305,
"preview": "from model import Generator\nfrom model import Discriminator\nfrom torch.autograd import Variable\nfrom torchvision.utils i"
},
{
"path": "utils.py",
"chars": 7135,
"preview": "import librosa\nimport numpy as np\nimport os\nimport pyworld\n\n\ndef load_wav(wav_file, sr):\n wav, _ = librosa.load(wav_f"
}
]
About this extraction
This page contains the full source code of the liusongxiang/StarGAN-Voice-Conversion GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 10 files (51.6 KB), approximately 13.5k tokens, and a symbol index with 69 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.