Repository: jjery2243542/voice_conversion
Branch: master
Commit: 768efc110df0
Files: 12
Total size: 57.5 KB

Directory structure:
gitextract_sujrz6do/
├── README.md
├── convert.py
├── main.py
├── model.py
├── preprocess/
│   ├── make_dataset_vctk.py
│   ├── make_single_samples.py
│   ├── preprocess.sh
│   └── tacotron/
│       └── norm_utils.py
├── solver.py
├── test.py
├── utils.py
└── vctk.json

================================================
FILE CONTENTS
================================================

================================================
FILE: README.md
================================================
# Multi-target Voice Conversion without Parallel Data by Adversarially Learning Disentangled Audio Representations
This is the official implementation of the paper [Multi-target Voice Conversion without Parallel Data by Adversarially Learning Disentangled Audio Representations](https://arxiv.org/abs/1804.02812).
You can find the demo webpage [here](https://jjery2243542.github.io/voice_conversion_demo/), and the pretrained model [here](http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is18/model.pkl).

# Dependencies
- python 3.6+
- pytorch 0.4.0
- h5py 2.8
- tensorboardX

We also use some preprocessing scripts from [Kyubyong/tacotron](https://github.com/Kyubyong/tacotron).

# Preprocess
Our model is trained on the [CSTR VCTK Corpus](https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html).

### Feature extraction
We use code from [Kyubyong/tacotron](https://github.com/Kyubyong/tacotron) to extract features. The default parameters can be found at ```preprocess/tacotron/norm_utils.py```.

The preprocessing configuration is at ```preprocess/vctk.config```, where:
- **data_root_dir**: the path of the VCTK Corpus (VCTK-Corpus).
- **h5py_path**: the path to store extracted features.
- **index_path**: the path to store sampled segments.
- **train_proportion**: the proportion of utterances used for training. Default: 0.9.
- **n_samples**: the number of sampled segments. Default: 500000.
- **seg_len**: the length of sampled segments. Default: 128.
- **speaker_used_path**: the path of the used-speaker list. The speaker set used in the paper is [here](http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is18/en_speaker_used.txt).

Once you have edited the config file, run ```preprocess.sh``` to preprocess the dataset.

# Training
You can start training by running ```main.py```. The arguments are listed below.
- **--load_model**: whether to load the model from a checkpoint.
- **-flag**: flag of this training run for tensorboard. Default: train.
- **-hps_path**: the path of the hyper-parameter set. You can find the default setting at ```vctk.json```.
- **--load_model_path**: if **--load_model** is on, the model parameters will be loaded from this path.
- **-dataset_path**: the path of the processed features (.h5).
- **-index_path**: the path of the sampled segment indexes (.json).
- **-output_model_path**: the path to store the trained model.

# Testing
You can run inference with ```python3 test.py```. The arguments are listed below.
- **-hps**: the path of the hyper-parameter set. Default: vctk.json.
- **-m**: the path of the model checkpoint to load.
- **-s**: the path of the source .wav file.
- **-t**: the target speaker id (integer), in the same order as the speaker list (```en_speaker_used.txt```).
- **-o**: the output .wav path.
- **-sr**: the sample rate of the output .wav file. Default: 16000.
- **--use_gen**: if the flag is on, inference uses the generator. Default: True.
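For reference, a minimal end-to-end invocation might look like the following (all paths are placeholders; adjust them to your setup):
```
# preprocess (after editing preprocess/vctk.config)
cd preprocess && bash preprocess.sh && cd ..

# train from scratch
python3 main.py -hps_path vctk.json -dataset_path /path/to/vctk.h5 \
    -index_path /path/to/index.json -output_model_path /path/to/model.pkl

# convert source.wav into the voice of speaker 2
python3 test.py -hps vctk.json -m /path/to/model.pkl-79999 \
    -s source.wav -t 2 -o converted.wav
```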
# Reference
Please cite our paper if you find this repository useful.
```
@article{chou2018multi,
  title={Multi-target voice conversion without parallel data by adversarially learning disentangled audio representations},
  author={Chou, Ju-chieh and Yeh, Cheng-chieh and Lee, Hung-yi and Lee, Lin-shan},
  journal={arXiv preprint arXiv:1804.02812},
  year={2018}
}
```

# Contact
If you have any questions about the paper or the code, feel free to email me at [jjery2243542@gmail.com](mailto:jjery2243542@gmail.com).

================================================
FILE: convert.py
================================================
import torch
from torch import optim
from torch.autograd import Variable
import numpy as np
import pickle
from utils import Hps
from utils import DataLoader
from utils import Logger
from solver import Solver
from preprocess.tacotron.norm_utils import spectrogram2wav
#from preprocess.tacotron.audio import inv_spectrogram, save_wav
from scipy.io.wavfile import write
#from preprocess.tacotron.mcep import mc2wav
import h5py
import os
import soundfile as sf
#import pysptk
#import pyworld as pw

def sp2wav(sp):
    #exp_sp = np.exp(sp)
    exp_sp = sp
    wav_data = spectrogram2wav(exp_sp)
    return wav_data

def get_world_param(f_h5, src_speaker, utt_id, tar_speaker, tar_speaker_id, solver, dset='test', gen=True):
    import pysptk  # local import: only the WORLD/MCEP path needs it
    mc = f_h5[f'{dset}/{src_speaker}/{utt_id}/norm_mc'][()]
    converted_mc = convert_mc(mc, tar_speaker_id, solver, gen=gen)
    #converted_mc = mc
    mc_mean = f_h5[f'train/{tar_speaker}'].attrs['mc_mean']
    mc_std = f_h5[f'train/{tar_speaker}'].attrs['mc_std']
    converted_mc = converted_mc * mc_std + mc_mean
    log_f0 = f_h5[f'{dset}/{src_speaker}/{utt_id}/log_f0'][()]
    src_mean = f_h5[f'train/{src_speaker}'].attrs['f0_mean']
    src_std = f_h5[f'train/{src_speaker}'].attrs['f0_std']
    tar_mean = f_h5[f'train/{tar_speaker}'].attrs['f0_mean']
    tar_std = f_h5[f'train/{tar_speaker}'].attrs['f0_std']
    # linearly transform log-F0 statistics from the source to the target speaker
    index = np.where(log_f0 > 1e-10)[0]
    log_f0[index] = (log_f0[index] - src_mean) * tar_std / src_std + tar_mean
    log_f0[index] = np.exp(log_f0[index])
    f0 = log_f0
    ap = f_h5[f'{dset}/{src_speaker}/{utt_id}/ap'][()]
    converted_mc = converted_mc[:ap.shape[0]]
    sp = pysptk.conversion.mc2sp(converted_mc, alpha=0.41, fftlen=1024)
    return f0, sp, ap

def synthesis(f0, sp, ap, sr=16000):
    import pyworld as pw  # local import: only the WORLD/MCEP path needs it
    y = pw.synthesize(
        f0.astype(np.float64),
        sp.astype(np.float64),
        ap.astype(np.float64),
        sr,
        pw.default_frame_period)
    return y

def convert_sp(sp, c, solver, gen=True):
    c_var = Variable(torch.from_numpy(np.array([c]))).cuda()
    sp_tensor = torch.from_numpy(np.expand_dims(sp, axis=0))
    sp_tensor = sp_tensor.type(torch.FloatTensor)
    converted_sp = solver.test_step(sp_tensor, c_var, gen=gen)
    converted_sp = converted_sp.squeeze(axis=0).transpose((1, 0))
    return converted_sp

def convert_mc(mc, c, solver, gen=True):
    c_var = Variable(torch.from_numpy(np.array([c]))).cuda()
    mc_tensor = torch.from_numpy(np.expand_dims(mc, axis=0))
    mc_tensor = mc_tensor.type(torch.FloatTensor)
    converted_mc = solver.test_step(mc_tensor, c_var, gen=gen)
    converted_mc = converted_mc.squeeze(axis=0).transpose((1, 0))
    return converted_mc

def get_model(hps_path='./hps/vctk.json', model_path='/storage/model/voice_conversion/vctk/clf/model.pkl-109999'):
    hps = Hps()
    hps.load(hps_path)
    hps_tuple = hps.get_tuple()
    solver = Solver(hps_tuple, None)
    solver.load_model(model_path)
    return solver

def convert_all_sp(h5_path, src_speaker, tar_speaker, gen=True, dset='test',
        speaker_used_path='/storage/feature/voice_conversion/vctk/dataset_used/en_speaker_used.txt',
        root_dir='/storage/result/voice_conversion/vctk/p226_to_p225/',
        model_path='/storage/model/voice_conversion/vctk/clf/wo_tanh/model_0.001.pkl-79999'):
    # read speaker id file
    with open(speaker_used_path) as f:
        speakers = [line.strip() for line in f]
        speaker2id = {speaker: i for i, speaker in enumerate(speakers)}
    solver = get_model(hps_path='hps/vctk.json', model_path=model_path)
    with h5py.File(h5_path, 'r') as f_h5:
        for utt_id in f_h5[f'{dset}/{src_speaker}']:
            sp = f_h5[f'{dset}/{src_speaker}/{utt_id}/lin'][()]
            converted_sp = convert_sp(sp, speaker2id[tar_speaker], solver, gen=gen)
            wav_data = sp2wav(converted_sp)
            wav_path = os.path.join(root_dir, f'{src_speaker}_{tar_speaker}_{utt_id}.wav')
            sf.write(wav_path, wav_data, 16000, 'PCM_24')

def convert_all_mc(h5_path, src_speaker, tar_speaker, gen=False, dset='test',
        speaker_used_path='/storage/feature/voice_conversion/vctk/mcep/en_speaker_used.txt',
        root_dir='/storage/result/voice_conversion/vctk/p226_to_p225',
        model_path='/storage/model/voice_conversion/vctk/clf/wo_tanh/model_0.001.pkl-79999'):
    # read speaker id file
    with open(speaker_used_path) as f:
        speakers = [line.strip() for line in f]
        speaker2id = {speaker: i for i, speaker in enumerate(speakers)}
    solver = get_model(hps_path='hps/vctk.json', model_path=model_path)
    with h5py.File(h5_path, 'r') as f_h5:
        for utt_id in f_h5[f'{dset}/{src_speaker}']:
            f0, sp, ap = get_world_param(f_h5, src_speaker, utt_id, tar_speaker,
                    tar_speaker_id=speaker2id[tar_speaker], solver=solver, dset='test', gen=gen)
            wav_data = synthesis(f0, sp, ap)
            wav_path = os.path.join(root_dir, f'{src_speaker}_{tar_speaker}_{utt_id}.wav')
            sf.write(wav_path, wav_data, 16000, 'PCM_24')

if __name__ == '__main__':
    h5_path = '/storage/feature/voice_conversion/vctk/dataset_used/norm_vctk.h5'
    root_dir = '/storage/result/voice_conversion/vctk/norm/clf_gen'
    #h5_path = '/storage/feature/voice_conversion/LibriSpeech/libri.h5'
    #h5_path = '/storage/feature/voice_conversion/vctk/mcep/trim_mc_vctk_backup.h5'
    #convert_all_mc(h5_path, '226', '225', root_dir='./test_mc/', gen=False,
    #        model_path='/storage/model/voice_conversion/vctk/mcep/clf/model.pkl-129999')
    #convert_all_mc(h5_path, '225', '226', root_dir='./test_mc/', gen=False,
    #        model_path='/storage/model/voice_conversion/vctk/mcep/clf/model.pkl-129999')
    #convert_all_mc(h5_path, '225', '228', root_dir='./test_mc/', gen=False,
    #        model_path='/storage/model/voice_conversion/vctk/mcep/clf/model.pkl-129999')
    #model_path = '/storage/model/voice_conversion/vctk/mcep/clf/model.pkl-129999'
    model_path = '/storage/model/voice_conversion/vctk/clf/norm/wo_tanh/model_0.001_no_ins.pkl-124000'
    #model_path = '/storage/model/voice_conversion/librispeech/ls_1e-3.pkl-99999'
    speakers = ['225', '226', '227']
    for speaker_A in speakers:
        for speaker_B in speakers:
            if speaker_A == speaker_B:
                continue
            else:
                dir_path = os.path.join(root_dir, f'p{speaker_A}_p{speaker_B}')
                if not os.path.exists(dir_path):
                    os.makedirs(dir_path)
                convert_all_sp(h5_path, speaker_A, speaker_B, root_dir=dir_path,
                        gen=True, model_path=model_path)
    # diff accent
    #dir_path = os.path.join(root_dir, '163_6925')
    #if not os.path.exists(dir_path):
    #    os.makedirs(dir_path)
    #convert_all_sp(h5_path,'163','6925',root_dir=dir_path,
    #        gen=False, model_path=model_path)
    #dir_path = os.path.join(root_dir, '460_1363')
    #if not os.path.exists(dir_path):
    #    os.makedirs(dir_path)
    #convert_all_sp(h5_path,'460','1363',root_dir=dir_path,
    #        gen=False, model_path=model_path)
    #convert_all_sp(h5_path,'363','256',root_dir=os.path.join(root_dir, 'p363_p256'),
    #        gen=True, model_path=model_path)
    #convert_all_sp(h5_path,'340','251',root_dir=os.path.join(root_dir, 'p340_p251'),
    #        gen=True, model_path=model_path)
    #convert_all_sp(h5_path,'285','251',root_dir=os.path.join(root_dir, 'p285_p251'),
    #        gen=True, model_path=model_path)

================================================
FILE: main.py
================================================
import torch
from torch import optim
from torch.autograd import Variable
import numpy as np
import pickle
from utils import Hps
from utils import DataLoader
from utils import Logger
from utils import SingleDataset
from solver import Solver
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--load_model', default=False, action='store_true')
    parser.add_argument('-flag', default='train')
    parser.add_argument('-hps_path')
    parser.add_argument('-load_model_path')
    parser.add_argument('-dataset_path')
    parser.add_argument('-index_path')
    parser.add_argument('-output_model_path')
    args = parser.parse_args()
    hps = Hps()
    hps.load(args.hps_path)
    hps_tuple = hps.get_tuple()
    dataset = SingleDataset(args.dataset_path, args.index_path, seg_len=hps_tuple.seg_len)
    data_loader = DataLoader(dataset)
    solver = Solver(hps_tuple, data_loader)
    if args.load_model:
        solver.load_model(args.load_model_path)
    # training schedule: autoencoder pretraining, classifier pretraining,
    # adversarial training, then patchGAN fine-tuning
    solver.train(args.output_model_path, args.flag, mode='pretrain_G')
    solver.train(args.output_model_path, args.flag, mode='pretrain_D')
    solver.train(args.output_model_path, args.flag, mode='train')
    solver.train(args.output_model_path, args.flag, mode='patchGAN')
================================================
FILE: model.py
================================================
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable

def pad_layer(inp, layer, is_2d=False):
    if type(layer.kernel_size) == tuple:
        kernel_size = layer.kernel_size[0]
    else:
        kernel_size = layer.kernel_size
    if not is_2d:
        if kernel_size % 2 == 0:
            pad = (kernel_size//2, kernel_size//2 - 1)
        else:
            pad = (kernel_size//2, kernel_size//2)
    else:
        if kernel_size % 2 == 0:
            pad = (kernel_size//2, kernel_size//2 - 1, kernel_size//2, kernel_size//2 - 1)
        else:
            pad = (kernel_size//2, kernel_size//2, kernel_size//2, kernel_size//2)
    # padding
    inp = F.pad(inp, pad=pad, mode='reflect')
    out = layer(inp)
    return out

def pixel_shuffle_1d(inp, upscale_factor=2):
    batch_size, channels, in_width = inp.size()
    channels //= upscale_factor
    out_width = in_width * upscale_factor
    inp_view = inp.contiguous().view(batch_size, channels, upscale_factor, in_width)
    shuffle_out = inp_view.permute(0, 1, 3, 2).contiguous()
    shuffle_out = shuffle_out.view(batch_size, channels, out_width)
    return shuffle_out

def upsample(x, scale_factor=2):
    x_up = F.upsample(x, scale_factor=scale_factor, mode='nearest')
    return x_up

def RNN(inp, layer):
    inp_permuted = inp.permute(2, 0, 1)
    state_mul = (int(layer.bidirectional) + 1) * layer.num_layers
    zero_state = Variable(torch.zeros(state_mul, inp.size(0), layer.hidden_size))
    zero_state = zero_state.cuda() if torch.cuda.is_available() else zero_state
    out_permuted, _ = layer(inp_permuted, zero_state)
    out_rnn = out_permuted.permute(1, 2, 0)
    return out_rnn

def linear(inp, layer):
    batch_size = inp.size(0)
    hidden_dim = inp.size(1)
    seq_len = inp.size(2)
    inp_permuted = inp.permute(0, 2, 1)
    inp_expand = inp_permuted.contiguous().view(batch_size*seq_len, hidden_dim)
    out_expand = layer(inp_expand)
    out_permuted = out_expand.view(batch_size, seq_len, out_expand.size(1))
    out = out_permuted.permute(0, 2, 1)
    return out

def append_emb(emb, expand_size, output):
    emb = emb.unsqueeze(dim=2)
    emb_expand = emb.expand(emb.size(0), emb.size(1), expand_size)
    output = torch.cat([output, emb_expand], dim=1)
    return output

class PatchDiscriminator(nn.Module):
    def __init__(self, n_class=33, ns=0.2, dp=0.1):
        super(PatchDiscriminator, self).__init__()
        self.ns = ns
        self.conv1 = nn.Conv2d(1, 64, kernel_size=5, stride=2)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=5, stride=2)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=5, stride=2)
        self.conv4 = nn.Conv2d(256, 512, kernel_size=5, stride=2)
        self.conv5 = nn.Conv2d(512, 512, kernel_size=5, stride=2)
        self.conv6 = nn.Conv2d(512, 32, kernel_size=1)
        self.conv7 = nn.Conv2d(32, 1, kernel_size=(17, 4))
        #self.conv_classify = nn.Conv2d(512, n_class, kernel_size=(17, 4))
        self.conv_classify = nn.Conv2d(32, n_class, kernel_size=(17, 4))
        self.drop1 = nn.Dropout2d(p=dp)
        self.drop2 = nn.Dropout2d(p=dp)
        self.drop3 = nn.Dropout2d(p=dp)
        self.drop4 = nn.Dropout2d(p=dp)
        self.drop5 = nn.Dropout2d(p=dp)
        self.drop6 = nn.Dropout2d(p=dp)
        self.ins_norm1 = nn.InstanceNorm2d(self.conv1.out_channels)
        self.ins_norm2 = nn.InstanceNorm2d(self.conv2.out_channels)
        self.ins_norm3 = nn.InstanceNorm2d(self.conv3.out_channels)
        self.ins_norm4 = nn.InstanceNorm2d(self.conv4.out_channels)
        self.ins_norm5 = nn.InstanceNorm2d(self.conv5.out_channels)
        self.ins_norm6 = nn.InstanceNorm2d(self.conv6.out_channels)

    def conv_block(self, x, conv_layer, after_layers):
        out = pad_layer(x, conv_layer, is_2d=True)
        out = F.leaky_relu(out, negative_slope=self.ns)
        for layer in after_layers:
            out = layer(out)
        return out

    def forward(self, x, classify=False):
        x = torch.unsqueeze(x, dim=1)
        out = self.conv_block(x, self.conv1, [self.ins_norm1, self.drop1])
        out = self.conv_block(out, self.conv2, [self.ins_norm2, self.drop2])
        out = self.conv_block(out, self.conv3, [self.ins_norm3, self.drop3])
        out = self.conv_block(out, self.conv4, [self.ins_norm4, self.drop4])
        out = self.conv_block(out, self.conv5, [self.ins_norm5, self.drop5])
        out = self.conv_block(out, self.conv6, [self.ins_norm6, self.drop6])
        # GAN output value
        val = self.conv7(out)
        val = val.view(val.size(0), -1)
        mean_val = torch.mean(val, dim=1)
        if classify:
            # auxiliary speaker classification
            logits = self.conv_classify(out)
            logits = logits.view(logits.size(0), -1)
            return mean_val, logits
        else:
            return mean_val

class SpeakerClassifier(nn.Module):
    def __init__(self, c_in=512, c_h=512, n_class=8, dp=0.1, ns=0.01):
        super(SpeakerClassifier, self).__init__()
        self.dp, self.ns = dp, ns
        self.conv1 = nn.Conv1d(c_in, c_h, kernel_size=5)
        self.conv2 = nn.Conv1d(c_h, c_h, kernel_size=5)
        self.conv3 = nn.Conv1d(c_h, c_h, kernel_size=5)
        self.conv4 = nn.Conv1d(c_h, c_h, kernel_size=5)
        self.conv5 = nn.Conv1d(c_h, c_h, kernel_size=5)
        self.conv6 = nn.Conv1d(c_h, c_h, kernel_size=5)
        self.conv7 = nn.Conv1d(c_h, c_h//2, kernel_size=3)
        self.conv8 = nn.Conv1d(c_h//2, c_h//4, kernel_size=3)
        self.conv9 = nn.Conv1d(c_h//4, n_class, kernel_size=16)
        self.drop1 = nn.Dropout(p=dp)
        self.drop2 = nn.Dropout(p=dp)
        self.drop3 = nn.Dropout(p=dp)
        self.drop4 = nn.Dropout(p=dp)
        self.ins_norm1 = nn.InstanceNorm1d(c_h)
        self.ins_norm2 = nn.InstanceNorm1d(c_h)
        self.ins_norm3 = nn.InstanceNorm1d(c_h)
        self.ins_norm4 = nn.InstanceNorm1d(c_h//4)

    def conv_block(self, x, conv_layers, after_layers, res=True):
        out = x
        for layer in conv_layers:
            out = pad_layer(out, layer)
            out = F.leaky_relu(out, negative_slope=self.ns)
        for layer in after_layers:
            out = layer(out)
        if res:
            out = out + x
        return out

    def forward(self, x):
        out = self.conv_block(x, [self.conv1, self.conv2], [self.ins_norm1, self.drop1], res=False)
        out = self.conv_block(out, [self.conv3, self.conv4], [self.ins_norm2, self.drop2], res=True)
        out = self.conv_block(out, [self.conv5, self.conv6], [self.ins_norm3, self.drop3], res=True)
        out = self.conv_block(out, [self.conv7, self.conv8], [self.ins_norm4, self.drop4], res=False)
        out = self.conv9(out)
        out = out.view(out.size()[0], -1)
        return out

class Decoder(nn.Module):
    def __init__(self, c_in=512, c_out=513, c_h=512, c_a=8, emb_size=128, ns=0.2):
        super(Decoder, self).__init__()
        self.ns = ns
        self.conv1 = nn.Conv1d(c_in, 2*c_h, kernel_size=3)
        self.conv2 = nn.Conv1d(c_h, c_h, kernel_size=3)
        self.conv3 = nn.Conv1d(c_h, 2*c_h, kernel_size=3)
        self.conv4 = nn.Conv1d(c_h, c_h, kernel_size=3)
        self.conv5 = nn.Conv1d(c_h, 2*c_h, kernel_size=3)
        self.conv6 = nn.Conv1d(c_h, c_h, kernel_size=3)
        self.dense1 = nn.Linear(c_h, c_h)
        self.dense2 = nn.Linear(c_h, c_h)
        self.dense3 = nn.Linear(c_h, c_h)
        self.dense4 = nn.Linear(c_h, c_h)
        self.RNN = nn.GRU(input_size=c_h, hidden_size=c_h//2, num_layers=1, bidirectional=True)
        self.dense5 = nn.Linear(2*c_h + c_h, c_h)
        self.linear = nn.Linear(c_h, c_out)
        # normalization layer
        self.ins_norm1 = nn.InstanceNorm1d(c_h)
        self.ins_norm2 = nn.InstanceNorm1d(c_h)
        self.ins_norm3 = nn.InstanceNorm1d(c_h)
        self.ins_norm4 = nn.InstanceNorm1d(c_h)
        self.ins_norm5 = nn.InstanceNorm1d(c_h)
        # embedding layer
        self.emb1 = nn.Embedding(c_a, c_h)
        self.emb2 = nn.Embedding(c_a, c_h)
        self.emb3 = nn.Embedding(c_a, c_h)
        self.emb4 = nn.Embedding(c_a, c_h)
        self.emb5 = nn.Embedding(c_a, c_h)

    def conv_block(self, x, conv_layers, norm_layer, emb, res=True):
        # first layer
        x_add = x + emb.view(emb.size(0), emb.size(1), 1)
        out = pad_layer(x_add, conv_layers[0])
        out = F.leaky_relu(out, negative_slope=self.ns)
        # upsample by pixelshuffle
        out = pixel_shuffle_1d(out, upscale_factor=2)
        out = out + emb.view(emb.size(0), emb.size(1), 1)
        out = pad_layer(out, conv_layers[1])
        out = F.leaky_relu(out, negative_slope=self.ns)
        out = norm_layer(out)
        if res:
            x_up = upsample(x, scale_factor=2)
            out = out + x_up
        return out

    def dense_block(self, x, emb, layers, norm_layer, res=True):
        out = x
        for layer in layers:
            out = out + emb.view(emb.size(0), emb.size(1), 1)
            out = linear(out, layer)
            out = F.leaky_relu(out, negative_slope=self.ns)
        out = norm_layer(out)
        if res:
            out = out + x
        return out

    def forward(self, x, c):
        # conv layer
        out = self.conv_block(x, [self.conv1, self.conv2], self.ins_norm1, self.emb1(c), res=True)
        out = self.conv_block(out, [self.conv3, self.conv4], self.ins_norm2, self.emb2(c), res=True)
        out = self.conv_block(out, [self.conv5, self.conv6], self.ins_norm3, self.emb3(c), res=True)
        # dense layer
        out = self.dense_block(out, self.emb4(c), [self.dense1, self.dense2], self.ins_norm4, res=True)
        out = self.dense_block(out, self.emb4(c), [self.dense3, self.dense4], self.ins_norm5, res=True)
        emb = self.emb5(c)
        out_add = out + emb.view(emb.size(0), emb.size(1), 1)
        # rnn layer
        out_rnn = RNN(out_add, self.RNN)
        out = torch.cat([out, out_rnn], dim=1)
        out = append_emb(self.emb5(c), out.size(2), out)
        out = linear(out, self.dense5)
        out = F.leaky_relu(out, negative_slope=self.ns)
        out = linear(out, self.linear)
        #out = torch.tanh(out)
        return out

class Encoder(nn.Module):
    def __init__(self, c_in=513, c_h1=128, c_h2=512, c_h3=128, ns=0.2, dp=0.5):
        super(Encoder, self).__init__()
        self.ns = ns
        self.conv1s = nn.ModuleList(
            [nn.Conv1d(c_in, c_h1, kernel_size=k) for k in range(1, 8)]
        )
        self.conv2 = nn.Conv1d(len(self.conv1s)*c_h1 + c_in, c_h2, kernel_size=1)
        self.conv3 = nn.Conv1d(c_h2, c_h2, kernel_size=5)
        self.conv4 = nn.Conv1d(c_h2, c_h2, kernel_size=5, stride=2)
        self.conv5 = nn.Conv1d(c_h2, c_h2, kernel_size=5)
        self.conv6 = nn.Conv1d(c_h2, c_h2, kernel_size=5, stride=2)
        self.conv7 = nn.Conv1d(c_h2, c_h2, kernel_size=5)
        self.conv8 = nn.Conv1d(c_h2, c_h2, kernel_size=5, stride=2)
        self.dense1 = nn.Linear(c_h2, c_h2)
        self.dense2 = nn.Linear(c_h2, c_h2)
        self.dense3 = nn.Linear(c_h2, c_h2)
        self.dense4 = nn.Linear(c_h2, c_h2)
        self.RNN = nn.GRU(input_size=c_h2, hidden_size=c_h3, num_layers=1, bidirectional=True)
        self.linear = nn.Linear(c_h2 + 2*c_h3, c_h2)
        # normalization layer
        self.ins_norm1 = nn.InstanceNorm1d(c_h2)
        self.ins_norm2 = nn.InstanceNorm1d(c_h2)
        self.ins_norm3 = nn.InstanceNorm1d(c_h2)
        self.ins_norm4 = nn.InstanceNorm1d(c_h2)
        self.ins_norm5 = nn.InstanceNorm1d(c_h2)
        self.ins_norm6 = nn.InstanceNorm1d(c_h2)
        # dropout layer
        self.drop1 = nn.Dropout(p=dp)
        self.drop2 = nn.Dropout(p=dp)
        self.drop3 = nn.Dropout(p=dp)
        self.drop4 = nn.Dropout(p=dp)
        self.drop5 = nn.Dropout(p=dp)
        self.drop6 = nn.Dropout(p=dp)

    def conv_block(self, x, conv_layers, norm_layers, res=True):
        out = x
        for layer in conv_layers:
            out = pad_layer(out, layer)
            out = F.leaky_relu(out, negative_slope=self.ns)
        for layer in norm_layers:
            out = layer(out)
        if res:
            x_pad = F.pad(x, pad=(0, x.size(2) % 2), mode='reflect')
            x_down = F.avg_pool1d(x_pad, kernel_size=2)
            out = x_down + out
        return out

    def dense_block(self, x, layers, norm_layers, res=True):
        out = x
        for layer in layers:
            out = linear(out, layer)
            out = F.leaky_relu(out, negative_slope=self.ns)
        for layer in norm_layers:
            out = layer(out)
        if res:
            out = out + x
        return out

    def forward(self, x):
        outs = []
        for l in self.conv1s:
            out = pad_layer(x, l)
            outs.append(out)
        out = torch.cat(outs + [x], dim=1)
        out = F.leaky_relu(out, negative_slope=self.ns)
        out = self.conv_block(out, [self.conv2], [self.ins_norm1, self.drop1], res=False)
        out = self.conv_block(out, [self.conv3, self.conv4], [self.ins_norm2, self.drop2])
        out = self.conv_block(out, [self.conv5, self.conv6], [self.ins_norm3, self.drop3])
        out = self.conv_block(out, [self.conv7, self.conv8], [self.ins_norm4, self.drop4])
        # dense layer
        out = self.dense_block(out, [self.dense1, self.dense2], [self.ins_norm5, self.drop5], res=True)
        out = self.dense_block(out, [self.dense3, self.dense4], [self.ins_norm6, self.drop6], res=True)
        out_rnn = RNN(out, self.RNN)
        out = torch.cat([out, out_rnn], dim=1)
        out = linear(out, self.linear)
        out = F.leaky_relu(out, negative_slope=self.ns)
        return out
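A quick shape check of the modules above (an editor's sketch under the module defaults, not part of the original repository; runs on CPU):

# --- editor's sketch: shape check for model.py ---
import torch
from model import Encoder, Decoder, SpeakerClassifier

x = torch.randn(2, 513, 128)       # (batch, frequency bins, time frames)
enc = Encoder()(x)                 # three stride-2 convs: 128 -> 16 frames
print(enc.shape)                   # torch.Size([2, 512, 16])
c = torch.LongTensor([0, 1])       # speaker ids; must be < c_a (default 8)
rec = Decoder()(enc, c)            # three pixel-shuffle upsamplings: 16 -> 128
print(rec.shape)                   # torch.Size([2, 513, 128])
logits = SpeakerClassifier()(enc)  # kernel_size=16 conv collapses the time axis
print(logits.shape)                # torch.Size([2, 8])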
================================================
FILE: preprocess/make_dataset_vctk.py
================================================
import h5py
import numpy as np
import sys
import os
import glob
import re
from collections import defaultdict
#from tacotron.audio import load_wav, spectrogram, melspectrogram
from tacotron.norm_utils import get_spectrograms

def read_speaker_info(path='/storage/datasets/VCTK/VCTK-Corpus/speaker-info.txt'):
    accent2speaker = defaultdict(lambda: [])
    with open(path) as f:
        splited_lines = [line.strip().split() for line in f][1:]
        speakers = [line[0] for line in splited_lines]
        regions = [line[3] for line in splited_lines]
        for speaker, region in zip(speakers, regions):
            accent2speaker[region].append(speaker)
    return accent2speaker

if __name__ == '__main__':
    if len(sys.argv) < 4:
        print('usage: python3 make_dataset_vctk.py [data root directory (VCTK-Corpus)] [h5py path] '
              '[training proportion]')
        exit(0)
    root_dir = sys.argv[1]
    h5py_path = sys.argv[2]
    proportion = float(sys.argv[3])
    accent2speaker = read_speaker_info(os.path.join(root_dir, 'speaker-info.txt'))
    filename_groups = defaultdict(lambda: [])
    with h5py.File(h5py_path, 'w') as f_h5:
        filenames = sorted(glob.glob(os.path.join(root_dir, 'wav48/*/*.wav')))
        for filename in filenames:
            # divide into groups; filename format: p{speaker}_{sid}.wav
            sub_filename = filename.strip().split('/')[-1]
            speaker_id, utt_id = re.match(r'p(\d+)_(\d+)\.wav', sub_filename).groups()
            filename_groups[speaker_id].append(filename)
        for speaker_id, filenames in filename_groups.items():
            # only use the speakers who have an English accent
            if speaker_id not in accent2speaker['English']:
                continue
            print('processing {}'.format(speaker_id))
            train_size = int(len(filenames) * proportion)
            for i, filename in enumerate(filenames):
                sub_filename = filename.strip().split('/')[-1]
                # format: p{speaker}_{sid}.wav
                speaker_id, utt_id = re.match(r'p(\d+)_(\d+)\.wav', sub_filename).groups()
                _, lin_spec = get_spectrograms(filename)
                if i < train_size:
                    datatype = 'train'
                else:
                    datatype = 'test'
                f_h5.create_dataset(f'{datatype}/{speaker_id}/{utt_id}',
                                    data=lin_spec, dtype=np.float32)

================================================
FILE: preprocess/make_single_samples.py
================================================
import sys
import h5py
import numpy as np
import json
from collections import namedtuple
import random

class Sampler(object):
    def __init__(self, h5_path, dset, seg_len, used_speaker_path):
        self.dset = dset
        self.f_h5 = h5py.File(h5_path, 'r')
        self.seg_len = seg_len
        self.utt2len = self.get_utt_len()
        self.speakers = self.read_speakers(used_speaker_path)
        self.n_speaker = len(self.speakers)
        print(self.speakers)
        self.speaker2utts = {speaker: list(self.f_h5[f'{dset}/{speaker}'].keys())
                             for speaker in self.speakers}
        # remove utterances that are too short to contain one segment
        self.rm_too_short_utt(limit=self.seg_len)
        self.single_indexer = namedtuple('single_index', ['speaker', 'i', 't'])

    def get_utt_len(self):
        utt2len = {}
        for dset in ['train', 'test']:
            for speaker in self.f_h5[f'{dset}']:
                for utt_id in self.f_h5[f'{dset}/{speaker}']:
                    length = self.f_h5[f'{dset}/{speaker}/{utt_id}'][()].shape[0]
                    utt2len[(speaker, utt_id)] = length
        return utt2len

    def rm_too_short_utt(self, limit):
        for (speaker, utt_id), length in self.utt2len.items():
            if speaker in self.speakers and length <= limit and utt_id in self.speaker2utts[speaker]:
                self.speaker2utts[speaker].remove(utt_id)

    def read_speakers(self, path):
        with open(path) as f:
            speakers = [line.strip() for line in f]
        return speakers

    def sample_utt(self, speaker_id, n_samples=1):
        # sample an utterance
        dset = self.dset
        utt_ids = random.sample(self.speaker2utts[speaker_id], n_samples)
        lengths = [self.f_h5[f'{dset}/{speaker_id}/{utt_id}'].shape[0] for utt_id in utt_ids]
        return [(utt_id, length) for utt_id, length in zip(utt_ids, lengths)]

    def rand(self, l):
        rand_idx = random.randint(0, len(l) - 1)
        return l[rand_idx]

    def sample_single(self):
        seg_len = self.seg_len
        speaker_idx, = random.sample(range(len(self.speakers)), 1)
        speaker = self.speakers[speaker_idx]
        (utt_id, utt_len), = self.sample_utt(speaker, 1)
        t = random.randint(0, utt_len - seg_len)
        index_tuple = self.single_indexer(speaker=speaker_idx, i=f'{speaker}/{utt_id}', t=t)
        return index_tuple

if __name__ == '__main__':
    if len(sys.argv) < 6:
        print('usage: python3 make_single_samples.py [h5py path] [training sampled index path (.json)] '
              '[n_samples] [segment length] [used speaker file path]')
        exit(0)
    h5py_path = sys.argv[1]
    output_path = sys.argv[2]
    n_samples = int(sys.argv[3])
    segment_len = int(sys.argv[4])
    speaker_path = sys.argv[5]
    sampler = Sampler(h5_path=h5py_path, seg_len=segment_len, dset='train',
                      used_speaker_path=speaker_path)
    samples = [sampler.sample_single()._asdict() for _ in range(n_samples)]
    with open(output_path, 'w') as f_json:
        json.dump(samples, f_json, indent=4, separators=(',', ': '))

================================================
FILE: preprocess/preprocess.sh
================================================
. vctk.config
python3 make_dataset_vctk.py $data_root_dir $h5py_path $train_proportion
python3 make_single_samples.py $h5py_path $index_path $n_samples $seg_len $speaker_used_path
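The vctk.config file sourced by preprocess.sh is not included in the repository. A minimal example with the variables the script expects might look like this (all paths are placeholders; the numeric defaults follow the README):

# example vctk.config (editor's sketch)
data_root_dir=/path/to/VCTK-Corpus
h5py_path=/path/to/vctk.h5
index_path=/path/to/index.json
train_proportion=0.9
n_samples=500000
seg_len=128
speaker_used_path=/path/to/en_speaker_used.txt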
================================================
FILE: preprocess/tacotron/norm_utils.py
================================================
# -*- coding: utf-8 -*-
# /usr/bin/python2
'''
By kyubyong park. kbpark.linguist@gmail.com.
https://www.github.com/kyubyong/dc_tts
'''
from __future__ import print_function, division
#from hyperparams import Hyperparams as hp
import numpy as np
#import tensorflow as tf
import librosa
import copy
import matplotlib
matplotlib.use('pdf')
import matplotlib.pyplot as plt
from scipy import signal
import os

class hyperparams(object):
    def __init__(self):
        self.max_duration = 10.0
        # signal processing
        self.sr = 16000  # sample rate
        self.n_fft = 1024  # fft points (samples)
        self.frame_shift = 0.0125  # seconds
        self.frame_length = 0.05  # seconds
        self.hop_length = int(self.sr*self.frame_shift)  # samples
        self.win_length = int(self.sr*self.frame_length)  # samples
        self.n_mels = 80  # number of Mel banks to generate
        self.power = 1.2  # exponent for amplifying the predicted magnitude
        self.n_iter = 300  # number of Griffin-Lim inversion iterations
        self.preemphasis = .97  # or None
        self.max_db = 100
        self.ref_db = 20
hp = hyperparams()

def get_spectrograms(fpath):
    '''Returns normalized log(melspectrogram) and log(magnitude) from `fpath`.

    Args:
      fpath: A string. The full path of a sound file.

    Returns:
      mel: A 2d array of shape (T, n_mels) <- Transposed
      mag: A 2d array of shape (T, 1+n_fft/2) <- Transposed
    '''
    # (random tempo-perturbation variant, kept for reference)
    # num = np.random.randn()
    # if num < .2:
    #     y, sr = librosa.load(fpath, sr=hp.sr)
    # else:
    #     if num < .4:
    #         tempo = 1.1
    #     elif num < .6:
    #         tempo = 1.2
    #     elif num < .8:
    #         tempo = 0.9
    #     else:
    #         tempo = 0.8
    #     cmd = "ffmpeg -i {} -y ar {} -hide_banner -loglevel panic -ac 1 -filter:a atempo={} -vn temp.wav".format(fpath, hp.sr, tempo)
    #     os.system(cmd)
    #     y, sr = librosa.load('temp.wav', sr=hp.sr)

    # Loading sound file
    y, sr = librosa.load(fpath, sr=hp.sr)

    # Trimming
    y, _ = librosa.effects.trim(y)

    # Preemphasis
    y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1])

    # stft
    linear = librosa.stft(y=y,
                          n_fft=hp.n_fft,
                          hop_length=hp.hop_length,
                          win_length=hp.win_length)

    # magnitude spectrogram
    mag = np.abs(linear)  # (1+n_fft//2, T)

    # mel spectrogram
    mel_basis = librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, t)

    # to decibel
    mel = 20 * np.log10(np.maximum(1e-5, mel))
    mag = 20 * np.log10(np.maximum(1e-5, mag))

    # normalize
    mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
    mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)

    # Transpose
    mel = mel.T.astype(np.float32)  # (T, n_mels)
    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)

    return mel, mag

def spectrogram2wav(mag):
    '''Generates a waveform from a normalized magnitude spectrogram.'''
    # transpose
    mag = mag.T

    # de-normalize
    mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db

    # to amplitude
    mag = np.power(10.0, mag * 0.05)

    # wav reconstruction
    wav = griffin_lim(mag)

    # de-preemphasis
    wav = signal.lfilter([1], [1, -hp.preemphasis], wav)

    # trim
    wav, _ = librosa.effects.trim(wav)

    return wav.astype(np.float32)

def griffin_lim(spectrogram):
    '''Applies the Griffin-Lim algorithm.'''
    X_best = copy.deepcopy(spectrogram)
    for i in range(hp.n_iter):
        X_t = invert_spectrogram(X_best)
        est = librosa.stft(X_t, hp.n_fft, hp.hop_length, win_length=hp.win_length)
        phase = est / np.maximum(1e-8, np.abs(est))
        X_best = spectrogram * phase
    X_t = invert_spectrogram(X_best)
    y = np.real(X_t)
    return y

def invert_spectrogram(spectrogram):
    '''
    spectrogram: [f, t]
    '''
    return librosa.istft(spectrogram, hp.hop_length, win_length=hp.win_length, window="hann")

def plot_alignment(alignment, gs):
    """Plots the alignment.
    alignments: A list of (numpy) matrix of shape (encoder_steps, decoder_steps)
    gs: (int) global step
    NOTE: leftover from the tacotron code; hp.logdir is not defined in this repo.
    """
    fig, ax = plt.subplots()
    im = ax.imshow(alignment)
    # cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
    fig.colorbar(im)
    plt.title('{} Steps'.format(gs))
    plt.savefig('{}/alignment_{}k.png'.format(hp.logdir, gs//1000), format='png')

def learning_rate_decay(init_lr, global_step, warmup_steps=4000.):
    '''Noam scheme from tensor2tensor.
    NOTE: leftover from the tacotron code; requires the commented-out tensorflow import.
    '''
    step = tf.cast(global_step + 1, dtype=tf.float32)
    return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5)

def load_spectrograms(fpath):
    # NOTE: leftover from the tacotron code; hp.r (reduction factor) is not defined in this repo.
    fname = os.path.basename(fpath)
    mel, mag = get_spectrograms(fpath)
    t = mel.shape[0]
    num_paddings = hp.r - (t % hp.r) if t % hp.r != 0 else 0  # for reduction
    mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode="constant")
    mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant")
    return fname, mel.reshape((-1, hp.n_mels*hp.r)), mag
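An analysis/synthesis round trip using the two functions this repository actually calls from the file above (editor's sketch; 'sample.wav' is a placeholder path, run from the repo root):

# --- editor's sketch: round trip with get_spectrograms / spectrogram2wav ---
from preprocess.tacotron.norm_utils import get_spectrograms, spectrogram2wav
import soundfile as sf

mel, mag = get_spectrograms('sample.wav')  # mel: (T, 80), mag: (T, 513)
wav = spectrogram2wav(mag)                 # Griffin-Lim reconstruction, hp.n_iter iterations
sf.write('reconstructed.wav', wav, 16000)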
================================================
FILE: solver.py
================================================
import torch
from torch import optim
from torch.autograd import Variable
from torch import nn
import torch.nn.functional as F
import numpy as np
import pickle
from model import Encoder
from model import Decoder
from model import SpeakerClassifier
from model import PatchDiscriminator
import os
from utils import Hps
from utils import Logger
from utils import DataLoader
from utils import to_var
from utils import reset_grad
from utils import grad_clip
from utils import cal_acc
from utils import cc
from utils import calculate_gradients_penalty
from utils import gen_noise

class Solver(object):
    def __init__(self, hps, data_loader, log_dir='./log/'):
        self.hps = hps
        self.data_loader = data_loader
        self.model_kept = []
        self.max_keep = 100
        self.build_model()
        self.logger = Logger(log_dir)

    def build_model(self):
        hps = self.hps
        ns = self.hps.ns
        emb_size = self.hps.emb_size
        self.Encoder = cc(Encoder(ns=ns, dp=hps.enc_dp))
        self.Decoder = cc(Decoder(ns=ns, c_a=hps.n_speakers, emb_size=emb_size))
        self.Generator = cc(Decoder(ns=ns, c_a=hps.n_speakers, emb_size=emb_size))
        self.SpeakerClassifier = cc(SpeakerClassifier(ns=ns, n_class=hps.n_speakers, dp=hps.dis_dp))
        self.PatchDiscriminator = cc(nn.DataParallel(PatchDiscriminator(ns=ns, n_class=hps.n_speakers)))
        betas = (0.5, 0.9)
        params = list(self.Encoder.parameters()) + list(self.Decoder.parameters())
        self.ae_opt = optim.Adam(params, lr=self.hps.lr, betas=betas)
        self.clf_opt = optim.Adam(self.SpeakerClassifier.parameters(), lr=self.hps.lr, betas=betas)
        self.gen_opt = optim.Adam(self.Generator.parameters(), lr=self.hps.lr, betas=betas)
        self.patch_opt = optim.Adam(self.PatchDiscriminator.parameters(), lr=self.hps.lr, betas=betas)

    def save_model(self, model_path, iteration, enc_only=True):
        if not enc_only:
            all_model = {
                'encoder': self.Encoder.state_dict(),
                'decoder': self.Decoder.state_dict(),
                'generator': self.Generator.state_dict(),
                'classifier': self.SpeakerClassifier.state_dict(),
                'patch_discriminator': self.PatchDiscriminator.state_dict(),
            }
        else:
            all_model = {
                'encoder': self.Encoder.state_dict(),
                'decoder': self.Decoder.state_dict(),
                'generator': self.Generator.state_dict(),
            }
        new_model_path = '{}-{}'.format(model_path, iteration)
        with open(new_model_path, 'wb') as f_out:
            torch.save(all_model, f_out)
        self.model_kept.append(new_model_path)
        if len(self.model_kept) >= self.max_keep:
            os.remove(self.model_kept[0])
            self.model_kept.pop(0)

    def load_model(self, model_path, enc_only=True):
        print('load model from {}'.format(model_path))
        with open(model_path, 'rb') as f_in:
            all_model = torch.load(f_in)
        self.Encoder.load_state_dict(all_model['encoder'])
        self.Decoder.load_state_dict(all_model['decoder'])
        self.Generator.load_state_dict(all_model['generator'])
        if not enc_only:
            self.SpeakerClassifier.load_state_dict(all_model['classifier'])
            self.PatchDiscriminator.load_state_dict(all_model['patch_discriminator'])

    def set_eval(self):
        self.Encoder.eval()
        self.Decoder.eval()
        self.Generator.eval()
        self.SpeakerClassifier.eval()
        self.PatchDiscriminator.eval()

    def test_step(self, x, c, gen=False):
        self.set_eval()
        x = to_var(x).permute(0, 2, 1)
        enc = self.Encoder(x)
        x_tilde = self.Decoder(enc, c)
        if gen:
            x_tilde += self.Generator(enc, c)
        return x_tilde.data.cpu().numpy()

    def permute_data(self, data):
        C = to_var(data[0], requires_grad=False)
        X = to_var(data[1]).permute(0, 2, 1)
        return C, X

    def sample_c(self, size):
        n_speakers = self.hps.n_speakers
        c_sample = Variable(
            torch.multinomial(torch.ones(n_speakers), num_samples=size, replacement=True),
            requires_grad=False)
        c_sample = c_sample.cuda() if torch.cuda.is_available() else c_sample
        return c_sample

    def encode_step(self, x):
        enc = self.Encoder(x)
        return enc

    def decode_step(self, enc, c):
        x_tilde = self.Decoder(enc, c)
        return x_tilde

    def patch_step(self, x, x_tilde, is_dis=True):
        D_real, real_logits = self.PatchDiscriminator(x, classify=True)
        D_fake, fake_logits = self.PatchDiscriminator(x_tilde, classify=True)
        if is_dis:
            w_dis = torch.mean(D_real - D_fake)
            gp = calculate_gradients_penalty(self.PatchDiscriminator, x, x_tilde)
            return w_dis, real_logits, gp
        else:
            return -torch.mean(D_fake), fake_logits

    def gen_step(self, enc, c):
        x_gen = self.Decoder(enc, c) + self.Generator(enc, c)
        return x_gen

    def clf_step(self, enc):
        logits = self.SpeakerClassifier(enc)
        return logits

    def cal_loss(self, logits, y_true):
        # calculate loss
        criterion = nn.CrossEntropyLoss()
        loss = criterion(logits, y_true)
        return loss

    def train(self, model_path, flag='train', mode='train'):
        # load hyperparams
        hps = self.hps
        if mode == 'pretrain_G':
            for iteration in range(hps.enc_pretrain_iters):
                data = next(self.data_loader)
                c, x = self.permute_data(data)
                # encode and reconstruct
                enc = self.encode_step(x)
                x_tilde = self.decode_step(enc, c)
                loss_rec = torch.mean(torch.abs(x_tilde - x))
                reset_grad([self.Encoder, self.Decoder])
                loss_rec.backward()
                grad_clip([self.Encoder, self.Decoder], self.hps.max_grad_norm)
                self.ae_opt.step()
                # tb info
                info = {
                    f'{flag}/pre_loss_rec': loss_rec.item(),
                }
                slot_value = (iteration + 1, hps.enc_pretrain_iters) + tuple([value for value in info.values()])
                log = 'pre_G:[%06d/%06d], loss_rec=%.3f'
                print(log % slot_value)
                if iteration % 100 == 0:
                    for tag, value in info.items():
                        self.logger.scalar_summary(tag, value, iteration + 1)
        elif mode == 'pretrain_D':
            for iteration in range(hps.dis_pretrain_iters):
                data = next(self.data_loader)
                c, x = self.permute_data(data)
                # encode
                enc = self.encode_step(x)
                # classify speaker
                logits = self.clf_step(enc)
                loss_clf = self.cal_loss(logits, c)
                # update
                reset_grad([self.SpeakerClassifier])
                loss_clf.backward()
                grad_clip([self.SpeakerClassifier], self.hps.max_grad_norm)
                self.clf_opt.step()
                # calculate acc
                acc = cal_acc(logits, c)
                info = {
                    f'{flag}/pre_loss_clf': loss_clf.item(),
                    f'{flag}/pre_acc': acc,
                }
                slot_value = (iteration + 1, hps.dis_pretrain_iters) + tuple([value for value in info.values()])
                log = 'pre_D:[%06d/%06d], loss_clf=%.2f, acc=%.2f'
                print(log % slot_value)
                if iteration % 100 == 0:
                    for tag, value in info.items():
                        self.logger.scalar_summary(tag, value, iteration + 1)
        elif mode == 'patchGAN':
            for iteration in range(hps.patch_iters):
                #=======train D=========#
                for step in range(hps.n_patch_steps):
                    data = next(self.data_loader)
                    c, x = self.permute_data(data)
                    # encode
                    enc = self.encode_step(x)
                    # sample c
                    c_prime = self.sample_c(x.size(0))
                    # generator
                    x_tilde = self.gen_step(enc, c_prime)
                    # discriminator
                    w_dis, real_logits, gp = self.patch_step(x, x_tilde, is_dis=True)
                    # aux classification loss
                    loss_clf = self.cal_loss(real_logits, c)
                    loss = -hps.beta_dis * w_dis + hps.beta_clf * loss_clf + hps.lambda_ * gp
                    reset_grad([self.PatchDiscriminator])
                    loss.backward()
                    grad_clip([self.PatchDiscriminator], self.hps.max_grad_norm)
                    self.patch_opt.step()
                    # calculate acc
                    acc = cal_acc(real_logits, c)
                    info = {
                        f'{flag}/w_dis': w_dis.item(),
                        f'{flag}/gp': gp.item(),
                        f'{flag}/real_loss_clf': loss_clf.item(),
                        f'{flag}/real_acc': acc,
                    }
                    slot_value = (step, iteration + 1, hps.patch_iters) + tuple([value for value in info.values()])
                    log = 'patch_D-%d:[%06d/%06d], w_dis=%.2f, gp=%.2f, loss_clf=%.2f, acc=%.2f'
                    print(log % slot_value)
                    if iteration % 100 == 0:
                        for tag, value in info.items():
                            self.logger.scalar_summary(tag, value, iteration + 1)
                #=======train G=========#
                data = next(self.data_loader)
                c, x = self.permute_data(data)
                # encode
                enc = self.encode_step(x)
                # sample c
                c_prime = self.sample_c(x.size(0))
                # generator
                x_tilde = self.gen_step(enc, c_prime)
                # discriminator
                loss_adv, fake_logits = self.patch_step(x, x_tilde, is_dis=False)
                # aux classification loss
                loss_clf = self.cal_loss(fake_logits, c_prime)
                loss = hps.beta_clf * loss_clf + hps.beta_gen * loss_adv
                reset_grad([self.Generator])
                loss.backward()
                grad_clip([self.Generator], self.hps.max_grad_norm)
                self.gen_opt.step()
                # calculate acc
                acc = cal_acc(fake_logits, c_prime)
                info = {
                    f'{flag}/loss_adv': loss_adv.item(),
                    f'{flag}/fake_loss_clf': loss_clf.item(),
                    f'{flag}/fake_acc': acc,
                }
                slot_value = (iteration + 1, hps.patch_iters) + tuple([value for value in info.values()])
                log = 'patch_G:[%06d/%06d], loss_adv=%.2f, loss_clf=%.2f, acc=%.2f'
                print(log % slot_value)
                if iteration % 100 == 0:
                    for tag, value in info.items():
                        self.logger.scalar_summary(tag, value, iteration + 1)
                if iteration % 1000 == 0 or iteration + 1 == hps.patch_iters:
                    self.save_model(model_path, iteration + hps.iters)
        elif mode == 'train':
            for iteration in range(hps.iters):
                # anneal the adversarial weight alpha
                if iteration < hps.lat_sched_iters:
                    current_alpha = hps.alpha_enc * (iteration / hps.lat_sched_iters)
                else:
                    current_alpha = hps.alpha_enc
                #==================train D==================#
                for step in range(hps.n_latent_steps):
                    data = next(self.data_loader)
                    c, x = self.permute_data(data)
                    # encode
                    enc = self.encode_step(x)
                    # classify speaker
                    logits = self.clf_step(enc)
                    loss_clf = self.cal_loss(logits, c)
                    loss = hps.alpha_dis * loss_clf
                    # update
                    reset_grad([self.SpeakerClassifier])
                    loss.backward()
                    grad_clip([self.SpeakerClassifier], self.hps.max_grad_norm)
                    self.clf_opt.step()
                    # calculate acc
                    acc = cal_acc(logits, c)
                    info = {
                        f'{flag}/D_loss_clf': loss_clf.item(),
                        f'{flag}/D_acc': acc,
                    }
                    slot_value = (step, iteration + 1, hps.iters) + tuple([value for value in info.values()])
                    log = 'D-%d:[%06d/%06d], loss_clf=%.2f, acc=%.2f'
                    print(log % slot_value)
                    if iteration % 100 == 0:
                        for tag, value in info.items():
                            self.logger.scalar_summary(tag, value, iteration + 1)
                #==================train G==================#
                data = next(self.data_loader)
                c, x = self.permute_data(data)
                # encode
                enc = self.encode_step(x)
                # decode
                x_tilde = self.decode_step(enc, c)
                loss_rec = torch.mean(torch.abs(x_tilde - x))
                # classify speaker
                logits = self.clf_step(enc)
                acc = cal_acc(logits, c)
                loss_clf = self.cal_loss(logits, c)
                # maximize classification loss
                loss = loss_rec - current_alpha * loss_clf
                reset_grad([self.Encoder, self.Decoder])
                loss.backward()
                grad_clip([self.Encoder, self.Decoder], self.hps.max_grad_norm)
                self.ae_opt.step()
                info = {
                    f'{flag}/loss_rec': loss_rec.item(),
                    f'{flag}/G_loss_clf': loss_clf.item(),
                    f'{flag}/alpha': current_alpha,
                    f'{flag}/G_acc': acc,
                }
                slot_value = (iteration + 1, hps.iters) + tuple([value for value in info.values()])
                log = 'G:[%06d/%06d], loss_rec=%.3f, loss_clf=%.2f, alpha=%.2e, acc=%.2f'
                print(log % slot_value)
                if iteration % 100 == 0:
                    for tag, value in info.items():
                        self.logger.scalar_summary(tag, value, iteration + 1)
                if iteration % 1000 == 0 or iteration + 1 == hps.iters:
                    self.save_model(model_path, iteration)
================================================
FILE: test.py
================================================
import torch
from torch import optim
from torch.autograd import Variable
import numpy as np
import pickle
from utils import Hps
from preprocess.tacotron.norm_utils import spectrogram2wav, get_spectrograms
from scipy.io.wavfile import write
import glob
import os
import argparse
from solver import Solver

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-hps', help='The path of the hyper-parameter set', default='vctk.json')
    parser.add_argument('-model', '-m', help='The path of the model checkpoint')
    parser.add_argument('-source', '-s', help='The path of the source .wav file')
    parser.add_argument('-target', '-t', help='Target speaker id (integer). '
                        'Same order as the speaker list used when preprocessing (en_speaker_used.txt)')
    parser.add_argument('-output', '-o', help='Output .wav path')
    parser.add_argument('-sample_rate', '-sr', default=16000, type=int)
    # NOTE: with default=True and action='store_true', this flag is effectively always on.
    parser.add_argument('--use_gen', default=True, action='store_true')
    args = parser.parse_args()
    hps = Hps()
    hps.load(args.hps)
    hps_tuple = hps.get_tuple()
    solver = Solver(hps_tuple, None)
    solver.load_model(args.model)
    _, spec = get_spectrograms(args.source)
    spec_expand = np.expand_dims(spec, axis=0)
    spec_tensor = torch.from_numpy(spec_expand).type(torch.FloatTensor)
    c = Variable(torch.from_numpy(np.array([int(args.target)]))).cuda()
    result = solver.test_step(spec_tensor, c, gen=args.use_gen)
    result = result.squeeze(axis=0).transpose((1, 0))
    wav_data = spectrogram2wav(result)
    write(args.output, rate=args.sample_rate, data=wav_data)

================================================
FILE: utils.py
================================================
import json
import h5py
import pickle
import os
from collections import defaultdict
from collections import namedtuple
import numpy as np
import math
import argparse
import random
import time
import torch
from torch.utils import data
from tensorboardX import SummaryWriter
from torch.autograd import Variable

def cc(net):
    # move the module to GPU if one is available
    if torch.cuda.is_available():
        return net.cuda()
    else:
        return net

def gen_noise(x_dim, y_dim):
    x = torch.randn(x_dim, 1)
    y = torch.randn(1, y_dim)
    return x @ y

def to_var(x, requires_grad=True):
    x = Variable(x, requires_grad=requires_grad)
    return x.cuda() if torch.cuda.is_available() else x

def reset_grad(net_list):
    for net in net_list:
        net.zero_grad()

def grad_clip(net_list, max_grad_norm):
    for net in net_list:
        torch.nn.utils.clip_grad_norm_(net.parameters(), max_grad_norm)

def calculate_gradients_penalty(netD, real_data, fake_data):
    alpha = torch.rand(real_data.size(0))
    alpha = alpha.view(real_data.size(0), 1, 1)
    alpha = alpha.cuda() if torch.cuda.is_available() else alpha
    alpha = Variable(alpha)
    interpolates = alpha * real_data + (1 - alpha) * fake_data
    disc_interpolates = netD(interpolates)
    use_cuda = torch.cuda.is_available()
    grad_outputs = torch.ones(disc_interpolates.size()).cuda() if use_cuda \
        else torch.ones(disc_interpolates.size())
    gradients = torch.autograd.grad(
        outputs=disc_interpolates,
        inputs=interpolates,
        grad_outputs=grad_outputs,
        create_graph=True,
        retain_graph=True,
        only_inputs=True)[0]
    # WGAN-GP two-sided penalty: (||grad|| - 1)^2
    gradients_penalty = (1. - torch.sqrt(1e-12 + torch.sum(gradients.view(gradients.size(0), -1)**2, dim=1))) ** 2
    gradients_penalty = torch.mean(gradients_penalty)
    return gradients_penalty

def cal_acc(logits, y_true):
    _, ind = torch.max(logits, dim=1)
    acc = torch.sum((ind == y_true).type(torch.FloatTensor)) / y_true.size(0)
    return acc

class Hps(object):
    def __init__(self):
        self.hps = namedtuple('hps', [
            'lr',
            'alpha_dis',
            'alpha_enc',
            'beta_dis',
            'beta_gen',
            'beta_clf',
            'lambda_',
            'ns',
            'enc_dp',
            'dis_dp',
            'max_grad_norm',
            'seg_len',
            'emb_size',
            'n_speakers',
            'n_latent_steps',
            'n_patch_steps',
            'batch_size',
            'lat_sched_iters',
            'enc_pretrain_iters',
            'dis_pretrain_iters',
            'patch_iters',
            'iters',
        ])
        default = [1e-4, 1, 1e-4, 0, 0, 0, 10, 0.01, 0.5, 0.1, 5, 128, 128, 8,
                   5, 0, 32, 50000, 5000, 5000, 30000, 60000]
        self._hps = self.hps._make(default)

    def get_tuple(self):
        return self._hps

    def load(self, path):
        with open(path, 'r') as f_json:
            hps_dict = json.load(f_json)
        self._hps = self.hps(**hps_dict)

    def dump(self, path):
        with open(path, 'w') as f_json:
            json.dump(self._hps._asdict(), f_json, indent=4, separators=(',', ': '))

class DataLoader(object):
    def __init__(self, dataset, batch_size=16):
        self.dataset = dataset
        self.n_elements = len(self.dataset[0])
        self.batch_size = batch_size
        self.index = 0

    def all(self, size=1000):
        samples = [self.dataset[self.index + i] for i in range(size)]
        batch = [[s for s in sample] for sample in zip(*samples)]
        batch_tensor = [torch.from_numpy(np.array(data)) for data in batch]
        if self.index + 2 * self.batch_size >= len(self.dataset):
            self.index = 0
        else:
            self.index += self.batch_size
        return tuple(batch_tensor)

    def __iter__(self):
        return self

    def __next__(self):
        samples = [self.dataset[self.index + i] for i in range(self.batch_size)]
        batch = [[s for s in sample] for sample in zip(*samples)]
        batch_tensor = [torch.from_numpy(np.array(data)) for data in batch]
        if self.index + 2 * self.batch_size >= len(self.dataset):
            self.index = 0
        else:
            self.index += self.batch_size
        return tuple(batch_tensor)

class SingleDataset(data.Dataset):
    def __init__(self, h5_path, index_path, dset='train', seg_len=128):
        self.dataset = h5py.File(h5_path, 'r')
        with open(index_path) as f_index:
            self.indexes = json.load(f_index)
        self.indexer = namedtuple('index', ['speaker', 'i', 't'])
        self.seg_len = seg_len
        self.dset = dset

    def __getitem__(self, i):
        index = self.indexes[i]
        index = self.indexer(**index)
        speaker = index.speaker
        i, t = index.i, index.t
        seg_len = self.seg_len
        data = [speaker, self.dataset[f'{self.dset}/{i}'][t:t+seg_len]]
        return tuple(data)

    def __len__(self):
        return len(self.indexes)

class Logger(object):
    def __init__(self, log_dir='./log'):
        self.writer = SummaryWriter(log_dir)

    def scalar_summary(self, tag, value, step):
        self.writer.add_scalar(tag, value, step)

================================================
FILE: vctk.json
================================================
{
    "lr": 0.0001,
    "alpha_dis": 1,
    "alpha_enc": 0.01,
    "beta_dis": 1,
    "beta_gen": 1,
    "beta_clf": 1,
    "lambda_": 10,
    "ns": 0.01,
    "enc_dp": 0.5,
    "dis_dp": 0.3,
    "max_grad_norm": 5,
    "seg_len": 128,
    "emb_size": 512,
    "n_speakers": 20,
    "n_latent_steps": 5,
    "n_patch_steps": 5,
    "batch_size": 32,
    "lat_sched_iters": 50000,
    "enc_pretrain_iters": 8000,
    "dis_pretrain_iters": 20000,
    "patch_iters": 50000,
    "iters": 80000
}
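These values override the defaults hard-coded in utils.Hps. A minimal sketch of loading them (assuming the repo root as the working directory):

# --- editor's sketch: loading the hyper-parameter set ---
from utils import Hps

hps = Hps()
hps.load('vctk.json')        # replaces the built-in defaults
hps_tuple = hps.get_tuple()
print(hps_tuple.n_speakers)  # 20
print(hps_tuple.seg_len)     # 128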