Repository: liusongxiang/StarGAN-Voice-Conversion Branch: master Commit: a4633d8b4888 Files: 10 Total size: 51.6 KB Directory structure: gitextract_6c839ap3/ ├── README.md ├── convert.py ├── converted_samples/ │ └── Readme ├── data_loader.py ├── logger.py ├── main.py ├── model.py ├── preprocess.py ├── solver.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # StarGAN-Voice-Conversion This is a pytorch implementation of the paper: StarGAN-VC: Non-parallel many-to-many voice conversion with star generative adversarial networks https://arxiv.org/abs/1806.02169 . Note that the model architecture is a little different from that of the original paper. # Dependencies * Python 3.6 (or 3.5) * Pytorch 0.4.0 * pyworld * tqdm * librosa * tensorboardX and tensorboard # Usage ## Download Dataset Download and unzip [VCTK](https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html) corpus to designated directories. ```bash mkdir ./data wget https://datashare.is.ed.ac.uk/bitstream/handle/10283/2651/VCTK-Corpus.zip?sequence=2&isAllowed=y unzip VCTK-Corpus.zip -d ./data ``` If the downloaded VCTK is in tar.gz, run this: ```bash tar -xzvf VCTK-Corpus.tar.gz -C ./data ``` Preprocess data We will use Mel-cepstral coefficients(MCEPs) here. ```bash python preprocess.py --sample_rate 16000 \ --origin_wavpath data/VCTK-Corpus/wav48 \ --target_wavpath data/VCTK-Corpus/wav16 \ --mc_dir_train data/mc/train \ --mc_dir_test data/mc/test ``` Train model Note: you may need to early stop the training process if the training-time test samples sounds good or the you can also see the training loss curves to determine early stop or not. ``` python main.py ``` Convert For example: restore model at step 200000 and specify the source speaker and target speaker to `p262` and `p272`, respectively. ``` convert.py --resume_iters 200000 --src_spk p262 --trg_spk p272 ``` ## To-Do list - [x] Post some converted samples (Please find some converted samples in the `converted_samples` folder). ## Papers that use this repo: 1. [AUTOVC: Zero-Shot Voice Style Transfer with Only Autoencoder Loss (ICML2019)](https://arxiv.org/pdf/1905.05879v2.pdf) 2. [Blow: a single-scale hyperconditioned flow for non-parallel raw-audio voice conversion (NeurIPS 2019)](https://arxiv.org/pdf/1906.00794.pdf) 3. [ADAGAN: ADAPTIVE GAN FOR MANY-TO-MANY NON-PARALLEL VOICE CONVERSION (under review for ICLR 2020)](https://openreview.net/pdf?id=HJlk-eHFwH) ================================================ FILE: convert.py ================================================ import argparse from model import Generator from torch.autograd import Variable import torch import torch.nn.functional as F import numpy as np import os from os.path import join, basename, dirname, split import time import datetime from data_loader import to_categorical import librosa from utils import * import glob # Below is the accent info for the used 10 speakers. spk2acc = {'262': 'Edinburgh', #F '272': 'Edinburgh', #M '229': 'SouthEngland', #F '232': 'SouthEngland', #M '292': 'NorthernIrishBelfast', #M '293': 'NorthernIrishBelfast', #F '360': 'AmericanNewJersey', #M '361': 'AmericanNewJersey', #F '248': 'India', #F '251': 'India'} #M speakers = ['p262', 'p272', 'p229', 'p232', 'p292', 'p293', 'p360', 'p361', 'p248', 'p251'] spk2idx = dict(zip(speakers, range(len(speakers)))) class TestDataset(object): """Dataset for testing.""" def __init__(self, config): assert config.trg_spk in speakers, f'The trg_spk should be chosen from {speakers}, but you choose {trg_spk}.' # Source speaker self.src_spk = config.src_spk self.trg_spk = config.trg_spk self.mc_files = sorted(glob.glob(join(config.test_data_dir, f'{config.src_spk}*.npy'))) self.src_spk_stats = np.load(join(config.train_data_dir, f'{config.src_spk}_stats.npz')) self.src_wav_dir = f'{config.wav_dir}/{config.src_spk}' self.trg_spk_stats = np.load(join(config.train_data_dir, f'{config.trg_spk}_stats.npz')) self.logf0s_mean_src = self.src_spk_stats['log_f0s_mean'] self.logf0s_std_src = self.src_spk_stats['log_f0s_std'] self.logf0s_mean_trg = self.trg_spk_stats['log_f0s_mean'] self.logf0s_std_trg = self.trg_spk_stats['log_f0s_std'] self.mcep_mean_src = self.src_spk_stats['coded_sps_mean'] self.mcep_std_src = self.src_spk_stats['coded_sps_std'] self.mcep_mean_trg = self.trg_spk_stats['coded_sps_mean'] self.mcep_std_trg = self.trg_spk_stats['coded_sps_std'] self.spk_idx = spk2idx[config.trg_spk] spk_cat = to_categorical([self.spk_idx], num_classes=len(speakers)) self.spk_c_trg = spk_cat def get_batch_test_data(self, batch_size=4): batch_data = [] for i in range(batch_size): mcfile = self.mc_files[i] filename = basename(mcfile).split('-')[-1] wavfile_path = join(self.src_wav_dir, filename.replace('npy', 'wav')) batch_data.append(wavfile_path) return batch_data def load_wav(wavfile, sr=16000): wav, _ = librosa.load(wavfile, sr=sr, mono=True) return wav_padding(wav, sr=sr, frame_period=5, multiple = 4) # TODO # return wav def test(config): os.makedirs(join(config.convert_dir, str(config.resume_iters)), exist_ok=True) sampling_rate, num_mcep, frame_period=16000, 36, 5 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') G = Generator().to(device) test_loader = TestDataset(config) # Restore model print(f'Loading the trained models from step {config.resume_iters}...') G_path = join(config.model_save_dir, f'{config.resume_iters}-G.ckpt') G.load_state_dict(torch.load(G_path, map_location=lambda storage, loc: storage)) # Read a batch of testdata test_wavfiles = test_loader.get_batch_test_data(batch_size=config.num_converted_wavs) test_wavs = [load_wav(wavfile, sampling_rate) for wavfile in test_wavfiles] with torch.no_grad(): for idx, wav in enumerate(test_wavs): print(len(wav)) wav_name = basename(test_wavfiles[idx]) # print(wav_name) f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate, frame_period=frame_period) f0_converted = pitch_conversion(f0=f0, mean_log_src=test_loader.logf0s_mean_src, std_log_src=test_loader.logf0s_std_src, mean_log_target=test_loader.logf0s_mean_trg, std_log_target=test_loader.logf0s_std_trg) coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_mcep) print("Before being fed into G: ", coded_sp.shape) coded_sp_norm = (coded_sp - test_loader.mcep_mean_src) / test_loader.mcep_std_src coded_sp_norm_tensor = torch.FloatTensor(coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device) spk_conds = torch.FloatTensor(test_loader.spk_c_trg).to(device) # print(spk_conds.size()) coded_sp_converted_norm = G(coded_sp_norm_tensor, spk_conds).data.cpu().numpy() coded_sp_converted = np.squeeze(coded_sp_converted_norm).T * test_loader.mcep_std_trg + test_loader.mcep_mean_trg coded_sp_converted = np.ascontiguousarray(coded_sp_converted) print("After being fed into G: ", coded_sp_converted.shape) wav_transformed = world_speech_synthesis(f0=f0_converted, coded_sp=coded_sp_converted, ap=ap, fs=sampling_rate, frame_period=frame_period) wav_id = wav_name.split('.')[0] librosa.output.write_wav(join(config.convert_dir, str(config.resume_iters), f'{wav_id}-vcto-{test_loader.trg_spk}.wav'), wav_transformed, sampling_rate) if [True, False][0]: wav_cpsyn = world_speech_synthesis(f0=f0, coded_sp=coded_sp, ap=ap, fs=sampling_rate, frame_period=frame_period) librosa.output.write_wav(join(config.convert_dir, str(config.resume_iters), f'cpsyn-{wav_name}'), wav_cpsyn, sampling_rate) if __name__ == '__main__': parser = argparse.ArgumentParser() # Model configuration. parser.add_argument('--num_speakers', type=int, default=10, help='dimension of speaker labels') parser.add_argument('--num_converted_wavs', type=int, default=8, help='number of wavs to convert.') parser.add_argument('--resume_iters', type=int, default=None, help='step to resume for testing.') parser.add_argument('--src_spk', type=str, default='p262', help = 'target speaker.') parser.add_argument('--trg_spk', type=str, default='p272', help = 'target speaker.') # Directories. parser.add_argument('--train_data_dir', type=str, default='./data/mc/train') parser.add_argument('--test_data_dir', type=str, default='./data/mc/test') parser.add_argument('--wav_dir', type=str, default="./data/VCTK-Corpus/wav16") parser.add_argument('--log_dir', type=str, default='./logs') parser.add_argument('--model_save_dir', type=str, default='./models') parser.add_argument('--convert_dir', type=str, default='./converted') config = parser.parse_args() print(config) if config.resume_iters is None: raise RuntimeError("Please specify the step number for resuming.") test(config) ================================================ FILE: converted_samples/Readme ================================================ cpsyn stands for copy-synthesis for references. These converted samples were obtained from not-well-fine-tuned model. If you want to get better results, please tune the hyper-parameters carefully. ================================================ FILE: data_loader.py ================================================ from torch.utils import data import torch import os import random import glob from os.path import join, basename, dirname, split import numpy as np # Below is the accent info for the used 10 speakers. spk2acc = {'262': 'Edinburgh', #F '272': 'Edinburgh', #M '229': 'SouthEngland', #F '232': 'SouthEngland', #M '292': 'NorthernIrishBelfast', #M '293': 'NorthernIrishBelfast', #F '360': 'AmericanNewJersey', #M '361': 'AmericanNewJersey', #F '248': 'India', #F '251': 'India'} #M min_length = 256 # Since we slice 256 frames from each utterance when training. # Build a dict useful when we want to get one-hot representation of speakers. speakers = ['p262', 'p272', 'p229', 'p232', 'p292', 'p293', 'p360', 'p361', 'p248', 'p251'] spk2idx = dict(zip(speakers, range(len(speakers)))) def to_categorical(y, num_classes=None): """Converts a class vector (integers) to binary class matrix. E.g. for use with categorical_crossentropy. # Arguments y: class vector to be converted into a matrix (integers from 0 to num_classes). num_classes: total number of classes. # Returns A binary matrix representation of the input. The classes axis is placed last. From Keras np_utils """ y = np.array(y, dtype='int') input_shape = y.shape if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: input_shape = tuple(input_shape[:-1]) y = y.ravel() if not num_classes: num_classes = np.max(y) + 1 n = y.shape[0] categorical = np.zeros((n, num_classes), dtype=np.float32) categorical[np.arange(n), y] = 1 output_shape = input_shape + (num_classes,) categorical = np.reshape(categorical, output_shape) return categorical class MyDataset(data.Dataset): """Dataset for MCEP features and speaker labels.""" def __init__(self, data_dir): mc_files = glob.glob(join(data_dir, '*.npy')) mc_files = [i for i in mc_files if basename(i)[:4] in speakers] self.mc_files = self.rm_too_short_utt(mc_files) self.num_files = len(self.mc_files) print("\t Number of training samples: ", self.num_files) for f in self.mc_files: mc = np.load(f) if mc.shape[0] <= min_length: print(f) raise RuntimeError(f"The data may be corrupted! We need all MCEP features having more than {min_length} frames!") def rm_too_short_utt(self, mc_files, min_length=min_length): new_mc_files = [] for mcfile in mc_files: mc = np.load(mcfile) if mc.shape[0] > min_length: new_mc_files.append(mcfile) return new_mc_files def sample_seg(self, feat, sample_len=min_length): assert feat.shape[0] - sample_len >= 0 s = np.random.randint(0, feat.shape[0] - sample_len + 1) return feat[s:s+sample_len, :] def __len__(self): return self.num_files def __getitem__(self, index): filename = self.mc_files[index] spk = basename(filename).split('_')[0] spk_idx = spk2idx[spk] mc = np.load(filename) mc = self.sample_seg(mc) mc = np.transpose(mc, (1, 0)) # (T, D) -> (D, T), since pytorch need feature having shape # to one-hot spk_cat = np.squeeze(to_categorical([spk_idx], num_classes=len(speakers))) return torch.FloatTensor(mc), torch.LongTensor([spk_idx]).squeeze_(), torch.FloatTensor(spk_cat) class TestDataset(object): """Dataset for testing.""" def __init__(self, data_dir, wav_dir, src_spk='p262', trg_spk='p272'): self.src_spk = src_spk self.trg_spk = trg_spk self.mc_files = sorted(glob.glob(join(data_dir, '{}*.npy'.format(self.src_spk)))) self.src_spk_stats = np.load(join(data_dir.replace('test', 'train'), '{}_stats.npz'.format(src_spk))) self.trg_spk_stats = np.load(join(data_dir.replace('test', 'train'), '{}_stats.npz'.format(trg_spk))) self.logf0s_mean_src = self.src_spk_stats['log_f0s_mean'] self.logf0s_std_src = self.src_spk_stats['log_f0s_std'] self.logf0s_mean_trg = self.trg_spk_stats['log_f0s_mean'] self.logf0s_std_trg = self.trg_spk_stats['log_f0s_std'] self.mcep_mean_src = self.src_spk_stats['coded_sps_mean'] self.mcep_std_src = self.src_spk_stats['coded_sps_std'] self.mcep_mean_trg = self.trg_spk_stats['coded_sps_mean'] self.mcep_std_trg = self.trg_spk_stats['coded_sps_std'] self.src_wav_dir = f'{wav_dir}/{src_spk}' self.spk_idx = spk2idx[trg_spk] spk_cat = to_categorical([self.spk_idx], num_classes=len(speakers)) self.spk_c_trg = spk_cat def get_batch_test_data(self, batch_size=8): batch_data = [] for i in range(batch_size): mcfile = self.mc_files[i] filename = basename(mcfile).split('-')[-1] wavfile_path = join(self.src_wav_dir, filename.replace('npy', 'wav')) batch_data.append(wavfile_path) return batch_data def get_loader(data_dir, batch_size=32, mode='train', num_workers=1): dataset = MyDataset(data_dir) data_loader = data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=(mode=='train'), num_workers=num_workers, drop_last=True) return data_loader if __name__ == '__main__': loader = get_loader('./data/mc/train') data_iter = iter(loader) for i in range(10): mc, spk_idx, acc_idx, spk_acc_cat = next(data_iter) print('-'*50) print(mc.size()) print(spk_idx.size()) print(acc_idx.size()) print(spk_acc_cat.size()) print(spk_idx.squeeze_()) print(spk_acc_cat) print('-'*50) ================================================ FILE: logger.py ================================================ # import tensorflow as tf from tensorboardX import SummaryWriter # class Logger(object): # """Tensorflow Tensorboard logger.""" # def __init__(self, log_dir): # """Initialize summary writer.""" # self.writer = tf.summary.FileWriter(log_dir) # def scalar_summary(self, tag, value, step): # """Add scalar summary.""" # summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) # self.writer.add_summary(summary, step) class Logger(object): """Using tensorboardX such that need no dependency on tensorflow.""" def __init__(self, log_dir): """Initialize summary writer.""" self.writer = SummaryWriter(log_dir) def scalar_summary(self, tag, value, step): self.writer.add_scalar(tag, value, step) ================================================ FILE: main.py ================================================ import os import argparse from solver import Solver from data_loader import get_loader, TestDataset from torch.backends import cudnn def str2bool(v): return v.lower() in ('true') def main(config): # For fast training. cudnn.benchmark = True # Create directories if not exist. if not os.path.exists(config.log_dir): os.makedirs(config.log_dir) if not os.path.exists(config.model_save_dir): os.makedirs(config.model_save_dir) if not os.path.exists(config.sample_dir): os.makedirs(config.sample_dir) # Data loader. train_loader = get_loader(config.train_data_dir, config.batch_size, 'train', num_workers=config.num_workers) test_loader = TestDataset(config.test_data_dir, config.wav_dir, src_spk='p262', trg_spk='p272') # Solver for training and testing StarGAN. solver = Solver(train_loader, test_loader, config) if config.mode == 'train': solver.train() # elif config.mode == 'test': # solver.test() if __name__ == '__main__': parser = argparse.ArgumentParser() # Model configuration. parser.add_argument('--num_speakers', type=int, default=10, help='dimension of speaker labels') parser.add_argument('--lambda_cls', type=float, default=10, help='weight for domain classification loss') parser.add_argument('--lambda_rec', type=float, default=10, help='weight for reconstruction loss') parser.add_argument('--lambda_gp', type=float, default=10, help='weight for gradient penalty') parser.add_argument('--sampling_rate', type=int, default=16000, help='sampling rate') # Training configuration. parser.add_argument('--batch_size', type=int, default=32, help='mini-batch size') parser.add_argument('--num_iters', type=int, default=200000, help='number of total iterations for training D') parser.add_argument('--num_iters_decay', type=int, default=100000, help='number of iterations for decaying lr') parser.add_argument('--g_lr', type=float, default=0.0001, help='learning rate for G') parser.add_argument('--d_lr', type=float, default=0.0001, help='learning rate for D') parser.add_argument('--n_critic', type=int, default=5, help='number of D updates per each G update') parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for Adam optimizer') parser.add_argument('--beta2', type=float, default=0.999, help='beta2 for Adam optimizer') parser.add_argument('--resume_iters', type=int, default=None, help='resume training from this step') # Test configuration. parser.add_argument('--test_iters', type=int, default=100000, help='test model from this step') # Miscellaneous. parser.add_argument('--num_workers', type=int, default=1) parser.add_argument('--mode', type=str, default='train', choices=['train', 'test']) parser.add_argument('--use_tensorboard', type=str2bool, default=True) # Directories. parser.add_argument('--train_data_dir', type=str, default='./data/mc/train') parser.add_argument('--test_data_dir', type=str, default='./data/mc/test') parser.add_argument('--wav_dir', type=str, default="./data/VCTK-Corpus/wav16") parser.add_argument('--log_dir', type=str, default='./logs') parser.add_argument('--model_save_dir', type=str, default='./models') parser.add_argument('--sample_dir', type=str, default='./samples') # Step size. parser.add_argument('--log_step', type=int, default=10) parser.add_argument('--sample_step', type=int, default=1000) parser.add_argument('--model_save_step', type=int, default=1000) parser.add_argument('--lr_update_step', type=int, default=1000) config = parser.parse_args() print(config) main(config) ================================================ FILE: model.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import numpy as np # from data_loader import get_loader class ResidualBlock(nn.Module): """Residual Block with instance normalization.""" def __init__(self, dim_in, dim_out): super(ResidualBlock, self).__init__() self.main = nn.Sequential( nn.Conv2d(dim_in, dim_out, kernel_size=3, stride=1, padding=1, bias=False), nn.InstanceNorm2d(dim_out, affine=True, track_running_stats=True), nn.ReLU(inplace=True), nn.Conv2d(dim_out, dim_out, kernel_size=3, stride=1, padding=1, bias=False), nn.InstanceNorm2d(dim_out, affine=True, track_running_stats=True)) def forward(self, x): return x + self.main(x) class Generator(nn.Module): """Generator network.""" def __init__(self, conv_dim=64, num_speakers=10, repeat_num=6): super(Generator, self).__init__() c_dim = num_speakers layers = [] layers.append(nn.Conv2d(1+c_dim, conv_dim, kernel_size=(3, 9), padding=(1, 4), bias=False)) layers.append(nn.InstanceNorm2d(conv_dim, affine=True, track_running_stats=True)) layers.append(nn.ReLU(inplace=True)) # Down-sampling layers. curr_dim = conv_dim for i in range(2): layers.append(nn.Conv2d(curr_dim, curr_dim*2, kernel_size=(4, 8), stride=(2, 2), padding=(1, 3), bias=False)) layers.append(nn.InstanceNorm2d(curr_dim*2, affine=True, track_running_stats=True)) layers.append(nn.ReLU(inplace=True)) curr_dim = curr_dim * 2 # Bottleneck layers. for i in range(repeat_num): layers.append(ResidualBlock(dim_in=curr_dim, dim_out=curr_dim)) # Up-sampling layers. for i in range(2): layers.append(nn.ConvTranspose2d(curr_dim, curr_dim//2, kernel_size=4, stride=2, padding=1, bias=False)) layers.append(nn.InstanceNorm2d(curr_dim//2, affine=True, track_running_stats=True)) layers.append(nn.ReLU(inplace=True)) curr_dim = curr_dim // 2 layers.append(nn.Conv2d(curr_dim, 1, kernel_size=7, stride=1, padding=3, bias=False)) self.main = nn.Sequential(*layers) def forward(self, x, c): # Replicate spatially and concatenate domain information. c = c.view(c.size(0), c.size(1), 1, 1) c = c.repeat(1, 1, x.size(2), x.size(3)) x = torch.cat([x, c], dim=1) return self.main(x) class Discriminator(nn.Module): """Discriminator network with PatchGAN.""" def __init__(self, input_size=(36, 256), conv_dim=64, repeat_num=5, num_speakers=10): super(Discriminator, self).__init__() layers = [] layers.append(nn.Conv2d(1, conv_dim, kernel_size=4, stride=2, padding=1)) layers.append(nn.LeakyReLU(0.01)) curr_dim = conv_dim for i in range(1, repeat_num): layers.append(nn.Conv2d(curr_dim, curr_dim*2, kernel_size=4, stride=2, padding=1)) layers.append(nn.LeakyReLU(0.01)) curr_dim = curr_dim * 2 kernel_size_0 = int(input_size[0] / np.power(2, repeat_num)) # 1 kernel_size_1 = int(input_size[1] / np.power(2, repeat_num)) # 8 self.main = nn.Sequential(*layers) self.conv_dis = nn.Conv2d(curr_dim, 1, kernel_size=(kernel_size_0, kernel_size_1), stride=1, padding=0, bias=False) # padding should be 0 self.conv_clf_spks = nn.Conv2d(curr_dim, num_speakers, kernel_size=(kernel_size_0, kernel_size_1), stride=1, padding=0, bias=False) # for num_speaker def forward(self, x): h = self.main(x) out_src = self.conv_dis(h) out_cls_spks = self.conv_clf_spks(h) return out_src, out_cls_spks.view(out_cls_spks.size(0), out_cls_spks.size(1)) if __name__ == '__main__': device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') train_loader = get_loader('/scratch/sxliu/data_exp/VCTK-Corpus-22.05k/mc/train', 16, 'train', num_workers=1) data_iter = iter(train_loader) G = Generator().to(device) D = Discriminator().to(device) for i in range(10): mc_real, spk_label_org, acc_label_org, spk_acc_c_org = next(data_iter) mc_real.unsqueeze_(1) # (B, D, T) -> (B, 1, D, T) for conv2d mc_real = mc_real.to(device) # Input mc. spk_label_org = spk_label_org.to(device) # Original spk labels. acc_label_org = acc_label_org.to(device) # Original acc labels. spk_acc_c_org = spk_acc_c_org.to(device) # Original spk acc conditioning. mc_fake = G(mc_real, spk_acc_c_org) print(mc_fake.size()) out_src, out_cls_spks, out_cls_emos = D(mc_fake) ================================================ FILE: preprocess.py ================================================ import librosa import numpy as np import os, sys import argparse import pyworld from multiprocessing import cpu_count from concurrent.futures import ProcessPoolExecutor from functools import partial from utils import * from tqdm import tqdm from collections import defaultdict from collections import namedtuple from sklearn.model_selection import train_test_split import glob from os.path import join, basename import subprocess def resample(spk, origin_wavpath, target_wavpath): wavfiles = [i for i in os.listdir(join(origin_wavpath, spk)) if i.endswith(".wav")] for wav in wavfiles: folder_to = join(target_wavpath, spk) os.makedirs(folder_to, exist_ok=True) wav_to = join(folder_to, wav) wav_from = join(origin_wavpath, spk, wav) subprocess.call(['sox', wav_from, "-r", "16000", wav_to]) return 0 def resample_to_16k(origin_wavpath, target_wavpath, num_workers=1): os.makedirs(target_wavpath, exist_ok=True) spk_folders = os.listdir(origin_wavpath) print(f"> Using {num_workers} workers!") executor = ProcessPoolExecutor(max_workers=num_workers) futures = [] for spk in spk_folders: futures.append(executor.submit(partial(resample, spk, origin_wavpath, target_wavpath))) result_list = [future.result() for future in tqdm(futures)] print(result_list) def split_data(paths): indices = np.arange(len(paths)) test_size = 0.1 train_indices, test_indices = train_test_split(indices, test_size=test_size, random_state=1234) train_paths = list(np.array(paths)[train_indices]) test_paths = list(np.array(paths)[test_indices]) return train_paths, test_paths def get_spk_world_feats(spk_fold_path, mc_dir_train, mc_dir_test, sample_rate=16000): paths = glob.glob(join(spk_fold_path, '*.wav')) spk_name = basename(spk_fold_path) train_paths, test_paths = split_data(paths) f0s = [] coded_sps = [] for wav_file in train_paths: f0, _, _, _, coded_sp = world_encode_wav(wav_file, fs=sample_rate) f0s.append(f0) coded_sps.append(coded_sp) log_f0s_mean, log_f0s_std = logf0_statistics(f0s) coded_sps_mean, coded_sps_std = coded_sp_statistics(coded_sps) np.savez(join(mc_dir_train, spk_name+'_stats.npz'), log_f0s_mean=log_f0s_mean, log_f0s_std=log_f0s_std, coded_sps_mean=coded_sps_mean, coded_sps_std=coded_sps_std) for wav_file in tqdm(train_paths): wav_nam = basename(wav_file) f0, timeaxis, sp, ap, coded_sp = world_encode_wav(wav_file, fs=sample_rate) normed_coded_sp = normalize_coded_sp(coded_sp, coded_sps_mean, coded_sps_std) np.save(join(mc_dir_train, wav_nam.replace('.wav', '.npy')), normed_coded_sp, allow_pickle=False) for wav_file in tqdm(test_paths): wav_nam = basename(wav_file) f0, timeaxis, sp, ap, coded_sp = world_encode_wav(wav_file, fs=sample_rate) normed_coded_sp = normalize_coded_sp(coded_sp, coded_sps_mean, coded_sps_std) np.save(join(mc_dir_test, wav_nam.replace('.wav', '.npy')), normed_coded_sp, allow_pickle=False) return 0 if __name__ == '__main__': parser = argparse.ArgumentParser() sample_rate_default = 16000 origin_wavpath_default = "./data/VCTK-Corpus/wav48" target_wavpath_default = "./data/VCTK-Corpus/wav16" mc_dir_train_default = './data/mc/train' mc_dir_test_default = './data/mc/test' parser.add_argument("--sample_rate", type = int, default = 16000, help = "Sample rate.") parser.add_argument("--origin_wavpath", type = str, default = origin_wavpath_default, help = "The original wav path to resample.") parser.add_argument("--target_wavpath", type = str, default = target_wavpath_default, help = "The original wav path to resample.") parser.add_argument("--mc_dir_train", type = str, default = mc_dir_train_default, help = "The directory to store the training features.") parser.add_argument("--mc_dir_test", type = str, default = mc_dir_test_default, help = "The directory to store the testing features.") parser.add_argument("--num_workers", type = int, default = None, help = "The number of cpus to use.") argv = parser.parse_args() sample_rate = argv.sample_rate origin_wavpath = argv.origin_wavpath target_wavpath = argv.target_wavpath mc_dir_train = argv.mc_dir_train mc_dir_test = argv.mc_dir_test num_workers = argv.num_workers if argv.num_workers is not None else cpu_count() # The original wav in VCTK is 48K, first we want to resample to 16K resample_to_16k(origin_wavpath, target_wavpath, num_workers=num_workers) # WE only use 10 speakers listed below for this experiment. speaker_used = ['262', '272', '229', '232', '292', '293', '360', '361', '248', '251'] speaker_used = ['p'+i for i in speaker_used] ## Next we are to extract the acoustic features (MCEPs, lf0) and compute the corresponding stats (means, stds). # Make dirs to contain the MCEPs os.makedirs(mc_dir_train, exist_ok=True) os.makedirs(mc_dir_test, exist_ok=True) num_workers = len(speaker_used) #cpu_count() print("number of workers: ", num_workers) executor = ProcessPoolExecutor(max_workers=num_workers) work_dir = target_wavpath # spk_folders = os.listdir(work_dir) # print("processing {} speaker folders".format(len(spk_folders))) # print(spk_folders) futures = [] for spk in speaker_used: spk_path = os.path.join(work_dir, spk) futures.append(executor.submit(partial(get_spk_world_feats, spk_path, mc_dir_train, mc_dir_test, sample_rate))) result_list = [future.result() for future in tqdm(futures)] print(result_list) sys.exit(0) ================================================ FILE: solver.py ================================================ from model import Generator from model import Discriminator from torch.autograd import Variable from torchvision.utils import save_image import torch import torch.nn.functional as F import numpy as np import os from os.path import join, basename, dirname, split import time import datetime from data_loader import to_categorical import librosa from utils import * from tqdm import tqdm class Solver(object): """Solver for training and testing StarGAN.""" def __init__(self, train_loader, test_loader, config): """Initialize configurations.""" # Data loader. self.train_loader = train_loader self.test_loader = test_loader self.sampling_rate = config.sampling_rate # Model configurations. self.num_speakers = config.num_speakers self.lambda_cls = config.lambda_cls self.lambda_rec = config.lambda_rec self.lambda_gp = config.lambda_gp # Training configurations. self.batch_size = config.batch_size self.num_iters = config.num_iters self.num_iters_decay = config.num_iters_decay self.g_lr = config.g_lr self.d_lr = config.d_lr self.n_critic = config.n_critic self.beta1 = config.beta1 self.beta2 = config.beta2 self.resume_iters = config.resume_iters # Test configurations. self.test_iters = config.test_iters # Miscellaneous. self.use_tensorboard = config.use_tensorboard self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Directories. self.log_dir = config.log_dir self.sample_dir = config.sample_dir self.model_save_dir = config.model_save_dir # Step size. self.log_step = config.log_step self.sample_step = config.sample_step self.model_save_step = config.model_save_step self.lr_update_step = config.lr_update_step # Build the model and tensorboard. self.build_model() if self.use_tensorboard: self.build_tensorboard() def build_model(self): """Create a generator and a discriminator.""" self.G = Generator(num_speakers=self.num_speakers) self.D = Discriminator(num_speakers=self.num_speakers) self.g_optimizer = torch.optim.Adam(self.G.parameters(), self.g_lr, [self.beta1, self.beta2]) self.d_optimizer = torch.optim.Adam(self.D.parameters(), self.d_lr, [self.beta1, self.beta2]) self.print_network(self.G, 'G') self.print_network(self.D, 'D') self.G.to(self.device) self.D.to(self.device) def print_network(self, model, name): """Print out the network information.""" num_params = 0 for p in model.parameters(): num_params += p.numel() print(model) print(name) print("The number of parameters: {}".format(num_params)) def restore_model(self, resume_iters): """Restore the trained generator and discriminator.""" print('Loading the trained models from step {}...'.format(resume_iters)) G_path = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(resume_iters)) D_path = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(resume_iters)) self.G.load_state_dict(torch.load(G_path, map_location=lambda storage, loc: storage)) self.D.load_state_dict(torch.load(D_path, map_location=lambda storage, loc: storage)) def build_tensorboard(self): """Build a tensorboard logger.""" from logger import Logger self.logger = Logger(self.log_dir) def update_lr(self, g_lr, d_lr): """Decay learning rates of the generator and discriminator.""" for param_group in self.g_optimizer.param_groups: param_group['lr'] = g_lr for param_group in self.d_optimizer.param_groups: param_group['lr'] = d_lr def reset_grad(self): """Reset the gradient buffers.""" self.g_optimizer.zero_grad() self.d_optimizer.zero_grad() def denorm(self, x): """Convert the range from [-1, 1] to [0, 1].""" out = (x + 1) / 2 return out.clamp_(0, 1) def gradient_penalty(self, y, x): """Compute gradient penalty: (L2_norm(dy/dx) - 1)**2.""" weight = torch.ones(y.size()).to(self.device) dydx = torch.autograd.grad(outputs=y, inputs=x, grad_outputs=weight, retain_graph=True, create_graph=True, only_inputs=True)[0] dydx = dydx.view(dydx.size(0), -1) dydx_l2norm = torch.sqrt(torch.sum(dydx**2, dim=1)) return torch.mean((dydx_l2norm-1)**2) def label2onehot(self, labels, dim): """Convert label indices to one-hot vectors.""" batch_size = labels.size(0) out = torch.zeros(batch_size, dim) out[np.arange(batch_size), labels.long()] = 1 return out def sample_spk_c(self, size): spk_c = np.random.randint(0, self.num_speakers, size=size) spk_c_cat = to_categorical(spk_c, self.num_speakers) return torch.LongTensor(spk_c), torch.FloatTensor(spk_c_cat) def classification_loss(self, logit, target): """Compute softmax cross entropy loss.""" return F.cross_entropy(logit, target) def load_wav(self, wavfile, sr=16000): wav, _ = librosa.load(wavfile, sr=sr, mono=True) return wav_padding(wav, sr=16000, frame_period=5, multiple = 4) # TODO def train(self): """Train StarGAN.""" # Set data loader. train_loader = self.train_loader data_iter = iter(train_loader) # Read a batch of testdata test_wavfiles = self.test_loader.get_batch_test_data(batch_size=4) test_wavs = [self.load_wav(wavfile) for wavfile in test_wavfiles] # Determine whether do copysynthesize when first do training-time conversion test. cpsyn_flag = [True, False][0] # f0, timeaxis, sp, ap = world_decompose(wav = wav, fs = sampling_rate, frame_period = frame_period) # Learning rate cache for decaying. g_lr = self.g_lr d_lr = self.d_lr # Start training from scratch or resume training. start_iters = 0 if self.resume_iters: print("resuming step %d ..."% self.resume_iters) start_iters = self.resume_iters self.restore_model(self.resume_iters) # Start training. print('Start training...') start_time = time.time() for i in range(start_iters, self.num_iters): # =================================================================================== # # 1. Preprocess input data # # =================================================================================== # # Fetch labels. try: mc_real, spk_label_org, spk_c_org = next(data_iter) except: data_iter = iter(train_loader) mc_real, spk_label_org, spk_c_org = next(data_iter) mc_real.unsqueeze_(1) # (B, D, T) -> (B, 1, D, T) for conv2d # Generate target domain labels randomly. # spk_label_trg: int, spk_c_trg:one-hot representation spk_label_trg, spk_c_trg = self.sample_spk_c(mc_real.size(0)) mc_real = mc_real.to(self.device) # Input mc. spk_label_org = spk_label_org.to(self.device) # Original spk labels. spk_c_org = spk_c_org.to(self.device) # Original spk acc conditioning. spk_label_trg = spk_label_trg.to(self.device) # Target spk labels for classification loss for G. spk_c_trg = spk_c_trg.to(self.device) # Target spk conditioning. # =================================================================================== # # 2. Train the discriminator # # =================================================================================== # # Compute loss with real mc feats. out_src, out_cls_spks = self.D(mc_real) d_loss_real = - torch.mean(out_src) d_loss_cls_spks = self.classification_loss(out_cls_spks, spk_label_org) # Compute loss with fake mc feats. mc_fake = self.G(mc_real, spk_c_trg) out_src, out_cls_spks = self.D(mc_fake.detach()) d_loss_fake = torch.mean(out_src) # Compute loss for gradient penalty. alpha = torch.rand(mc_real.size(0), 1, 1, 1).to(self.device) x_hat = (alpha * mc_real.data + (1 - alpha) * mc_fake.data).requires_grad_(True) out_src, _ = self.D(x_hat) d_loss_gp = self.gradient_penalty(out_src, x_hat) # Backward and optimize. d_loss = d_loss_real + d_loss_fake + self.lambda_cls * d_loss_cls_spks + self.lambda_gp * d_loss_gp self.reset_grad() d_loss.backward() self.d_optimizer.step() # Logging. loss = {} loss['D/loss_real'] = d_loss_real.item() loss['D/loss_fake'] = d_loss_fake.item() loss['D/loss_cls_spks'] = d_loss_cls_spks.item() loss['D/loss_gp'] = d_loss_gp.item() # =================================================================================== # # 3. Train the generator # # =================================================================================== # if (i+1) % self.n_critic == 0: # Original-to-target domain. mc_fake = self.G(mc_real, spk_c_trg) out_src, out_cls_spks = self.D(mc_fake) g_loss_fake = - torch.mean(out_src) g_loss_cls_spks = self.classification_loss(out_cls_spks, spk_label_trg) # Target-to-original domain. mc_reconst = self.G(mc_fake, spk_c_org) g_loss_rec = torch.mean(torch.abs(mc_real - mc_reconst)) # Backward and optimize. g_loss = g_loss_fake + self.lambda_rec * g_loss_rec + self.lambda_cls * g_loss_cls_spks self.reset_grad() g_loss.backward() self.g_optimizer.step() # Logging. loss['G/loss_fake'] = g_loss_fake.item() loss['G/loss_rec'] = g_loss_rec.item() loss['G/loss_cls_spks'] = g_loss_cls_spks.item() # =================================================================================== # # 4. Miscellaneous # # =================================================================================== # # Print out training information. if (i+1) % self.log_step == 0: et = time.time() - start_time et = str(datetime.timedelta(seconds=et))[:-7] log = "Elapsed [{}], Iteration [{}/{}]".format(et, i+1, self.num_iters) for tag, value in loss.items(): log += ", {}: {:.4f}".format(tag, value) print(log) if self.use_tensorboard: for tag, value in loss.items(): self.logger.scalar_summary(tag, value, i+1) if (i+1) % self.sample_step == 0: sampling_rate=16000 num_mcep=36 frame_period=5 with torch.no_grad(): for idx, wav in tqdm(enumerate(test_wavs)): wav_name = basename(test_wavfiles[idx]) # print(wav_name) f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate, frame_period=frame_period) f0_converted = pitch_conversion(f0=f0, mean_log_src=self.test_loader.logf0s_mean_src, std_log_src=self.test_loader.logf0s_std_src, mean_log_target=self.test_loader.logf0s_mean_trg, std_log_target=self.test_loader.logf0s_std_trg) coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_mcep) coded_sp_norm = (coded_sp - self.test_loader.mcep_mean_src) / self.test_loader.mcep_std_src coded_sp_norm_tensor = torch.FloatTensor(coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(self.device) conds = torch.FloatTensor(self.test_loader.spk_c_trg).to(self.device) # print(conds.size()) coded_sp_converted_norm = self.G(coded_sp_norm_tensor, conds).data.cpu().numpy() coded_sp_converted = np.squeeze(coded_sp_converted_norm).T * self.test_loader.mcep_std_trg + self.test_loader.mcep_mean_trg coded_sp_converted = np.ascontiguousarray(coded_sp_converted) # decoded_sp_converted = world_decode_spectral_envelop(coded_sp = coded_sp_converted, fs = sampling_rate) wav_transformed = world_speech_synthesis(f0=f0_converted, coded_sp=coded_sp_converted, ap=ap, fs=sampling_rate, frame_period=frame_period) librosa.output.write_wav( join(self.sample_dir, str(i+1)+'-'+wav_name.split('.')[0]+'-vcto-{}'.format(self.test_loader.trg_spk)+'.wav'), wav_transformed, sampling_rate) if cpsyn_flag: wav_cpsyn = world_speech_synthesis(f0=f0, coded_sp=coded_sp, ap=ap, fs=sampling_rate, frame_period=frame_period) librosa.output.write_wav(join(self.sample_dir, 'cpsyn-'+wav_name), wav_cpsyn, sampling_rate) cpsyn_flag = False # Save model checkpoints. if (i+1) % self.model_save_step == 0: G_path = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(i+1)) D_path = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(i+1)) torch.save(self.G.state_dict(), G_path) torch.save(self.D.state_dict(), D_path) print('Saved model checkpoints into {}...'.format(self.model_save_dir)) # Decay learning rates. if (i+1) % self.lr_update_step == 0 and (i+1) > (self.num_iters - self.num_iters_decay): g_lr -= (self.g_lr / float(self.num_iters_decay)) d_lr -= (self.d_lr / float(self.num_iters_decay)) self.update_lr(g_lr, d_lr) print ('Decayed learning rates, g_lr: {}, d_lr: {}.'.format(g_lr, d_lr)) ================================================ FILE: utils.py ================================================ import librosa import numpy as np import os import pyworld def load_wav(wav_file, sr): wav, _ = librosa.load(wav_file, sr=sr, mono=True) return wav def world_decompose(wav, fs, frame_period = 5.0): # Decompose speech signal into f0, spectral envelope and aperiodicity using WORLD wav = wav.astype(np.float64) f0, timeaxis = pyworld.harvest(wav, fs, frame_period = frame_period, f0_floor = 71.0, f0_ceil = 800.0) sp = pyworld.cheaptrick(wav, f0, timeaxis, fs) ap = pyworld.d4c(wav, f0, timeaxis, fs) return f0, timeaxis, sp, ap def world_encode_spectral_envelop(sp, fs, dim=36): # Get Mel-cepstral coefficients (MCEPs) #sp = sp.astype(np.float64) coded_sp = pyworld.code_spectral_envelope(sp, fs, dim) return coded_sp def world_decode_spectral_envelop(coded_sp, fs): # Decode Mel-cepstral to sp fftlen = pyworld.get_cheaptrick_fft_size(fs) decoded_sp = pyworld.decode_spectral_envelope(coded_sp, fs, fftlen) return decoded_sp def world_encode_wav(wav_file, fs, frame_period=5.0, coded_dim=36): wav = load_wav(wav_file, sr=fs) f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=fs, frame_period=frame_period) coded_sp = world_encode_spectral_envelop(sp = sp, fs = fs, dim = coded_dim) return f0, timeaxis, sp, ap, coded_sp def world_speech_synthesis(f0, coded_sp, ap, fs, frame_period): decoded_sp = world_decode_spectral_envelop(coded_sp, fs) # TODO min_len = min([len(f0), len(coded_sp), len(ap)]) f0 = f0[:min_len] coded_sp = coded_sp[:min_len] ap = ap[:min_len] wav = pyworld.synthesize(f0, decoded_sp, ap, fs, frame_period) # Librosa could not save wav if not doing so wav = wav.astype(np.float32) return wav def world_synthesis_data(f0s, coded_sps, aps, fs, frame_period): wavs = list() for f0, decoded_sp, ap in zip(f0s, coded_sps, aps): wav = world_speech_synthesis(f0, coded_sp, ap, fs, frame_period) wavs.append(wav) return wavs def coded_sps_normalization_fit_transoform(coded_sps): coded_sps_concatenated = np.concatenate(coded_sps, axis = 1) coded_sps_mean = np.mean(coded_sps_concatenated, axis = 1, keepdims = True) coded_sps_std = np.std(coded_sps_concatenated, axis = 1, keepdims = True) coded_sps_normalized = list() for coded_sp in coded_sps: coded_sps_normalized.append((coded_sp - coded_sps_mean) / coded_sps_std) return coded_sps_normalized, coded_sps_mean, coded_sps_std def coded_sp_statistics(coded_sps): # sp shape (T, D) coded_sps_concatenated = np.concatenate(coded_sps, axis = 0) coded_sps_mean = np.mean(coded_sps_concatenated, axis = 0, keepdims = False) coded_sps_std = np.std(coded_sps_concatenated, axis = 0, keepdims = False) return coded_sps_mean, coded_sps_std def normalize_coded_sp(coded_sp, coded_sp_mean, coded_sp_std): normed = (coded_sp - coded_sp_mean) / coded_sp_std return normed def coded_sps_normalization_transoform(coded_sps, coded_sps_mean, coded_sps_std): coded_sps_normalized = list() for coded_sp in coded_sps: coded_sps_normalized.append((coded_sp - coded_sps_mean) / coded_sps_std) return coded_sps_normalized def coded_sps_normalization_inverse_transoform(normalized_coded_sps, coded_sps_mean, coded_sps_std): coded_sps = list() for normalized_coded_sp in normalized_coded_sps: coded_sps.append(normalized_coded_sp * coded_sps_std + coded_sps_mean) return coded_sps def coded_sp_padding(coded_sp, multiple = 4): num_features = coded_sp.shape[0] num_frames = coded_sp.shape[1] num_frames_padded = int(np.ceil(num_frames / multiple)) * multiple num_frames_diff = num_frames_padded - num_frames num_pad_left = num_frames_diff // 2 num_pad_right = num_frames_diff - num_pad_left coded_sp_padded = np.pad(coded_sp, ((0, 0), (num_pad_left, num_pad_right)), 'constant', constant_values = 0) return coded_sp_padded def wav_padding(wav, sr, frame_period, multiple = 4): assert wav.ndim == 1 num_frames = len(wav) num_frames_padded = int((np.ceil((np.floor(num_frames / (sr * frame_period / 1000)) + 1) / multiple + 1) * multiple - 1) * (sr * frame_period / 1000)) num_frames_diff = num_frames_padded - num_frames num_pad_left = num_frames_diff // 2 num_pad_right = num_frames_diff - num_pad_left wav_padded = np.pad(wav, (num_pad_left, num_pad_right), 'constant', constant_values = 0) return wav_padded def logf0_statistics(f0s): log_f0s_concatenated = np.ma.log(np.concatenate(f0s)) log_f0s_mean = log_f0s_concatenated.mean() log_f0s_std = log_f0s_concatenated.std() return log_f0s_mean, log_f0s_std def pitch_conversion(f0, mean_log_src, std_log_src, mean_log_target, std_log_target): # Logarithm Gaussian normalization for Pitch Conversions f0_converted = np.exp((np.ma.log(f0) - mean_log_src) / std_log_src * std_log_target + mean_log_target) return f0_converted def wavs_to_specs(wavs, n_fft = 1024, hop_length = None): stfts = list() for wav in wavs: stft = librosa.stft(wav, n_fft = n_fft, hop_length = hop_length) stfts.append(stft) return stfts def wavs_to_mfccs(wavs, sr, n_fft = 1024, hop_length = None, n_mels = 128, n_mfcc = 24): mfccs = list() for wav in wavs: mfcc = librosa.feature.mfcc(y = wav, sr = sr, n_fft = n_fft, hop_length = hop_length, n_mels = n_mels, n_mfcc = n_mfcc) mfccs.append(mfcc) return mfccs def mfccs_normalization(mfccs): mfccs_concatenated = np.concatenate(mfccs, axis = 1) mfccs_mean = np.mean(mfccs_concatenated, axis = 1, keepdims = True) mfccs_std = np.std(mfccs_concatenated, axis = 1, keepdims = True) mfccs_normalized = list() for mfcc in mfccs: mfccs_normalized.append((mfcc - mfccs_mean) / mfccs_std) return mfccs_normalized, mfccs_mean, mfccs_std def sample_train_data(dataset_A, dataset_B, n_frames = 128): num_samples = min(len(dataset_A), len(dataset_B)) train_data_A_idx = np.arange(len(dataset_A)) train_data_B_idx = np.arange(len(dataset_B)) np.random.shuffle(train_data_A_idx) np.random.shuffle(train_data_B_idx) train_data_A_idx_subset = train_data_A_idx[:num_samples] train_data_B_idx_subset = train_data_B_idx[:num_samples] train_data_A = list() train_data_B = list() for idx_A, idx_B in zip(train_data_A_idx_subset, train_data_B_idx_subset): data_A = dataset_A[idx_A] frames_A_total = data_A.shape[1] assert frames_A_total >= n_frames start_A = np.random.randint(frames_A_total - n_frames + 1) end_A = start_A + n_frames train_data_A.append(data_A[:,start_A:end_A]) data_B = dataset_B[idx_B] frames_B_total = data_B.shape[1] assert frames_B_total >= n_frames start_B = np.random.randint(frames_B_total - n_frames + 1) end_B = start_B + n_frames train_data_B.append(data_B[:,start_B:end_B]) train_data_A = np.array(train_data_A) train_data_B = np.array(train_data_B) return train_data_A, train_data_B