Repository: yongxuUSTC/dcase2017_task4_cvssp Branch: main Commit: 84eae9b0054c Files: 14 Total size: 79.8 KB Directory structure: gitextract_vkgzfg8z/ ├── README.md ├── config.py ├── data_generator.py ├── download.py ├── main.py ├── main_cnnrnn_at.py ├── main_cnnrnn_balbatch_norm_baseline_icassp2018.py ├── main_crnn_at.py ├── main_crnn_sed.py ├── predict_audio_tagging_icassp2018_cnnrnn_baseline.py ├── predict_crnn_at.py ├── predict_crnn_sed.py ├── prepare_data.py └── runme.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # dcase2017_task4_cvssp Source code for the DCASE2017 task4 "Large-scale weakly supervised sound event detection for smart cars" challenge. # Warning: this code is out of date, new version of our DCASE2017 code could be found here: https://github.com/qiuqiangkong/sound_event_detection_dcase2017_task4 The detailed results can be found here: http://www.cs.tut.fi/sgn/arg/dcase2017/challenge/task-large-scale-sound-event-detection-results The system submitted by CVSSP team is summarized here: http://www.cs.tut.fi/sgn/arg/dcase2017/documents/challenge_technical_reports/DCASE2017_Xu_146.pdf The data set used in this chanllege is part of Google Audioset. DCASE 2017 challenge is organized by TUT, CMU and INRIA, sponsored by Google and Audio Analytic. The CVSSP team submitted four systems to the audio tagging sub-task,which took all the top four places on the result table, among the 29 systems submitted by a number of organisations. CVSSP's system is also ranked at the 3rd place in the sound event detection subtask, among 17 systems. The competitors include CMU, New York University, Bosch, USC, TUT, Singapore A*Star, Korean Advanced Institute of Science and Technology, Seoul National University, National Taiwan university, etc. For downloading the data, please see https://groups.google.com/forum/#!searchin/dcase-discussions/own%7Csort:relevance/dcase-discussions/Lk2dTScX3A8/kvW17tlzAgAJ Number of training files: 51172 Number of testing files: 488 Number of evaluation files: 1103 @inproceedings{xu2018large, title={Large-scale weakly supervised audio classification using gated convolutional neural network}, author={Xu, Yong and Kong, Qiuqiang and Wang, Wenwu and Plumbley, Mark D}, booktitle={2018 IEEE international conference on acoustics, speech and signal processing (ICASSP)}, pages={121--125}, year={2018}, organization={IEEE} } ================================================ FILE: config.py ================================================ # workspace = "" workspace = "/vol/vssp/msos/qk/workspaces/ICASSP2018_dcase" # config sample_rate = 16000. n_window = 1024 n_overlap = 360 # ensure 240 frames in 10 seconds max_len = 240 # sequence max length is 10 s, 240 frames. step_time_in_sec = float(n_window - n_overlap) / sample_rate # Id of classes ids = ['/m/0284vy3', '/m/05x_td', '/m/02mfyn', '/m/02rhddq', '/m/0199g', '/m/06_fw', '/m/012n7d', '/m/012ndj', '/m/0dgbq', '/m/04qvtq', '/m/03qc9zr', '/m/0k4j', '/t/dd00134', '/m/01bjv', '/m/07r04', '/m/04_sv', '/m/07jdr'] # Name of classes lbs = ['Train horn', 'Air horn, truck horn', 'Car alarm', 'Reversing beeps', 'Bicycle', 'Skateboard', 'Ambulance (siren)', 'Fire engine, fire truck (siren)', 'Civil defense siren', 'Police car (siren)', 'Screaming', 'Car', 'Car passing by', 'Bus', 'Truck', 'Motorcycle', 'Train'] idx_to_id = {index: id for index, id in enumerate(ids)} id_to_idx = {id: index for index, id in enumerate(ids)} idx_to_lb = {index: lb for index, lb in enumerate(lbs)} lb_to_idx = {lb: index for index, lb in enumerate(lbs)} num_classes = len(lbs) ================================================ FILE: data_generator.py ================================================ import numpy as np import random class BalanceDataGenerator(object): def __init__(self, batch_size, type, te_max_iter=100): assert type in ['train', 'test'] self._batch_size_ = batch_size self._type_ = type self._te_max_iter_ = te_max_iter def generate(self, xs, ys): batch_size = self._batch_size_ x = xs[0] y = ys[0] (n_samples, n_labs) = y.shape n_each = batch_size // n_labs index_list = [] for i1 in xrange(n_labs): index_list.append(np.where(y[:, i1] == 1)[0]) for i1 in xrange(n_labs): np.random.shuffle(index_list[i1]) pointer_list = [0] * n_labs len_list = [len(e) for e in index_list] iter = 0 while True: if (self._type_) == 'test' and (iter == self._te_max_iter_): break iter += 1 batch_x = [] batch_y = [] for i1 in xrange(n_labs): if pointer_list[i1] >= len_list[i1]: pointer_list[i1] = 0 np.random.shuffle(index_list[i1]) batch_idx = index_list[i1][pointer_list[i1] : min(pointer_list[i1] + n_each, len_list[i1])] batch_x.append(x[batch_idx]) batch_y.append(y[batch_idx]) pointer_list[i1] += n_each batch_x = np.concatenate(batch_x, axis=0) batch_y = np.concatenate(batch_y, axis=0) yield batch_x, batch_y class RatioDataGenerator(object): def __init__(self, batch_size, type, te_max_iter=100, verbose=1): assert type in ['train', 'test'] self._batch_size_ = batch_size self._type_ = type self._te_max_iter_ = te_max_iter self._verbose_ = verbose def _get_lb_list(self, n_samples_list): lb_list = [] for idx in xrange(len(n_samples_list)): n_samples = n_samples_list[idx] if n_samples < 1000: lb_list += [idx] elif n_samples < 2000: lb_list += [idx] * 2 elif n_samples < 3000: lb_list += [idx] * 3 elif n_samples < 4000: lb_list += [idx] * 4 else: lb_list += [idx] * 5 return lb_list def generate(self, xs, ys): batch_size = self._batch_size_ x = xs[0] y = ys[0] (n_samples, n_labs) = y.shape n_samples_list = np.sum(y, axis=0) lb_list = self._get_lb_list(n_samples_list) if self._verbose_ == 1: print("n_samples_list: %s" % (n_samples_list,)) print("lb_list: %s" % (lb_list,)) print("len(lb_list): %d" % len(lb_list)) index_list = [] for i1 in xrange(n_labs): index_list.append(np.where(y[:, i1] == 1)[0]) for i1 in xrange(n_labs): np.random.shuffle(index_list[i1]) queue = [] pointer_list = [0] * n_labs len_list = [len(e) for e in index_list] iter = 0 while True: if (self._type_) == 'test' and (iter == self._te_max_iter_): break iter += 1 batch_x = [] batch_y = [] while len(queue) < batch_size: random.shuffle(lb_list) queue += lb_list batch_idx = queue[0 : batch_size] queue[0 : batch_size] = [] n_per_class_list = [batch_idx.count(idx) for idx in xrange(n_labs)] for i1 in xrange(n_labs): if pointer_list[i1] >= len_list[i1]: pointer_list[i1] = 0 np.random.shuffle(index_list[i1]) per_class_batch_idx = index_list[i1][pointer_list[i1] : min(pointer_list[i1] + n_per_class_list[i1], len_list[i1])] batch_x.append(x[per_class_batch_idx]) batch_y.append(y[per_class_batch_idx]) pointer_list[i1] += n_per_class_list[i1] batch_x = np.concatenate(batch_x, axis=0) batch_y = np.concatenate(batch_y, axis=0) yield batch_x, batch_y ================================================ FILE: download.py ================================================ ================================================ FILE: main.py ================================================ """Train a cldnn on the task4 of DCASE2017 dataset. GPU run command with Theano backend (with TensorFlow, the GPU is automatically used): THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatx=float32 python train*.py Author: Yong XU Creat date: 03/04/2017 """ from __future__ import print_function import sys import cPickle import numpy as np import argparse import time import os import keras from keras import backend as K from keras.models import Sequential,Model from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape, Permute,Lambda, RepeatVector from keras.layers.convolutional import ZeroPadding2D, AveragePooling2D, Conv2D,MaxPooling2D, Convolution1D,MaxPooling1D from keras.layers.pooling import GlobalMaxPooling2D from keras.layers import Merge, Input, merge from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler from keras.layers import LSTM, SimpleRNN, GRU, TimeDistributed, Bidirectional from keras.layers.normalization import BatchNormalization import h5py from keras.layers.merge import Multiply from sklearn import preprocessing import random import config as cfg from prepare_data import create_folder, load_hdf5_data, calculate_scaler, do_scale from data_generator import RatioDataGenerator def scheduler(epoch): initial_lrate = float(0.001) cur_lr=initial_lrate*pow(float(0.5),(epoch//5)) print("learning rate: %f" % cur_lr) return cur_lr def block(input): cnn = Conv2D(128, (3, 3), padding="same", activation="linear", use_bias=False)(input) cnn = BatchNormalization(axis=-1)(cnn) cnn1 = Lambda(slice1, output_shape=slice1_output_shape)(cnn) cnn2 = Lambda(slice2, output_shape=slice2_output_shape)(cnn) cnn1 = Activation('linear')(cnn1) cnn2 = Activation('sigmoid')(cnn2) out = Multiply()([cnn1, cnn2]) return out def slice1(x): return x[:, :, :, 0:64] def slice2(x): return x[:, :, :, 64:128] def slice1_output_shape(input_shape): return tuple([input_shape[0],input_shape[1],input_shape[2],64]) def slice2_output_shape(input_shape): return tuple([input_shape[0],input_shape[1],input_shape[2],64]) def train(): num_classes = cfg.num_classes # Load training & testing data (tr_x, tr_y, tr_na_list) = load_hdf5_data(args.tr_hdf5_path, verbose=1) # (tr_x, tr_y, tr_na_list) = load_hdf5(args.te_hdf5_path, verbose=1) (te_x, te_y, te_na_list) = load_hdf5_data(args.te_hdf5_path, verbose=1) print("") # Scale data scaler = calculate_scaler(tr_x, verbose=2) tr_x = do_scale(tr_x, scaler, verbose=1) te_x = do_scale(te_x, scaler, verbose=1) pause # Build model (_, n_time, n_freq) = tr_x.shape input_logmel = Input(shape=(n_time, n_freq)) a1 = Reshape((n_time, n_freq, 1))(input_logmel) cnn1 = block(a1) cnn1 = block(cnn1) cnn1 = MaxPooling2D(pool_size=(2, 2))(cnn1) cnn1 = block(cnn1) cnn1 = block(cnn1) cnn1 = MaxPooling2D(pool_size=(2, 2))(cnn1) cnn1 = block(cnn1) cnn1 = block(cnn1) cnn1 = MaxPooling2D(pool_size=(2, 2))(cnn1) cnn1 = block(cnn1) cnn3 = block(cnn1) cnn3 = MaxPooling2D(pool_size=(1, 2))(cnn1) cnnout = Conv2D(256, (3, 3), padding="same", activation="relu", use_bias=True)(cnn3) cnnout = MaxPooling2D(pool_size=(1, 4))(cnnout) cnnout = Reshape((30, 256))(cnnout) # Time step is downsampled to 30. rnnout = Bidirectional(GRU(128, activation='linear', return_sequences=True))(cnnout) rnnout_gate = Bidirectional(GRU(128, activation='sigmoid', return_sequences=True))(cnnout) out = Multiply()([rnnout, rnnout_gate]) out = TimeDistributed(Dense(num_classes, activation='sigmoid'))(out) out = Lambda(lambda x: K.mean(x, axis=1),output_shape=(num_classes,))(out) model = Model(input_logmel, out) model.summary() # Compile model model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # Save model callback filepath = os.path.join(args.out_model_dir, "gatedAct_rationBal44_lr0.001_normalization_at_cnnRNN_64newMel_240fr.{epoch:02d}-{val_acc:.4f}.hdf5") create_folder(os.path.dirname(filepath)) save_model = ModelCheckpoint(filepath=filepath, monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1) # Data generator # gen = RatioDataGenerator(batch_size=44, type='train') gen = RatioDataGenerator(batch_size=100, type='train') # Train model.fit_generator(generator=gen.generate([tr_x], [tr_y]), steps_per_epoch=10, # 100 iters is called an 'epoch' epochs=2000000010, # Maximum 'epoch' to train verbose=1, callbacks=[save_model], validation_data=(te_x, te_y)) if __name__ == '__main__': parser = argparse.ArgumentParser(description="") subparsers = parser.add_subparsers(dest='mode') parser_train = subparsers.add_parser('train') parser_train.add_argument('--tr_hdf5_path', type=str) parser_train.add_argument('--te_hdf5_path', type=str) parser_train.add_argument('--out_model_dir', type=str) args = parser.parse_args() if args.mode == 'train': train() ================================================ FILE: main_cnnrnn_at.py ================================================ '''Train a cldnn on the task4 of DCASE2017 dataset. GPU run command with Theano backend (with TensorFlow, the GPU is automatically used): THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatx=float32 python train*.py Author: Yong XU Creat date: 03/04/2017 ''' #from __future__ import print_function import keras from keras import backend as K import sys import cPickle import numpy as np from keras.models import Sequential,Model #from keras.layers.core import Dense, Dropout, Activation, Flatten, Flatten_last2d, Reshape,Permute,Lambda, RepeatVector from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape,Permute,Lambda, RepeatVector from keras.layers.convolutional import ZeroPadding2D, AveragePooling2D, Conv2D,MaxPooling2D, Convolution1D,MaxPooling1D from keras.layers.pooling import GlobalMaxPooling2D from keras.layers import Merge, Input, merge from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler from keras.layers import LSTM, SimpleRNN, GRU, TimeDistributed, Bidirectional from keras.layers.normalization import BatchNormalization import h5py from keras.layers.merge import Multiply from sklearn import preprocessing import random def scheduler(epoch): initial_lrate = float(0.001) cur_lr=initial_lrate*pow(float(0.5),(epoch//5)) print "learning rate:", cur_lr return cur_lr class RatioDataGenerator(object): def __init__(self, batch_size, type, te_max_iter=100): assert type in ['train', 'test'] self._batch_size_ = batch_size self._type_ = type self._te_max_iter_ = te_max_iter def _get_lb_list(self, n_samples_list): lb_list = [] for idx in xrange(len(n_samples_list)): n_samples = n_samples_list[idx] if n_samples < 1000: lb_list += [idx] elif n_samples < 2000: lb_list += [idx] * 2 elif n_samples < 3000: lb_list += [idx] * 3 elif n_samples < 4000: lb_list += [idx] * 4 else: lb_list += [idx] * 5 return lb_list def generate(self, xs, ys): batch_size = self._batch_size_ x = xs[0] y = ys[0] (n_samples, n_labs) = y.shape n_samples_list = np.sum(y, axis=0) lb_list = self._get_lb_list(n_samples_list) print "n_samples_list:", n_samples_list print "lb_list:", lb_list print "len(lb_list):", len(lb_list) index_list = [] for i1 in xrange(n_labs): index_list.append(np.where(y[:, i1] == 1)[0]) for i1 in xrange(n_labs): np.random.shuffle(index_list[i1]) queue = [] pointer_list = [0] * n_labs len_list = [len(e) for e in index_list] iter = 0 while True: if (self._type_) == 'test' and (iter == self._te_max_iter_): break iter += 1 batch_x = [] batch_y = [] while len(queue) < batch_size: random.shuffle(lb_list) queue += lb_list batch_idx = queue[0 : batch_size] queue[0 : batch_size] = [] n_per_class_list = [batch_idx.count(idx) for idx in xrange(n_labs)] for i1 in xrange(n_labs): if pointer_list[i1] >= len_list[i1]: pointer_list[i1] = 0 np.random.shuffle(index_list[i1]) per_class_batch_idx = index_list[i1][pointer_list[i1] : min(pointer_list[i1] + n_per_class_list[i1], len_list[i1])] batch_x.append(x[per_class_batch_idx]) batch_y.append(y[per_class_batch_idx]) pointer_list[i1] += n_per_class_list[i1] batch_x = np.concatenate(batch_x, axis=0) batch_y = np.concatenate(batch_y, axis=0) yield batch_x, batch_y class BalanceDataGenerator(object): def __init__(self, batch_size, type, max_iter=100): assert type in ['train', 'test'] self._batch_size_ = batch_size self._type_ = type self._max_iter_ = max_iter def generate(self, xs, ys): batch_size = self._batch_size_ x = xs[0] y = ys[0] #(n_samples, n_features) = x.shape ### yong xu commented (n_samples, n_labs) = y.shape n_each = batch_size // n_labs index_list = [] for i1 in xrange(n_labs): index_list.append(np.where(y[:, i1] == 1)[0]) for i1 in xrange(n_labs): np.random.shuffle(index_list[i1]) pointer_list = [0] * n_labs iter = 0 while True: if (self._type_) == 'test' and (iter == self._max_iter_): break iter += 1 batch_x = [] batch_y = [] for i1 in xrange(n_labs): idx_num = len(index_list[i1]) if pointer_list[i1] >= idx_num: pointer_list[i1] = 0 np.random.shuffle(index_list[i1]) batch_idx = index_list[i1][pointer_list[i1] : min(pointer_list[i1] + n_each, idx_num)] batch_x.append(x[batch_idx]) batch_y.append(y[batch_idx]) pointer_list[i1] += n_each batch_x = np.concatenate(batch_x, axis=0) batch_y = np.concatenate(batch_y, axis=0) yield batch_x, batch_y # resize data for fit into CNN. size: (batch_num*color_maps*height*weight) def reshapeX1( X ): N = len(X) return X.reshape( (N, t_delay, feadim, 1, 1) ) # resize data for fit into CNN. size: (batch_num*color_maps*height*weight) def reshapeX2( X ): N = len(X) return X.reshape( (N, t_delay, feadim) ) def reshapeX3( X ): N = len(X) return X.reshape( (N, t_delay, feadim, 1) ) def reshapeX4( X ): N = len(X) return X.reshape( (N*t_delay, feadim) ) def reshapeX5( X , sample_num): N = len(X) return X.reshape( (sample_num, t_delay, feadim, 1) ) def outfunc(vects): x,y=vects #y=K.sum( y, axis=1 ) y = K.clip( y, 1.0e-9, 1 ) # clip to avoid numerical underflow #z=Lambda(lambda x: K.sum(x, axis=1),output_shape=(8,))(y) y = K.sum(y, axis=1) #y = K.sum(y, axis=1) #z = RepeatVector(249)(z) #z=Permute((2,1))(z) #return K.sum( x / z, axis=1 ) #x = K.sum( x, axis=(1,2) ) x = K.sum( x, axis=1 ) #x = K.sum( x, axis=1 ) return x / y def slice1(x): return x[:,:,:, 0:64] def slice2(x): return x[:,:,:, 64:128] def slice1_output_shape(input_shape): return tuple([input_shape[0],input_shape[1],input_shape[2],64]) def slice2_output_shape(input_shape): return tuple([input_shape[0],input_shape[1],input_shape[2],64]) #parameters: num_classes=17 feadim=64 t_delay=240 # the len of Utterance is 300 model_out_path="/vol/vssp/msos/yx/t4_d2017/models_val" # train and test sets: with h5py.File("/vol/vssp/msos/Audioset/task4_dcase2017_features/packed_features/logmel/training_pack.h5", 'r') as hf: tr_x = np.array(hf.get('x')) tr_y = np.array(hf.get('y')) print tr_x.shape print tr_y.shape with h5py.File("/vol/vssp/msos/Audioset/task4_dcase2017_features/packed_features/logmel/testing_pack.h5", 'r') as hf: va_x = np.array(hf.get('x')) va_y = np.array(hf.get('y')) print va_x.shape print va_y.shape ###########normalization training and test set tr_x2=reshapeX4(tr_x) va_x2=reshapeX4(va_x) scaler = preprocessing.StandardScaler().fit(tr_x2) print scaler.mean_, scaler.scale_ tr_x2 = scaler.transform( tr_x2 ) va_x2 = scaler.transform( va_x2 ) tr_x=reshapeX5( tr_x2 , len(tr_x)) va_x=reshapeX5( va_x2 , len(va_x)) #################################### #tr_x=reshapeX3(tr_x) print tr_x.shape #va_x=reshapeX3(va_x) print va_x.shape def block(input): cnn1=Conv2D(128, (3, 3), padding="same", activation="linear", use_bias=False)(input) cnn1=BatchNormalization(axis=-1)(cnn1) cnn11 = Lambda(slice1,output_shape=slice1_output_shape)(cnn1) cnn12 = Lambda(slice2,output_shape=slice2_output_shape)(cnn1) cnn11=Activation('linear')(cnn11) cnn12=Activation('sigmoid')(cnn12) cnn1=Multiply()([cnn11,cnn12]) return cnn1 ###build model by keras input_audio=Input(shape=(t_delay, feadim, 1)) #input_flat=TimeDistributed(Flatten())(input_audio) ###detection factor for each tag (7 meaningful tags + 1 silence tag = 8 tags) #det =TimeDistributed(Dense(17,activation='softmax'))(input_flat) # The posterior sum of each tag is 1.0, now the dims of det are 33 frs * 8 tags cnn1 = block(input_audio) cnn1 = block(cnn1) cnn1=MaxPooling2D(pool_size=(2, 2))(cnn1) cnn1 = block(cnn1) cnn1 = block(cnn1) cnn1=MaxPooling2D(pool_size=(2, 2))(cnn1) cnn1 = block(cnn1) cnn1 = block(cnn1) cnn1=MaxPooling2D(pool_size=(2, 2))(cnn1) cnn1 = block(cnn1) cnn3 = block(cnn1) cnn3=MaxPooling2D(pool_size=(1, 3))(cnn1) cnnout=Conv2D(256, (3, 3), padding="same", activation="relu", use_bias=True)(cnn3) cnnout=MaxPooling2D(pool_size=(1, 2))(cnnout) #cnnout=Flatten_last2d()(cnnout) cnnout=Reshape((30,256))(cnnout) rnnout=Bidirectional(GRU(128, activation='linear', return_sequences=True))(cnnout) rnnout_gate=Bidirectional(GRU(128, activation='sigmoid', return_sequences=True))(cnnout) out=Multiply()([rnnout,rnnout_gate]) out=TimeDistributed(Dense(17,activation='sigmoid'))(out) #det =TimeDistributed(Dense(17,activation='softmax'))(out) #out=Multiply()([out,det]) #out=Lambda(outfunc,output_shape=(17,))([out,det]) out=Lambda(lambda x: K.mean(x, axis=1),output_shape=(17,))(out) allmodel=Model(input_audio, out) allmodel.summary() # Let's train the model using RMSprop allmodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) dump_fd=model_out_path+'/gatedAct_rationBal44_lr0.001_normalization_at_cnnRNN_64newMel_240fr.{epoch:02d}-{val_acc:.4f}.hdf5' eachmodel=ModelCheckpoint(dump_fd,monitor='val_acc',verbose=0,save_best_only=False,save_weights_only=False,mode='auto',period=10) #reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, # patience=5, min_lr=0.00000001) #reduce_lr = LearningRateScheduler(scheduler) gen = RatioDataGenerator(batch_size=44, type='train') #gen = BalanceDataGenerator(batch_size=52, type='train') #for (batch_x, batch_y) in gen.generate([x], [y]): # train_on_batch(batch_x, batch_y, class_weight=None, sample_weight=None) steps_per_epoch=100 allmodel.fit_generator(gen.generate([tr_x], [tr_y]), steps_per_epoch, epochs=2000000010, verbose=1, callbacks=[eachmodel], validation_data=(va_x, va_y)) #allmodel.fit(tr_x, tr_y, batch_size=100, epochs=31, # verbose=1, validation_data=(va_x, va_y), callbacks=[eachmodel]) #, callbacks=[best_model]) ================================================ FILE: main_cnnrnn_balbatch_norm_baseline_icassp2018.py ================================================ '''Train a cldnn on the task4 of DCASE2017 dataset. GPU run command with Theano backend (with TensorFlow, the GPU is automatically used): THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatx=float32 python train*.py Author: Yong XU Creat date: 03/04/2017 ''' #from __future__ import print_function import keras from keras import backend as K import sys import cPickle import numpy as np from keras.models import Sequential,Model #from keras.layers.core import Dense, Dropout, Activation, Flatten, Flatten_last2d, Reshape,Permute,Lambda, RepeatVector from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape,Permute,Lambda, RepeatVector from keras.layers.convolutional import ZeroPadding2D, AveragePooling2D, Conv2D,MaxPooling2D, Convolution1D,MaxPooling1D from keras.layers.pooling import GlobalMaxPooling2D from keras.layers import Merge, Input, merge from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler from keras.layers import LSTM, SimpleRNN, GRU, TimeDistributed, Bidirectional from keras.layers.normalization import BatchNormalization import h5py from keras.layers.merge import Multiply from sklearn import preprocessing import random def scheduler(epoch): initial_lrate = float(0.001) cur_lr=initial_lrate*pow(float(0.5),(epoch//5)) print "learning rate:", cur_lr return cur_lr class RatioDataGenerator(object): def __init__(self, batch_size, type, te_max_iter=100): assert type in ['train', 'test'] self._batch_size_ = batch_size self._type_ = type self._te_max_iter_ = te_max_iter def _get_lb_list(self, n_samples_list): lb_list = [] for idx in xrange(len(n_samples_list)): n_samples = n_samples_list[idx] if n_samples < 1000: lb_list += [idx] elif n_samples < 2000: lb_list += [idx] * 2 elif n_samples < 3000: lb_list += [idx] * 3 elif n_samples < 4000: lb_list += [idx] * 4 else: lb_list += [idx] * 5 return lb_list def generate(self, xs, ys): batch_size = self._batch_size_ x = xs[0] y = ys[0] (n_samples, n_labs) = y.shape n_samples_list = np.sum(y, axis=0) lb_list = self._get_lb_list(n_samples_list) print "n_samples_list:", n_samples_list print "lb_list:", lb_list print "len(lb_list):", len(lb_list) index_list = [] for i1 in xrange(n_labs): index_list.append(np.where(y[:, i1] == 1)[0]) for i1 in xrange(n_labs): np.random.shuffle(index_list[i1]) queue = [] pointer_list = [0] * n_labs len_list = [len(e) for e in index_list] iter = 0 while True: if (self._type_) == 'test' and (iter == self._te_max_iter_): break iter += 1 batch_x = [] batch_y = [] while len(queue) < batch_size: random.shuffle(lb_list) queue += lb_list batch_idx = queue[0 : batch_size] queue[0 : batch_size] = [] n_per_class_list = [batch_idx.count(idx) for idx in xrange(n_labs)] for i1 in xrange(n_labs): if pointer_list[i1] >= len_list[i1]: pointer_list[i1] = 0 np.random.shuffle(index_list[i1]) per_class_batch_idx = index_list[i1][pointer_list[i1] : min(pointer_list[i1] + n_per_class_list[i1], len_list[i1])] batch_x.append(x[per_class_batch_idx]) batch_y.append(y[per_class_batch_idx]) pointer_list[i1] += n_per_class_list[i1] batch_x = np.concatenate(batch_x, axis=0) batch_y = np.concatenate(batch_y, axis=0) yield batch_x, batch_y class BalanceDataGenerator(object): def __init__(self, batch_size, type, max_iter=100): assert type in ['train', 'test'] self._batch_size_ = batch_size self._type_ = type self._max_iter_ = max_iter def generate(self, xs, ys): batch_size = self._batch_size_ x = xs[0] y = ys[0] #(n_samples, n_features) = x.shape ### yong xu commented (n_samples, n_labs) = y.shape n_each = batch_size // n_labs index_list = [] for i1 in xrange(n_labs): index_list.append(np.where(y[:, i1] == 1)[0]) for i1 in xrange(n_labs): np.random.shuffle(index_list[i1]) pointer_list = [0] * n_labs iter = 0 while True: if (self._type_) == 'test' and (iter == self._max_iter_): break iter += 1 batch_x = [] batch_y = [] for i1 in xrange(n_labs): idx_num = len(index_list[i1]) if pointer_list[i1] >= idx_num: pointer_list[i1] = 0 np.random.shuffle(index_list[i1]) batch_idx = index_list[i1][pointer_list[i1] : min(pointer_list[i1] + n_each, idx_num)] batch_x.append(x[batch_idx]) batch_y.append(y[batch_idx]) pointer_list[i1] += n_each batch_x = np.concatenate(batch_x, axis=0) batch_y = np.concatenate(batch_y, axis=0) yield batch_x, batch_y # resize data for fit into CNN. size: (batch_num*color_maps*height*weight) def reshapeX1( X ): N = len(X) return X.reshape( (N, t_delay, feadim, 1, 1) ) # resize data for fit into CNN. size: (batch_num*color_maps*height*weight) def reshapeX2( X ): N = len(X) return X.reshape( (N, t_delay, feadim) ) def reshapeX3( X ): N = len(X) return X.reshape( (N, t_delay, feadim, 1) ) def reshapeX4( X ): N = len(X) return X.reshape( (N*t_delay, feadim) ) def reshapeX5( X , sample_num): N = len(X) return X.reshape( (sample_num, t_delay, feadim, 1) ) def outfunc(vects): x,y=vects #y=K.sum( y, axis=1 ) y = K.clip( y, 1.0e-9, 1 ) # clip to avoid numerical underflow #z=Lambda(lambda x: K.sum(x, axis=1),output_shape=(8,))(y) y = K.sum(y, axis=1) #y = K.sum(y, axis=1) #z = RepeatVector(249)(z) #z=Permute((2,1))(z) #return K.sum( x / z, axis=1 ) #x = K.sum( x, axis=(1,2) ) x = K.sum( x, axis=1 ) #x = K.sum( x, axis=1 ) return x / y def slice1(x): return x[:,:,:, 0:64] def slice2(x): return x[:,:,:, 64:128] def slice1_output_shape(input_shape): return tuple([input_shape[0],input_shape[1],input_shape[2],64]) def slice2_output_shape(input_shape): return tuple([input_shape[0],input_shape[1],input_shape[2],64]) #parameters: num_classes=17 feadim=64 t_delay=240 # the len of Utterance is 300 model_out_path="/vol/vssp/msos/yx/t4_d2017/models_val" # train and test sets: with h5py.File("/vol/vssp/msos/Audioset/task4_dcase2017_features/packed_features/logmel/training_pack.h5", 'r') as hf: tr_x = np.array(hf.get('x')) tr_y = np.array(hf.get('y')) print tr_x.shape print tr_y.shape with h5py.File("/vol/vssp/msos/Audioset/task4_dcase2017_features/packed_features/logmel/testing_pack.h5", 'r') as hf: va_x = np.array(hf.get('x')) va_y = np.array(hf.get('y')) print va_x.shape print va_y.shape ###########normalization training and test set tr_x2=reshapeX4(tr_x) va_x2=reshapeX4(va_x) scaler = preprocessing.StandardScaler().fit(tr_x2) print scaler.mean_, scaler.scale_ tr_x2 = scaler.transform( tr_x2 ) va_x2 = scaler.transform( va_x2 ) tr_x=reshapeX5( tr_x2 , len(tr_x)) va_x=reshapeX5( va_x2 , len(va_x)) #################################### #tr_x=reshapeX3(tr_x) print tr_x.shape #va_x=reshapeX3(va_x) print va_x.shape def block(input): cnn1=Conv2D(64, (3, 3), padding="same", activation="linear", use_bias=False)(input) cnn1=BatchNormalization(axis=-1)(cnn1) #cnn11 = Lambda(slice1,output_shape=slice1_output_shape)(cnn1) #cnn12 = Lambda(slice2,output_shape=slice2_output_shape)(cnn1) #cnn11=Activation('linear')(cnn11) #cnn12=Activation('sigmoid')(cnn12) #cnn1=Multiply()([cnn11,cnn12]) return cnn1 ###build model by keras input_audio=Input(shape=(t_delay, feadim, 1)) #input_flat=TimeDistributed(Flatten())(input_audio) ###detection factor for each tag (7 meaningful tags + 1 silence tag = 8 tags) #det =TimeDistributed(Dense(17,activation='softmax'))(input_flat) # The posterior sum of each tag is 1.0, now the dims of det are 33 frs * 8 tags cnn1 = block(input_audio) cnn1 = block(cnn1) cnn1=MaxPooling2D(pool_size=(2, 2))(cnn1) cnn1 = block(cnn1) cnn1 = block(cnn1) cnn1=MaxPooling2D(pool_size=(2, 2))(cnn1) cnn1 = block(cnn1) cnn1 = block(cnn1) cnn1=MaxPooling2D(pool_size=(2, 2))(cnn1) cnn1 = block(cnn1) cnn3 = block(cnn1) cnn3=MaxPooling2D(pool_size=(1, 3))(cnn1) cnnout=Conv2D(256, (3, 3), padding="same", activation="relu", use_bias=True)(cnn3) cnnout=MaxPooling2D(pool_size=(1, 2))(cnnout) #cnnout=Flatten_last2d()(cnnout) cnnout=Reshape((30,256))(cnnout) out=Bidirectional(GRU(128, activation='relu', return_sequences=True))(cnnout) #rnnout_gate=Bidirectional(GRU(128, activation='sigmoid', return_sequences=True))(cnnout) #out=Multiply()([rnnout,rnnout_gate]) out=TimeDistributed(Dense(17,activation='sigmoid'))(out) #det =TimeDistributed(Dense(17,activation='softmax'))(out) #out=Multiply()([out,det]) #out=Lambda(outfunc,output_shape=(17,))([out,det]) out=Lambda(lambda x: K.mean(x, axis=1),output_shape=(17,))(out) allmodel=Model(input_audio, out) allmodel.summary() # Let's train the model using RMSprop allmodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) dump_fd=model_out_path+'/icassp_baseline_crnn_rationBal44_lr0.001_norm_64newMel_240fr.{epoch:02d}-{val_acc:.4f}.hdf5' eachmodel=ModelCheckpoint(dump_fd,monitor='val_acc',verbose=0,save_best_only=False,save_weights_only=False,mode='auto',period=10) #reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, # patience=5, min_lr=0.00000001) #reduce_lr = LearningRateScheduler(scheduler) gen = RatioDataGenerator(batch_size=44, type='train') #gen = BalanceDataGenerator(batch_size=52, type='train') #for (batch_x, batch_y) in gen.generate([x], [y]): # train_on_batch(batch_x, batch_y, class_weight=None, sample_weight=None) steps_per_epoch=100 allmodel.fit_generator(gen.generate([tr_x], [tr_y]), steps_per_epoch, epochs=2000000010, verbose=1, callbacks=[eachmodel], validation_data=(va_x, va_y)) #allmodel.fit(tr_x, tr_y, batch_size=100, epochs=31, # verbose=1, validation_data=(va_x, va_y), callbacks=[eachmodel]) #, callbacks=[best_model]) ================================================ FILE: main_crnn_at.py ================================================ """ Summary: Train a CNN-RNN audio tagging classifier on the task4 of DCASE2017 dataset. Author: Yong XU, Qiuqiang Kong Created: 03/04/2017 Modified: 24/09/2017 -------------------------------------- """ from __future__ import print_function import sys import cPickle import numpy as np import argparse import glob import time import os import keras from keras import backend as K from keras.models import Sequential,Model, load_model from keras.layers.core import (Dense, Dropout, Activation, Flatten, Reshape, Permute,Lambda, RepeatVector) from keras.layers.convolutional import (ZeroPadding2D, AveragePooling2D, Conv2D,MaxPooling2D, Convolution1D,MaxPooling1D) from keras.layers.pooling import GlobalMaxPooling2D from keras.layers import Merge, Input, merge from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler from keras.layers import LSTM, SimpleRNN, GRU, TimeDistributed, Bidirectional from keras.layers.normalization import BatchNormalization import h5py from keras.layers.merge import Multiply from sklearn import preprocessing import random import config as cfg from prepare_data import create_folder, load_hdf5_data, do_scale from data_generator import RatioDataGenerator from evaluation import io_task4, evaluate def scheduler(epoch): initial_lrate = float(0.001) cur_lr=initial_lrate*pow(float(0.5),(epoch//5)) print("learning rate: %f" % cur_lr) return cur_lr def block(input): cnn = Conv2D(128, (3, 3), padding="same", activation="linear", use_bias=False)(input) cnn = BatchNormalization(axis=-1)(cnn) cnn1 = Lambda(slice1, output_shape=slice1_output_shape)(cnn) cnn2 = Lambda(slice2, output_shape=slice2_output_shape)(cnn) cnn1 = Activation('linear')(cnn1) cnn2 = Activation('sigmoid')(cnn2) out = Multiply()([cnn1, cnn2]) return out def slice1(x): return x[:, :, :, 0:64] def slice2(x): return x[:, :, :, 64:128] def slice1_output_shape(input_shape): return tuple([input_shape[0],input_shape[1],input_shape[2],64]) def slice2_output_shape(input_shape): return tuple([input_shape[0],input_shape[1],input_shape[2],64]) def train(args): num_classes = cfg.num_classes # Load training & testing data (tr_x, tr_y, tr_na_list) = load_hdf5_data(args.tr_hdf5_path, verbose=1) (te_x, te_y, te_na_list) = load_hdf5_data(args.te_hdf5_path, verbose=1) print("") # Scale data tr_x = do_scale(tr_x, args.scaler_path, verbose=1) te_x = do_scale(te_x, args.scaler_path, verbose=1) # Build model (_, n_time, n_freq) = tr_x.shape input_logmel = Input(shape=(n_time, n_freq), name='in_layer') a1 = Reshape((n_time, n_freq, 1))(input_logmel) cnn1 = block(a1) cnn1 = block(cnn1) cnn1 = MaxPooling2D(pool_size=(2, 2))(cnn1) cnn1 = block(cnn1) cnn1 = block(cnn1) cnn1 = MaxPooling2D(pool_size=(2, 2))(cnn1) cnn1 = block(cnn1) cnn1 = block(cnn1) cnn1 = MaxPooling2D(pool_size=(2, 2))(cnn1) cnn1 = block(cnn1) cnn3 = block(cnn1) cnn3 = MaxPooling2D(pool_size=(1, 2))(cnn1) cnnout = Conv2D(256, (3, 3), padding="same", activation="relu", use_bias=True)(cnn3) cnnout = MaxPooling2D(pool_size=(1, 4))(cnnout) cnnout = Reshape((30, 256))(cnnout) # Time step is downsampled to 30. rnnout = Bidirectional(GRU(128, activation='linear', return_sequences=True))(cnnout) rnnout_gate = Bidirectional(GRU(128, activation='sigmoid', return_sequences=True))(cnnout) out = Multiply()([rnnout, rnnout_gate]) out = TimeDistributed(Dense(num_classes, activation='sigmoid'), name='localization_layer')(out) out = Lambda(lambda x: K.mean(x, axis=1),output_shape=(num_classes,))(out) model = Model(input_logmel, out) model.summary() # Compile model model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # Save model callback filepath = os.path.join(args.out_model_dir, "gatedAct_rationBal44_lr0.001_normalization_at_cnnRNN_64mel_240fr.{epoch:02d}-{val_acc:.4f}.hdf5") create_folder(os.path.dirname(filepath)) save_model = ModelCheckpoint(filepath=filepath, monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1) # Data generator gen = RatioDataGenerator(batch_size=44, type='train') # Train model.fit_generator(generator=gen.generate([tr_x], [tr_y]), steps_per_epoch=100, # 100 iters is called an 'epoch' epochs=2000000010, # Maximum 'epoch' to train verbose=1, callbacks=[save_model], validation_data=(te_x, te_y)) def run_func(func, x, batch_size): pred_all = [] for ptr in xrange(0, len(x), batch_size): batch_x = x[ptr : ptr + batch_size] [pred] = func([batch_x, 0.]) pred_all.append(pred) pred_all = np.concatenate(pred_all, axis=0) return pred_all def recognize(args, at_bool, sed_bool): (te_x, te_y, te_na_list) = load_hdf5_data(args.te_hdf5_path, verbose=1) x = te_x y = te_y na_list = te_na_list x = do_scale(x, args.scaler_path, verbose=1) fusion_at_list = [] fusion_sed_list = [] # for epoch in range(19, 81, 10): for epoch in range(20, 30, 1): t1 = time.time() [model_path] = glob.glob(os.path.join(args.model_dir, "*.%02d-0.*.hdf5" % epoch)) model = load_model(model_path) # Audio tagging if at_bool: pred = model.predict(x) fusion_at_list.append(pred) # Sound event detection if sed_bool: in_layer = model.get_layer('in_layer') loc_layer = model.get_layer('localization_layer') func_loc_output = K.function([in_layer.input, K.learning_phase()], [loc_layer.output]) pred3d = run_func(func_loc_output, x, batch_size=64) fusion_sed_list.append(pred3d) print("Prediction time: %s" % (time.time() - t1,)) # Write out AT probabilities if at_bool: fusion_at = np.mean(np.array(fusion_at_list), axis=0) print("AT shape: %s" % (fusion_at.shape,)) io_task4.at_write_prob_mat_to_csv( na_list=na_list, prob_mat=fusion_at, out_path=os.path.join(args.out_dir, "at_prob_mat.csv.gz")) # Write out SED probabilites if sed_bool: fusion_sed = np.mean(np.array(fusion_sed_list), axis=0) print("SED shape:%s" % (fusion_sed.shape,)) io_task4.sed_write_prob_mat_list_to_csv( na_list=na_list, prob_mat_list=fusion_sed, out_path=os.path.join(args.out_dir, "sed_prob_mat_list.csv.gz")) print("Prediction finished!") def get_stat(args, at_bool, sed_bool): lbs = cfg.lbs step_time_in_sec = cfg.step_time_in_sec max_len = cfg.max_len thres_ary = [0.3] * len(lbs) # Calculate AT stat if at_bool: pd_prob_mat_csv_path = os.path.join(args.pred_dir, "at_prob_mat.csv.gz") at_stat_path = os.path.join(args.stat_dir, "at_stat.csv") at_submission_path = os.path.join(args.submission_dir, "at_submission.csv") at_evaluator = evaluate.AudioTaggingEvaluate( weak_gt_csv="meta_data/groundtruth_weak_label_testing_set.csv", lbs=lbs) at_stat = at_evaluator.get_stats_from_prob_mat_csv( pd_prob_mat_csv=pd_prob_mat_csv_path, thres_ary=thres_ary) # Write out & print AT stat at_evaluator.write_stat_to_csv(stat=at_stat, stat_path=at_stat_path) at_evaluator.print_stat(stat_path=at_stat_path) # Write AT to submission format io_task4.at_write_prob_mat_csv_to_submission_csv( at_prob_mat_path=pd_prob_mat_csv_path, lbs=lbs, thres_ary=at_stat['thres_ary'], out_path=at_submission_path) # Calculate SED stat if sed_bool: sed_prob_mat_list_path = os.path.join(args.pred_dir, "sed_prob_mat_list.csv.gz") sed_stat_path = os.path.join(args.stat_dir, "sed_stat.csv") sed_submission_path = os.path.join(args.submission_dir, "sed_submission.csv") sed_evaluator = evaluate.SoundEventDetectionEvaluate( strong_gt_csv="meta_data/groundtruth_strong_label_testing_set.csv", lbs=lbs, step_sec=step_time_in_sec, max_len=max_len) # Write out & print SED stat sed_stat = sed_evaluator.get_stats_from_prob_mat_list_csv( pd_prob_mat_list_csv=sed_prob_mat_list_path, thres_ary=thres_ary) # Write SED to submission format sed_evaluator.write_stat_to_csv(stat=sed_stat, stat_path=sed_stat_path) sed_evaluator.print_stat(stat_path=sed_stat_path) # Write SED to submission format io_task4.sed_write_prob_mat_list_csv_to_submission_csv( sed_prob_mat_list_path=sed_prob_mat_list_path, lbs=lbs, thres_ary=thres_ary, step_sec=step_time_in_sec, out_path=sed_submission_path) print("Calculating stat finished!") if __name__ == '__main__': parser = argparse.ArgumentParser(description="") subparsers = parser.add_subparsers(dest='mode') parser_train = subparsers.add_parser('train') parser_train.add_argument('--tr_hdf5_path', type=str) parser_train.add_argument('--te_hdf5_path', type=str) parser_train.add_argument('--scaler_path', type=str) parser_train.add_argument('--out_model_dir', type=str) parser_recognize = subparsers.add_parser('recognize') parser_recognize.add_argument('--te_hdf5_path', type=str) parser_recognize.add_argument('--scaler_path', type=str) parser_recognize.add_argument('--model_dir', type=str) parser_recognize.add_argument('--out_dir', type=str) parser_get_stat = subparsers.add_parser('get_stat') parser_get_stat.add_argument('--pred_dir', type=str) parser_get_stat.add_argument('--stat_dir', type=str) parser_get_stat.add_argument('--submission_dir', type=str) args = parser.parse_args() if args.mode == 'train': train(args) elif args.mode == 'recognize': recognize(args, at_bool=True, sed_bool=False) elif args.mode == 'get_stat': get_stat(args, at_bool=True, sed_bool=False) else: raise Exception("Incorrect argument!") ================================================ FILE: main_crnn_sed.py ================================================ """Train a cldnn on the task4 of DCASE2017 dataset. GPU run command with Theano backend (with TensorFlow, the GPU is automatically used): THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatx=float32 python train*.py Author: Yong XU Creat date: 03/04/2017 """ from __future__ import print_function import sys import cPickle import numpy as np import argparse import glob import time import os import keras from keras import backend as K from keras.models import Sequential,Model, load_model from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape, Permute,Lambda, RepeatVector from keras.layers.convolutional import ZeroPadding2D, AveragePooling2D, Conv2D,MaxPooling2D, Convolution1D,MaxPooling1D from keras.layers.pooling import GlobalMaxPooling2D from keras.layers import Merge, Input, merge from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler from keras.layers import LSTM, SimpleRNN, GRU, TimeDistributed, Bidirectional from keras.layers.normalization import BatchNormalization import h5py from keras.layers.merge import Multiply from sklearn import preprocessing import random import config as cfg from prepare_data import create_folder, load_hdf5_data, do_scale from data_generator import RatioDataGenerator from evaluation import io_task4, evaluate from main_crnn_at import block, slice1, slice2, slice1_output_shape, slice2_output_shape, run_func, recognize, get_stat def outfunc(vects): x,y=vects y = K.clip(y, 1e-7, 1.) # clip to avoid numerical underflow y = K.sum(y, axis=1) x = K.sum(x, axis=1) return x / y def train(args): num_classes = cfg.num_classes # Load training & testing data (tr_x, tr_y, tr_na_list) = load_hdf5_data(args.tr_hdf5_path, verbose=1) # (tr_x, tr_y, tr_na_list) = load_hdf5(args.te_hdf5_path, verbose=1) (te_x, te_y, te_na_list) = load_hdf5_data(args.te_hdf5_path, verbose=1) print("") # Scale data tr_x = do_scale(tr_x, args.scaler_path, verbose=1) te_x = do_scale(te_x, args.scaler_path, verbose=1) # Build model (_, n_time, n_freq) = tr_x.shape input_logmel = Input(shape=(n_time, n_freq), name='in_layer') a1 = Reshape((n_time, n_freq, 1))(input_logmel) cnn1 = block(a1) cnn1 = block(cnn1) cnn1 = MaxPooling2D(pool_size=(1, 2))(cnn1) cnn1 = block(cnn1) cnn1 = block(cnn1) cnn1 = MaxPooling2D(pool_size=(1, 2))(cnn1) cnn1 = block(cnn1) cnn1 = block(cnn1) cnn1 = MaxPooling2D(pool_size=(1, 2))(cnn1) cnn1 = block(cnn1) cnn3 = block(cnn1) cnn3 = MaxPooling2D(pool_size=(1, 2))(cnn1) cnnout = Conv2D(256, (3, 3), padding="same", activation="relu", use_bias=True)(cnn3) cnnout = MaxPooling2D(pool_size=(1, 4))(cnnout) cnnout = Reshape((240, 256))(cnnout) # Time step is downsampled to 30. rnnout = Bidirectional(GRU(128, activation='linear', return_sequences=True))(cnnout) rnnout_gate = Bidirectional(GRU(128, activation='sigmoid', return_sequences=True))(cnnout) out = Multiply()([rnnout, rnnout_gate]) out = TimeDistributed(Dense(num_classes, activation='sigmoid'), name='localization_layer')(out) det =TimeDistributed(Dense(num_classes, activation='softmax'))(out) out=Multiply()([out,det]) out=Lambda(outfunc, output_shape=(num_classes,))([out, det]) model = Model(input_logmel, out) model.summary() # Compile model model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # Save model callback filepath = os.path.join(args.out_model_dir, "gatedAct_rationBal44_lr0.001_normalization_at_cnnRNN_64newMel_240fr.{epoch:02d}-{val_acc:.4f}.hdf5") create_folder(os.path.dirname(filepath)) save_model = ModelCheckpoint(filepath=filepath, monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1) # Data generator gen = RatioDataGenerator(batch_size=44, type='train') # Train model.fit_generator(generator=gen.generate([tr_x], [tr_y]), steps_per_epoch=100, # 100 iters is called an 'epoch' epochs=2000000010, # Maximum 'epoch' to train verbose=1, callbacks=[save_model], validation_data=(te_x, te_y)) if __name__ == '__main__': parser = argparse.ArgumentParser(description="") subparsers = parser.add_subparsers(dest='mode') parser_train = subparsers.add_parser('train') parser_train.add_argument('--tr_hdf5_path', type=str) parser_train.add_argument('--te_hdf5_path', type=str) parser_train.add_argument('--scaler_path', type=str) parser_train.add_argument('--out_model_dir', type=str) parser_recognize = subparsers.add_parser('recognize') parser_recognize.add_argument('--te_hdf5_path', type=str) parser_recognize.add_argument('--scaler_path', type=str) parser_recognize.add_argument('--model_dir', type=str) parser_recognize.add_argument('--out_dir', type=str) parser_get_stat = subparsers.add_parser('get_stat') parser_get_stat.add_argument('--pred_dir', type=str) parser_get_stat.add_argument('--stat_dir', type=str) parser_get_stat.add_argument('--submission_dir', type=str) args = parser.parse_args() if args.mode == 'train': train(args) elif args.mode == 'recognize': recognize(args, at_bool=True, sed_bool=True) elif args.mode == 'get_stat': get_stat(args, at_bool=True, sed_bool=True) else: raise Exception("Incorrect argument!") ================================================ FILE: predict_audio_tagging_icassp2018_cnnrnn_baseline.py ================================================ '''Train a cldnn on the task4 of DCASE2017 dataset. GPU run command with Theano backend (with TensorFlow, the GPU is automatically used): THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatx=float32 python train*.py Author: Yong XU Creat date: 03/04/2017 ''' #from __future__ import print_function import keras from keras import backend as K from keras.models import load_model import sys import cPickle as pickle import numpy as np from keras.models import Sequential,Model from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape,Permute,Lambda, RepeatVector from keras.layers.convolutional import ZeroPadding2D, AveragePooling2D, Conv2D,MaxPooling2D, Convolution1D,MaxPooling1D from keras.layers import Merge, Input, merge from keras.callbacks import ModelCheckpoint from keras.layers import LSTM, SimpleRNN, GRU, TimeDistributed, Bidirectional import h5py import os import shutil from sklearn import preprocessing import gzip import glob # resize data for fit into CNN. size: (batch_num*color_maps*height*weight) def reshapeX1( X ): N = len(X) return X.reshape( (1, N, feadim, 1, 1) ) # resize data for fit into CNN. size: (batch_num*color_maps*height*weight) def reshapeX2( X ): N = len(X) return X.reshape( (N, t_delay, feadim) ) def reshapeX3( X ): N = len(X) return X.reshape( (1, N, feadim) ) def reshapeX6( X ): N = len(X) return X.reshape( (1, N, feadim, 1) ) def reshapeX4( X ): N = len(X) return X.reshape( (N*t_delay, feadim) ) def reshapeX5( X , sample_num): N = len(X) return X.reshape( (sample_num, t_delay, feadim, 1) ) #parameters: num_classes=17 feadim=64 t_delay=240 # the len of Utterance is 300 ## train sets: #with h5py.File("/vol/vssp/msos/Audioset/task4_dcase2017_features/packed_features/logmel/training_pack.h5", 'r') as hf: # tr_x = np.array(hf.get('x')) # tr_y = np.array(hf.get('y')) #print tr_x.shape #print tr_y.shape #test sets: with h5py.File("/vol/vssp/msos/Audioset/task4_dcase2017_features/packed_features/logmel/testing_pack.h5", 'r') as hf: va_x = np.array(hf.get('x')) va_y = np.array(hf.get('y')) va_id = np.array(hf.get('na_list')) print va_id.shape print va_x.shape print va_y.shape ###########normalization training and test set #tr_x2=reshapeX4(tr_x) va_x2=reshapeX4(va_x) #scaler = preprocessing.StandardScaler().fit(tr_x2) #print scaler.mean_, scaler.scale_ #with open('tr_norm.pickle', 'wb') as handle: # pickle.dump(scaler, handle, protocol=pickle.HIGHEST_PROTOCOL) with open('tr_norm.pickle', 'rb') as handle: scaler = pickle.load(handle) va_x2 = scaler.transform( va_x2 ) va_x=reshapeX5( va_x2 , len(va_x)) #################################### f=gzip.open('/vol/vssp/msos/yx/t4_d2017/dcase2017_task4_evaluation_code_20170712/data/at_prob_mat_icassp2018_crnn_baseline.csv.gz','w') #f2=gzip.open('/vol/vssp/msos/yx/t4_d2017/dcase2017_task4_evaluation_code_20170712/data/sed_prob_mat_list.csv.gz','w') pred_fusion=[[0 for x in range(17)] for y in range(488)] print len(pred_fusion) model_num=0 #for epoch in range(20,110,10): for epoch in range(19,81,10): #19-61: f1=56,59 #19-71: f1=56,58.5 #19-81: f1=0.567,0.596 #19-91: f1=0.565,0.584 print "epoch:", epoch model_num=model_num+1 print "model num:", model_num path='/vol/vssp/msos/yx/t4_d2017/models_val/icassp_baseline_crnn_rationBal44_lr0.001_norm_64newMel_240fr.%d-0.*.hdf5'%epoch for model_f in glob.glob(path): print model_f md=load_model(model_f) #md.summary() # def recognize(): p_y_pred = md.predict( va_x ) pred_fusion=pred_fusion+p_y_pred #shutil.copy('at_prediction.csv', '/vol/vssp/msos/yx/t4_d2017/dcase2017_task4_evaluation_code_20170616/examples/.') #os.wait() #shutil.copy('prob_mat.csv', '/vol/vssp/msos/yx/t4_d2017/dcase2017_task4_evaluation_code_20170616/examples/.') #os.wait() #path="/vol/vssp/msos/yx/t4_d2017/dcase2017_task4_evaluation_code_20170616" #os.chdir(path) #cmd="python runme.py" #os.system(cmd) # do fusion p_y_pred= pred_fusion/model_num print p_y_pred.shape p_y_pred=list(p_y_pred) #print p_y_pred for i in range(len(va_x)): #print i p_y_pred_s = p_y_pred[i] f.write("%s.wav" % (va_id[i])) for e in p_y_pred_s: # f.write(" %s" % e) f.write(" %s" % e) f.write("\n") #sys.exit() # copy the audio tagging results to form the sed results, simple mode #for j in range(240): # #print i # f2.write("%s.wav" % (va_id[i])) # for e in p_y_pred_s: # f2.write(" %s" % e) # f2.write("\n") f.close #f2.close # if __name__ == '__main__': # recognize() ================================================ FILE: predict_crnn_at.py ================================================ '''Train a cldnn on the task4 of DCASE2017 dataset. GPU run command with Theano backend (with TensorFlow, the GPU is automatically used): THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatx=float32 python train*.py Author: Yong XU Creat date: 03/04/2017 ''' #from __future__ import print_function import keras from keras import backend as K from keras.models import load_model import sys import cPickle as pickle import numpy as np from keras.models import Sequential,Model from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape,Permute,Lambda, RepeatVector from keras.layers.convolutional import ZeroPadding2D, AveragePooling2D, Conv2D,MaxPooling2D, Convolution1D,MaxPooling1D from keras.layers import Merge, Input, merge from keras.callbacks import ModelCheckpoint from keras.layers import LSTM, SimpleRNN, GRU, TimeDistributed, Bidirectional import h5py import os import shutil from sklearn import preprocessing import gzip import glob # resize data for fit into CNN. size: (batch_num*color_maps*height*weight) def reshapeX1( X ): N = len(X) return X.reshape( (1, N, feadim, 1, 1) ) # resize data for fit into CNN. size: (batch_num*color_maps*height*weight) def reshapeX2( X ): N = len(X) return X.reshape( (N, t_delay, feadim) ) def reshapeX3( X ): N = len(X) return X.reshape( (1, N, feadim) ) def reshapeX6( X ): N = len(X) return X.reshape( (1, N, feadim, 1) ) def reshapeX4( X ): N = len(X) return X.reshape( (N*t_delay, feadim) ) def reshapeX5( X , sample_num): N = len(X) return X.reshape( (sample_num, t_delay, feadim, 1) ) #parameters: num_classes=17 feadim=64 t_delay=240 # the len of Utterance is 300 ## train sets: #with h5py.File("/vol/vssp/msos/Audioset/task4_dcase2017_features/packed_features/logmel/training_pack.h5", 'r') as hf: # tr_x = np.array(hf.get('x')) # tr_y = np.array(hf.get('y')) #print tr_x.shape #print tr_y.shape #test sets: with h5py.File("/vol/vssp/msos/Audioset/task4_dcase2017_features/packed_features/logmel/testing_pack.h5", 'r') as hf: va_x = np.array(hf.get('x')) va_y = np.array(hf.get('y')) va_id = np.array(hf.get('na_list')) print va_id.shape print va_x.shape print va_y.shape ###########normalization training and test set #tr_x2=reshapeX4(tr_x) va_x2=reshapeX4(va_x) #scaler = preprocessing.StandardScaler().fit(tr_x2) #print scaler.mean_, scaler.scale_ #with open('tr_norm.pickle', 'wb') as handle: # pickle.dump(scaler, handle, protocol=pickle.HIGHEST_PROTOCOL) with open('tr_norm.pickle', 'rb') as handle: scaler = pickle.load(handle) va_x2 = scaler.transform( va_x2 ) va_x=reshapeX5( va_x2 , len(va_x)) #################################### f=gzip.open('/vol/vssp/msos/yx/t4_d2017/dcase2017_task4_evaluation_code_20170712/data/at_prob_mat_icassp2018_crnn_baseline.csv.gz','w') #f2=gzip.open('/vol/vssp/msos/yx/t4_d2017/dcase2017_task4_evaluation_code_20170712/data/sed_prob_mat_list.csv.gz','w') pred_fusion=[[0 for x in range(17)] for y in range(488)] print len(pred_fusion) model_num=0 #for epoch in range(20,110,10): for epoch in range(19,81,10): #19-61: f1=56,59 #19-71: f1=56,58.5 #19-81: f1=0.567,0.596 #19-91: f1=0.565,0.584 print "epoch:", epoch model_num=model_num+1 print "model num:", model_num path='/vol/vssp/msos/yx/t4_d2017/models_val/icassp_baseline_crnn_rationBal44_lr0.001_norm_64newMel_240fr.%d-0.*.hdf5'%epoch for model_f in glob.glob(path): print model_f md=load_model(model_f) #md.summary() # def recognize(): p_y_pred = md.predict( va_x ) pred_fusion=pred_fusion+p_y_pred #shutil.copy('at_prediction.csv', '/vol/vssp/msos/yx/t4_d2017/dcase2017_task4_evaluation_code_20170616/examples/.') #os.wait() #shutil.copy('prob_mat.csv', '/vol/vssp/msos/yx/t4_d2017/dcase2017_task4_evaluation_code_20170616/examples/.') #os.wait() #path="/vol/vssp/msos/yx/t4_d2017/dcase2017_task4_evaluation_code_20170616" #os.chdir(path) #cmd="python runme.py" #os.system(cmd) # do fusion p_y_pred= pred_fusion/model_num print p_y_pred.shape p_y_pred=list(p_y_pred) #print p_y_pred for i in range(len(va_x)): #print i p_y_pred_s = p_y_pred[i] f.write("%s.wav" % (va_id[i])) for e in p_y_pred_s: # f.write(" %s" % e) f.write(" %s" % e) f.write("\n") #sys.exit() # copy the audio tagging results to form the sed results, simple mode #for j in range(240): # #print i # f2.write("%s.wav" % (va_id[i])) # for e in p_y_pred_s: # f2.write(" %s" % e) # f2.write("\n") f.close #f2.close # if __name__ == '__main__': # recognize() ================================================ FILE: predict_crnn_sed.py ================================================ '''Train a cldnn on the task4 of DCASE2017 dataset. GPU run command with Theano backend (with TensorFlow, the GPU is automatically used): THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatx=float32 python train*.py Author: Yong XU Creat date: 03/04/2017 ''' #from __future__ import print_function import keras from keras import backend as K from keras.models import load_model import sys import cPickle as pickle import numpy as np from keras.models import Sequential,Model from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape,Permute,Lambda, RepeatVector from keras.layers.convolutional import ZeroPadding2D, AveragePooling2D, Conv2D,MaxPooling2D, Convolution1D,MaxPooling1D from keras.layers import Merge, Input, merge from keras.callbacks import ModelCheckpoint from keras.layers import LSTM, SimpleRNN, GRU, TimeDistributed, Bidirectional import h5py import os import shutil from sklearn import preprocessing import gzip import glob import matplotlib.pyplot as plt # resize data for fit into CNN. size: (batch_num*color_maps*height*weight) def reshapeX1( X ): N = len(X) return X.reshape( (1, N, feadim, 1, 1) ) # resize data for fit into CNN. size: (batch_num*color_maps*height*weight) def reshapeX2( X ): N = len(X) return X.reshape( (N, t_delay, feadim) ) def reshapeX3( X ): N = len(X) return X.reshape( (1, N, feadim) ) def reshapeX6( X ): N = len(X) return X.reshape( (1, N, feadim, 1) ) def reshapeX4( X ): N = len(X) return X.reshape( (N*t_delay, feadim) ) def reshapeX5( X , sample_num): N = len(X) return X.reshape( (sample_num, t_delay, feadim, 1) ) def reshapeX7( X ): N = len(X) return X.reshape( (batch_size, t_delay, feadim, 1) ) #parameters: num_classes=17 feadim=64 t_delay=240 # the len of Utterance is 300 batch_size=61 ## train sets: #with h5py.File("/vol/vssp/msos/Audioset/task4_dcase2017_features/packed_features/logmel/training_pack.h5", 'r') as hf: # tr_x = np.array(hf.get('x')) # tr_y = np.array(hf.get('y')) #print tr_x.shape #print tr_y.shape #test sets: with h5py.File("/vol/vssp/msos/Audioset/task4_dcase2017_features/packed_features/logmel/testing_pack.h5", 'r') as hf: va_x = np.array(hf.get('x')) va_y = np.array(hf.get('y')) va_id = np.array(hf.get('na_list')) print va_id.shape print va_x.shape print va_y.shape print va_y ###########normalization training and test set #tr_x2=reshapeX4(tr_x) va_x2=reshapeX4(va_x) #scaler = preprocessing.StandardScaler().fit(tr_x2) #print scaler.mean_, scaler.scale_ #with open('tr_norm.pickle', 'wb') as handle: # pickle.dump(scaler, handle, protocol=pickle.HIGHEST_PROTOCOL) with open('tr_norm.pickle', 'rb') as handle: scaler = pickle.load(handle) va_x2 = scaler.transform( va_x2 ) va_x=reshapeX5( va_x2 , len(va_x)) #################################### #f=gzip.open('/vol/vssp/msos/yx/t4_d2017/dcase2017_task4_evaluation_code_20170712/data/at_prob_mat.csv.gz','w') f2=gzip.open('/vol/vssp/msos/yx/t4_d2017/dcase2017_task4_evaluation_code_20170712/data/sed_prob_mat_list.csv.gz','w') pred_fusion=[[0 for x in range(17)] for y in range(488)] print len(pred_fusion) model_num=0 #for epoch in range(20,110,10): total_out=[[[0 for x in range(17)] for y in range(t_delay)] for z in range(len(va_x))] #[488,240,17] for epoch in range(19,81,10): #19-61: f1=56,59 #19-71: f1=56,58.5 #19-81: f1=0.567,0.596 #19-91: f1=0.565,0.584 print "epoch:", epoch model_num=model_num+1 print "model num:", model_num #path='/vol/vssp/msos/yx/t4_d2017/models_val/gatedAct_rationBal44_lr0.001_normalization_sigLOC240_cnnRNN_64newMel_240fr.%d-0.*.hdf5'%epoch path='/vol/vssp/msos/yx/t4_d2017/models_val/gatedAct_rationBal44_lr0.001_normalization_LOC240_cnnRNN_64newMel_240fr.%d-0.*.hdf5'%epoch # should be softmax ? #path='/vol/vssp/msos/yx/t4_d2017/models_val/gatedAct_rationBal44_lr0.001_normalization_LOC_cnnRNN_mfcc40dd_240fr.%d-0.*.hdf5'%epoch for model_f in glob.glob(path): print model_f md=load_model(model_f) #md.summary() # def recognize(): #p_y_pred = md.predict( va_x ) #pred_fusion=pred_fusion+p_y_pred batch_out=[]#for append, should be [batch_size, 240, 17] for utt in range(0,len(va_x),batch_size): print utt if 1: ### for localization get_3rd_layer_output = K.function([md.layers[0].input, K.learning_phase()], [md.layers[60].output]) ###60 should be the sigmoid layer, with quite large value 1.0, but with several 1.0 values ####61 should be the softmax layer ###62 seems to be much worse. layer_output = get_3rd_layer_output([reshapeX7(va_x[utt:utt+batch_size]), 0])[0] print layer_output print layer_output.shape #layer_output1=layer_output[:,:] #for utt in range(len(va_x)): print va_id[utt] #for fr in range(240): #f2.write("%s.wav" % (va_id[utt]))###output id, wav name batch_out.extend(layer_output) #### append will be additional index in the first position;;; extend will concatenate in the first index without additional index #for jz in range(batch_size): # for fr in range(240): #f2.write("%s.wav" % (va_id[utt])) # for e in layer_output[jz,fr,:]: # f2.write(" %s" % e) #f2.write("\n") # utt=utt+1 #print utt #f2.write("\n") #f2.close batch_out=np.array(batch_out) print "batch_out.shape:", batch_out.shape total_out=total_out+batch_out total_out=total_out/model_num ####do fusion print: for utt in range(488): for fr in range(240): f2.write("%s.wav" % (va_id[utt])) for tg in total_out[utt,fr,:]: f2.write(" %s" % tg) f2.write("\n") f2.close ================================================ FILE: prepare_data.py ================================================ from __future__ import print_function import numpy as np import sys import soundfile import os import librosa from scipy import signal import pickle import cPickle import scipy import time import csv import gzip import h5py import matplotlib.ticker as ticker import matplotlib.pyplot as plt from sklearn import preprocessing from sklearn import metrics import argparse import config as cfg # Read wav def read_audio(path, target_fs=None): (audio, fs) = soundfile.read(path) if audio.ndim > 1: audio = np.mean(audio, axis=1) if target_fs is not None and fs != target_fs: audio = librosa.resample(audio, orig_sr=fs, target_sr=target_fs) fs = target_fs return audio, fs # Write wav def write_audio(path, audio, sample_rate): soundfile.write(file=path, data=audio, samplerate=sample_rate) # Create an empty folder def create_folder(fd): if not os.path.exists(fd): os.makedirs(fd) ### Feature extraction. def extract_features(wav_dir, out_dir, recompute): """Extract log mel spectrogram features. Args: wav_dir: string, directory of wavs. out_dir: string, directory to write out features. recompute: bool, if True recompute all features, if False skip existed extracted features. Returns: None """ fs = cfg.sample_rate n_window = cfg.n_window n_overlap = cfg.n_overlap create_folder(out_dir) names = [na for na in os.listdir(wav_dir) if na.endswith(".wav")] names = sorted(names) print("Total file number: %d" % len(names)) # Mel filter bank melW = librosa.filters.mel(sr=fs, n_fft=n_window, n_mels=64, fmin=0., fmax=8000.) cnt = 0 t1 = time.time() for na in names: wav_path = wav_dir + '/' + na out_path = out_dir + '/' + os.path.splitext(na)[0] + '.p' # Skip features already computed if recompute or (not os.path.isfile(out_path)): print(cnt, out_path) (audio, _) = read_audio(wav_path, fs) # Skip corrupted wavs if audio.shape[0] == 0: print("File %s is corrupted!" % wav_path) else: # Compute spectrogram ham_win = np.hamming(n_window) [f, t, x] = signal.spectral.spectrogram( x=audio, window=ham_win, nperseg=n_window, noverlap=n_overlap, detrend=False, return_onesided=True, mode='magnitude') x = x.T x = np.dot(x, melW.T) x = np.log(x + 1e-8) x = x.astype(np.float32) # Dump to pickle cPickle.dump(x, open(out_path, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) cnt += 1 print("Extracting feature time: %s" % (time.time() - t1,)) ### Pack features of hdf5 file def pack_features_to_hdf5(fe_dir, csv_path, out_path): """Pack extracted features to a single hdf5 file. This hdf5 file can speed up loading the features. This hdf5 file has structure: na_list: list of names x: bool array, (n_clips) y: float32 array, (n_clips, n_time, n_freq) Args: fe_dir: string, directory of features. csv_path: string | "", path of csv file. E.g. "testing_set.csv". If the string is empty, then pack features with all labels False. out_path: string, path to write out the created hdf5 file. Returns: None """ max_len = cfg.max_len create_folder(os.path.dirname(out_path)) t1 = time.time() x_all, y_all, na_all = [], [], [] if csv_path != "": # Pack from csv file (training & testing from dev. data) with open(csv_path, 'rb') as f: reader = csv.reader(f) lis = list(reader) cnt = 0 for li in lis: [na, bgn, fin, lbs, ids] = li if cnt % 100 == 0: print(cnt) na = os.path.splitext(na)[0] bare_na = 'Y' + na + '_' + bgn + '_' + fin # Correspond to the wav name. fe_na = bare_na + ".p" fe_path = os.path.join(fe_dir, fe_na) if not os.path.isfile(fe_path): print("File %s is in the csv file but the feature is not extracted!" % fe_path) else: na_all.append(bare_na[1:] + ".wav") # Remove 'Y' in the begining. x = cPickle.load(open(fe_path, 'rb')) x = pad_trunc_seq(x, max_len) x_all.append(x) ids = ids.split(',') y = ids_to_multinomial(ids) y_all.append(y) cnt += 1 else: # Pack from features without ground truth label (dev. data) names = os.listdir(fe_dir) names = sorted(names) for fe_na in names: bare_na = os.path.splitext(fe_na)[0] fe_path = os.path.join(fe_dir, fe_na) na_all.append(bare_na + ".wav") x = cPickle.load(open(fe_path, 'rb')) x = pad_trunc_seq(x, max_len) x_all.append(x) y_all.append(None) x_all = np.array(x_all, dtype=np.float32) y_all = np.array(y_all, dtype=np.bool) print("len(na_all): %d", len(na_all)) print("x_all.shape: %s, %s" % (x_all.shape, x_all.dtype)) print("y_all.shape: %s, %s" % (y_all.shape, y_all.dtype)) with h5py.File(out_path, 'w') as hf: hf.create_dataset('na_list', data=na_all) hf.create_dataset('x', data=x_all) hf.create_dataset('y', data=y_all) print("Save hdf5 to %s" % out_path) print("Pack features time: %s" % (time.time() - t1,)) def ids_to_multinomial(ids): """Ids of wav to multinomial representation. Args: ids: list of id, e.g. ['/m/0284vy3', '/m/02mfyn'] Returns: 1d array, multimonial representation, e.g. [1,0,1,0,0,...] """ y = np.zeros(len(cfg.lbs)) for id in ids: index = cfg.id_to_idx[id] y[index] = 1 return y def pad_trunc_seq(x, max_len): """Pad or truncate a sequence data to a fixed length. Args: x: ndarray, input sequence data. max_len: integer, length of sequence to be padded or truncated. Returns: ndarray, Padded or truncated input sequence data. """ L = len(x) shape = x.shape if L < max_len: pad_shape = (max_len - L,) + shape[1:] pad = np.zeros(pad_shape) x_new = np.concatenate((x, pad), axis=0) else: x_new = x[0:max_len] return x_new ### Load data & scale data def load_hdf5_data(hdf5_path, verbose=1): """Load hdf5 data. Args: hdf5_path: string, path of hdf5 file. verbose: integar, print flag. Returns: x: ndarray (np.float32), shape: (n_clips, n_time, n_freq) y: ndarray (np.bool), shape: (n_clips, n_classes) na_list: list, containing wav names. """ t1 = time.time() with h5py.File(hdf5_path, 'r') as hf: x = np.array(hf.get('x')) y = np.array(hf.get('y')) na_list = list(hf.get('na_list')) if verbose == 1: print("--- %s ---" % hdf5_path) print("x.shape: %s %s" % (x.shape, x.dtype)) print("y.shape: %s %s" % (y.shape, y.dtype)) print("len(na_list): %d" % len(na_list)) print("Loading time: %s" % (time.time() - t1,)) return x, y, na_list def calculate_scaler(hdf5_path, out_path): """Calculate scaler of input data on each frequency bin. Args: hdf5_path: string, path of packed hdf5 features file. out_path: string, path to write out the calculated scaler. Returns: None. """ t1 = time.time() (x, y, na_list) = load_hdf5_data(hdf5_path, verbose=1) (n_clips, n_time, n_freq) = x.shape x2d = x.reshape((n_clips * n_time, n_freq)) scaler = preprocessing.StandardScaler().fit(x2d) print("Mean: %s" % (scaler.mean_,)) print("Std: %s" % (scaler.scale_,)) print("Calculating scaler time: %s" % (time.time() - t1,)) pickle.dump(scaler, open(out_path, 'wb')) def do_scale(x3d, scaler_path, verbose=1): """Do scale on the input sequence data. Args: x3d: ndarray, input sequence data, shape: (n_clips, n_time, n_freq) scaler_path: string, path of pre-calculated scaler. verbose: integar, print flag. Returns: Scaled input sequence data. """ t1 = time.time() scaler = pickle.load(open(scaler_path, 'rb')) (n_clips, n_time, n_freq) = x3d.shape x2d = x3d.reshape((n_clips * n_time, n_freq)) x2d_scaled = scaler.transform(x2d) x3d_scaled = x2d_scaled.reshape((n_clips, n_time, n_freq)) if verbose == 1: print("Scaling time: %s" % (time.time() - t1,)) return x3d_scaled ### Main function if __name__ == '__main__': parser = argparse.ArgumentParser(description="") subparsers = parser.add_subparsers(dest='mode') parser_ef = subparsers.add_parser('extract_features') parser_ef.add_argument('--wav_dir', type=str) parser_ef.add_argument('--out_dir', type=str) parser_ef.add_argument('--recompute', type=bool) parser_pf = subparsers.add_parser('pack_features') parser_pf.add_argument('--fe_dir', type=str) parser_pf.add_argument('--csv_path', type=str) parser_pf.add_argument('--out_path', type=str) parser_cs = subparsers.add_parser('calculate_scaler') parser_cs.add_argument('--hdf5_path', type=str) parser_cs.add_argument('--out_path', type=str) args = parser.parse_args() if args.mode == 'extract_features': extract_features(wav_dir=args.wav_dir, out_dir=args.out_dir, recompute=args.recompute) elif args.mode == 'pack_features': pack_features_to_hdf5(fe_dir=args.fe_dir, csv_path=args.csv_path, out_path=args.out_path) elif args.mode == 'calculate_scaler': calculate_scaler(hdf5_path=args.hdf5_path, out_path=args.out_path) else: raise Exception("Incorrect argument!") ================================================ FILE: runme.sh ================================================ #!/bin/bash TEST_WAV_DIR="/vol/vssp/AP_datasets/audio/audioset/task4_dcase2017_audio/official_downloads/testing" TRAIN_WAV_DIR="/vol/vssp/AP_datasets/audio/audioset/task4_dcase2017_audio/official_downloads/training" EVALUATION_WAV_DIR="/vol/vssp/datasets/audio/audioset/task4_dcase2017_audio/official_downloads/evaluation" WORKSPACE="/vol/vssp/msos/qk/workspaces/ICASSP2018_dcase" # Extract features python prepare_data.py extract_features --wav_dir=$TEST_WAV_DIR --out_dir=$WORKSPACE"/features/logmel/testing" --recompute=True python prepare_data.py extract_features --wav_dir=$TRAIN_WAV_DIR --out_dir=$WORKSPACE"/features/logmel/training" --recompute=True python prepare_data.py extract_features --wav_dir=$EVALUATION_WAV_DIR --out_dir=$WORKSPACE"/features/logmel/evaluation" --recompute=True # Pack features python prepare_data.py pack_features --fe_dir=$WORKSPACE"/features/logmel/testing" --csv_path="meta_data/testing_set.csv" --out_path=$WORKSPACE"/packed_features/logmel/testing.h5" python prepare_data.py pack_features --fe_dir=$WORKSPACE"/features/logmel/training" --csv_path="meta_data/training_set.csv" --out_path=$WORKSPACE"/packed_features/logmel/training.h5" python prepare_data.py pack_features --fe_dir=$WORKSPACE"/features/logmel/evaluation" --csv_path="" --out_path=$WORKSPACE"/packed_features/logmel/evaluation.h5" # Calculate scaler python prepare_data.py calculate_scaler --hdf5_path=$WORKSPACE"/packed_features/logmel/training.h5" --out_path=$WORKSPACE"/scalers/logmel/training.scaler" # Train AT THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python main_crnn_at.py train --tr_hdf5_path=$WORKSPACE"/packed_features/logmel/training.h5" --te_hdf5_path=$WORKSPACE"/packed_features/logmel/testing.h5" --scaler_path=$WORKSPACE"/scalers/logmel/training.scaler" --out_model_dir=$WORKSPACE"/models/crnn_at" # Recognize AT THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python main_crnn_at.py recognize --te_hdf5_path=$WORKSPACE"/packed_features/logmel/testing.h5" --scaler_path=$WORKSPACE"/scalers/logmel/training.scaler" --model_dir=$WORKSPACE"/models/crnn_at" --out_dir=$WORKSPACE"/preds/crnn_at" # Get stat of AT THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python main_crnn_at.py get_stat --pred_dir=$WORKSPACE"/preds/crnn_at" --stat_dir=$WORKSPACE"/stats/crnn_at" --submission_dir=$WORKSPACE"/submissions/crnn_at" # Train SED THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python main_crnn_sed.py train --tr_hdf5_path=$WORKSPACE"/packed_features/logmel/training.h5" --te_hdf5_path=$WORKSPACE"/packed_features/logmel/testing.h5" --scaler_path=$WORKSPACE"/scalers/logmel/training.scaler" --out_model_dir=$WORKSPACE"/models/crnn_sed" # Recognize SED THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python main_crnn_at.py recognize --te_hdf5_path=$WORKSPACE"/packed_features/logmel/testing.h5" --scaler_path=$WORKSPACE"/scalers/logmel/training.scaler" --model_dir=$WORKSPACE"/models/crnn_sed" --out_dir=$WORKSPACE"/preds/crnn_sed" # Get stat of SED THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python main_crnn_at.py get_stat --pred_dir=$WORKSPACE"/preds/crnn_sed" --stat_dir=$WORKSPACE"/stats/crnn_sed" --submission_dir=$WORKSPACE"/submissions/crnn_sed"